BryanW committed on Mar 23

Commit

f5b5a3b

verified ·

1 Parent(s): 7f9dddc

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/cudagraphs.py +299 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/distributed.py +621 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/onnxrt.py +39 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/registry.py +179 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/torchxla.py +55 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/config.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/converter.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/error.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/pass_base.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/tools.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/verifier.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/wrappers.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/__init__.py +5 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/case.py +175 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/gen_example.py +21 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/logging.py +47 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/node_metadata.py +32 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/proxy_value.py +45 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__init__.py +1 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/constant_folding.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/insert_custom_op_guards.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_autocast_with_hop_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_with_hop_pass_util.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/_node_metadata_hook.py +111 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py +254 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/collect_tracepoints_pass.py +146 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/constant_folding.py +304 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py +99 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/insert_custom_op_guards.py +80 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/lift_constants_pass.py +417 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/remove_runtime_assertions.py +36 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_autocast_with_hop_pass.py +189 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py +676 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py +121 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py +65 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_with_hop_pass_util.py +190 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/dynamic_shapes.py +324 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/export_schema.thrift +377 -0

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/cudagraphs.py ADDED Viewed

	@@ -0,0 +1,299 @@

+"""
+This module implements CUDA graphs support for TorchDynamo backends.
+CUDA graphs allow for capturing and replaying GPU operations, which can significantly
+reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:
+- CUDA graph creation and management for both forward and backward passes
+- Input mutation detection and handling
+- Device compatibility checking
+- Stack trace management for debugging
+- Integration with TorchInductor's cudagraph trees
+The backend supports two main modes:
+1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
+2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking
+Key components:
+- CudagraphsBackend: Main backend class for CUDA graph integration
+- Mutation detection utilities to ensure graph safety
+- Device mapping and compatibility checks
+- Stack trace collection for debugging
+"""
+import functools
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from typing import Any, Optional
+import torch
+import torch.fx
+from torch._dynamo import config
+from torch._dynamo.backends.common import aot_autograd
+from torch._dynamo.backends.debugging import boxed_nop
+from torch._inductor.cudagraph_utils import (
+    BoxedDeviceIndex,
+    check_multiple_devices_or_any_cpu_nodes,
+    format_default_skip_message,
+    get_mutation_stack_trace,
+    get_placeholder_info,
+    log_cudagraph_skip_and_bump_counter,
+)
+from torch._inductor.utils import (
+    BoxedBool,
+    count_tangents,
+    get_first_incompatible_cudagraph_node,
+    num_fw_fixed_arguments,
+    output_node,
+)
+from torch.multiprocessing.reductions import StorageWeakRef
+from .registry import register_backend
+def find_input_mutations(g: torch.fx.Graph) -> set[int]:
+    def meta_fk(meta: dict[str, Any]) -> Any:
+        return meta["val"] if "val" in meta else meta["fake_result"]
+    inputs = defaultdict(set)
+    input_idx = 0
+    mutated_inputs = set()
+    for n in g.nodes:
+        if n.op == "placeholder":
+            if isinstance(meta_fk(n.meta), torch.Tensor):
+                inputs[StorageWeakRef(meta_fk(n.meta)._typed_storage())].add(input_idx)
+            input_idx += 1
+        elif n.op == "call_function":
+            if not hasattr(n.target, "_schema"):
+                continue
+            schema = n.target._schema
+            for i, arg in enumerate(schema.arguments):
+                if i < len(n.args):
+                    argument = n.args[i]
+                else:
+                    if arg.name not in n.kwargs:
+                        continue
+                    argument = n.kwargs[arg.name]
+                mut_arg = False
+                if arg.alias_info:
+                    if arg.alias_info.is_write:
+                        mut_arg = True
+                if mut_arg:
+                    # TODO: not correct for args that contain tensors in a struct
+                    # like list
+                    mutated_inputs |= inputs[
+                        StorageWeakRef(meta_fk(argument.meta)._typed_storage())
+                    ]
+        # TODO: error on unrecognized nodes
+    return mutated_inputs
+def get_device_node_mapping(
+    gm: torch.fx.GraphModule,
+) -> dict[torch.device, torch.fx.Node]:
+    device_node_mapping: dict[torch.device, torch.fx.Node] = {}
+    for n in gm.graph.nodes:
+        t = n.meta.get("val", None)
+        if isinstance(t, torch.Tensor) and t.device not in device_node_mapping:
+            device_node_mapping[t.device] = n
+    return device_node_mapping
+def check_for_mutation_ignore_cuda_graph_managed_tensor(
+    aot_model: torch.fx.GraphModule, num_fixed: int
+) -> Optional[str]:
+    mutation_indices = find_input_mutations(aot_model.graph) - set(range(num_fixed))
+    if not mutation_indices:
+        return None
+    placeholders = get_placeholder_info(aot_model.graph)
+    return get_mutation_stack_trace(placeholders, mutation_indices)
+def check_for_skip(aot_model: torch.fx.GraphModule, num_fixed: int) -> Optional[str]:
+    if not config.cudagraph_backend_support_input_mutation:
+        if mut_skip := check_for_mutation_ignore_cuda_graph_managed_tensor(
+            aot_model, num_fixed
+        ):
+            return mut_skip
+    if skip := check_multiple_devices_or_any_cpu_nodes(
+        get_device_node_mapping(aot_model)
+    ):
+        return skip
+    if node := get_first_incompatible_cudagraph_node(aot_model):
+        return format_default_skip_message(f"incompatible op ({node.name})")
+    return None
+def get_device_index(gm: torch.fx.GraphModule) -> int:
+    device = next(iter(get_device_node_mapping(gm)))
+    assert device.type == "cuda"
+    return device.index
+def get_stack_traces(gm: torch.fx.GraphModule) -> list[Optional[str]]:
+    output = output_node(gm)
+    assert len(output.args) == 1
+    args = output.args[0]
+    if not hasattr(args, "__iter__"):
+        return []
+    return [
+        (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None)
+        for arg in args  # type: ignore[union-attr]
+    ]
+def cudagraphs(dynamo_model: torch.fx.GraphModule, dynamo_inputs: Sequence[Any]) -> Any:
+    """Compile ``dynamo_model`` via ``aot_autograd`` using CUDA-graph wrapping compilers.
+    The forward, backward and inference compilers defined below share two boxed
+    values: ``do_cudagraphs`` (cleared when the forward graph is skipped) and
+    ``boxed_device_index`` (set by the forward compiler, read by the backward one).
+    Returns the result of calling the ``aot_autograd`` wrapper on the inputs.
+    """
+    from torch._inductor.cudagraph_trees import cudagraphify_impl
+    # State shared between the forward and backward compilers.
+    do_cudagraphs = BoxedBool(True)
+    boxed_device_index = BoxedDeviceIndex(None)
+    def forward_cudagraphs(
+        aot_model: torch.fx.GraphModule,
+        aot_inputs: list[Any],
+        is_inference: bool = False,
+    ) -> Any:
+        """Forward/inference compiler: cudagraphify ``aot_model`` unless a skip reason is found.
+        On skip, disables ``do_cudagraphs``, logs the reason and returns the
+        ``boxed_nop`` wrapper unchanged.
+        """
+        interp = boxed_nop(aot_model, aot_inputs)
+        fixed = num_fw_fixed_arguments(len(dynamo_inputs), len(aot_inputs))
+        if skip_msg := check_for_skip(aot_model, fixed):
+            BoxedBool.disable(do_cudagraphs)
+            log_cudagraph_skip_and_bump_counter(
+                f"skipping cudagraphs due to {skip_msg}"
+            )
+            return interp
+        # Remember the device so the backward compiler can find the same manager.
+        boxed_device_index.set(get_device_index(aot_model))
+        out = cudagraphify_impl(
+            interp,
+            aot_inputs,
+            range(fixed),
+            device_index=boxed_device_index.value,
+            is_backward=False,
+            # NOTE(review): the ``is_inference`` parameter of this function is
+            # not forwarded; False is always passed here.
+            is_inference=False,  # Q: should forward is_inference here?
+            stack_traces=get_stack_traces(aot_model),
+            placeholders=get_placeholder_info(aot_model.graph),
+            mutated_input_idxs=find_input_mutations(aot_model.graph),
+        )
+        out._boxed_call = True  # type: ignore[attr-defined]
+        return out
+    def backward_cudagraphs(
+        aot_model: torch.fx.GraphModule, aot_inputs: list[Any]
+    ) -> Any:
+        """Backward compiler: cudagraphify ``aot_model`` unless disabled or skipped.
+        Returns ``aot_model`` itself when ``do_cudagraphs`` is falsy. On a skip
+        reason, returns a boxed callable that marks the cudagraph-tree manager
+        as running backward before invoking ``aot_model``.
+        """
+        interp = boxed_nop(aot_model, aot_inputs)
+        if not do_cudagraphs:
+            return aot_model
+        fixed = count_tangents(aot_model)
+        if skip_msg := check_for_skip(aot_model, fixed):
+            log_cudagraph_skip_and_bump_counter(
+                f"skipping cudagraphs due to {skip_msg}"
+            )
+            # See [Backward Generation Handling]
+            device_idx = boxed_device_index.value
+            if device_idx is None:
+                device_idx = 0  # Default to device 0 if not set
+            # The manager is looked up, never created, and must already exist.
+            manager = torch._inductor.cudagraph_trees.get_manager(
+                device_idx, create_if_none_exists=False
+            )
+            assert manager is not None
+            def fn(inputs: list[Any]) -> Any:
+                # pyrefly: ignore [missing-attribute]
+                manager.set_to_running_backward()
+                return aot_model(inputs)
+            fn._boxed_call = True  # type: ignore[attr-defined]
+            return fn
+        out = cudagraphify_impl(
+            interp,
+            aot_inputs,
+            range(fixed),
+            device_index=get_device_index(aot_model),
+            is_backward=True,
+            is_inference=False,
+            stack_traces=get_stack_traces(aot_model),
+            placeholders=get_placeholder_info(aot_model.graph),
+            mutated_input_idxs=find_input_mutations(aot_model.graph),
+        )
+        out._boxed_call = True  # type: ignore[attr-defined]
+        return out
+    # The inference compiler is the forward compiler with is_inference=True bound.
+    aot_cudagraphs = aot_autograd(
+        fw_compiler=forward_cudagraphs,
+        bw_compiler=backward_cudagraphs,
+        inference_compiler=functools.partial(forward_cudagraphs, is_inference=True),
+        keep_inference_input_mutations=torch._dynamo.config.cudagraph_backend_keep_input_mutation,
+    )
+    return aot_cudagraphs(dynamo_model, dynamo_inputs)
+class CudagraphsBackend:
+    """Callable backend object registered under the name "cudagraphs"."""
+    compiler_name = "cudagraphs"
+    @staticmethod
+    def reset() -> None:
+        """Reset cudagraph-tree state via ``reset_cudagraph_trees``."""
+        from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+        reset_cudagraph_trees()
+    @staticmethod
+    def __call__(model: torch.fx.GraphModule, inputs: Sequence[Any]) -> Any:
+        """Compile ``model`` by delegating to ``cudagraphs``."""
+        # Declared static, so calling an instance passes only (model, inputs).
+        return cudagraphs(model, inputs)
+# aot_cudagraphs only applies CUDA graphs to the graph.  It is also helpful
+# for debugging and can serve as a perf baseline.
+# Registration runs when this module is imported and uses an instance, not the class.
+register_backend(name="cudagraphs", compiler_fn=CudagraphsBackend())
+def cudagraphs_inner(
+    model: Callable[..., Any],
+    inputs: Sequence[Any],
+    copy_outputs: bool = True,
+    copy_inputs: bool = True,
+) -> Callable[..., Sequence[Any]]:
+    """This isn't registered as a backend, but is used in some benchmarks
+    Runs ``model`` once on a side stream as warmup, captures a second call into a
+    ``torch.cuda.CUDAGraph`` and returns a ``run`` callable that replays it.
+    Args:
+        model: callable invoked as ``model(*inputs)``.
+        inputs: list or tuple of example inputs.
+        copy_outputs: when True, ``run`` returns clones of the captured outputs;
+            otherwise it returns the captured outputs themselves.
+        copy_inputs: when True, capture uses ``zeros_like`` buffers and ``run``
+            copies new inputs into them; otherwise capture uses ``inputs`` directly.
+    """
+    assert isinstance(inputs, (list, tuple))
+    if copy_inputs:
+        static_inputs = [torch.zeros_like(x) for x in inputs]
+    else:
+        static_inputs = list(inputs)
+    # warmup
+    torch.cuda.synchronize()
+    stream = torch.cuda.Stream()
+    stream.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(stream):
+        model(*inputs)
+    stream.synchronize()
+    torch.cuda.current_stream().wait_stream(stream)
+    torch.cuda.synchronize()
+    # record
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph, stream=stream):
+        static_outputs = model(*static_inputs)
+    # Normalize a single return value to a 1-tuple so run() always yields a sequence.
+    if not isinstance(static_outputs, (list, tuple)):
+        static_outputs = (static_outputs,)
+    def run(*new_inputs: Any) -> Sequence[Any]:
+        """Replay the captured graph, optionally refreshing inputs and cloning outputs."""
+        assert len(static_inputs) == len(new_inputs)
+        # With copy_inputs=False, new_inputs are only length-checked, never copied.
+        if copy_inputs:
+            for dst, src in zip(static_inputs, new_inputs):
+                dst.copy_(src)
+        graph.replay()
+        if copy_outputs:
+            return [x.clone() for x in static_outputs]
+        else:
+            return static_outputs
+    return run

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/distributed.py ADDED Viewed

	@@ -0,0 +1,621 @@

+"""
+This module implements distributed training optimizations for TorchDynamo backends.
+It provides functionality to optimize models wrapped in DistributedDataParallel (DDP)
+by intelligently splitting compiled graphs to align with DDP's gradient synchronization
+boundaries. Key features include:
+- Graph partitioning based on parameter bucket sizes
+- Optimization of allreduce operations for distributed training
+- Support for parameter ignoring and buffer handling
+- Submodule compilation and management
+- Debugging utilities for distributed training
+The main component is the DDPOptimizer class, which handles graph splitting and
+recompilation to enable efficient distributed training while maintaining the benefits
+of compilation.
+"""
+import logging
+import traceback
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Any, Optional, TYPE_CHECKING
+from unittest import mock
+import torch
+from torch import fx
+from torch._dynamo.backends.registry import CompiledFn, CompilerFn
+from torch._dynamo.output_graph import GraphCompileReason
+from torch._dynamo.utils import deepcopy_to_fake_tensor, detect_fake_mode
+from torch._logging import trace_structured
+from torch.fx.node import Node
+if TYPE_CHECKING:
+    from torch._functorch._aot_autograd.schemas import ViewAndMutationMeta
+# Regular log messages should go through 'log'.
+# ddp_graph_log is a separate artifact logger reserved for dumping graphs.
+# See docs/source/logging.rst for more info.
+log = logging.getLogger(__name__)
+ddp_graph_log = torch._logging.getArtifactLogger(__name__, "ddp_graphs")
+def args_str(args: Any) -> str:
+    # a debug helper
+    if torch.is_tensor(args):
+        return f"T[{args.shape}]"
+    elif isinstance(args, tuple):
+        return f"tuple({', '.join([args_str(x) for x in args])})"
+    elif isinstance(args, list):
+        return f"list({', '.join([args_str(x) for x in args])})"
+    else:
+        return str(args)
+@dataclass
+class Bucket:
+    """Accumulator for one group of graph nodes and the parameters assigned to it."""
+    # running size for the bucket; reported under the "Size (b)" column when logged
+    size: int = 0
+    # names of the parameters placed in this bucket
+    params: list[str] = field(default_factory=list)
+    # fx nodes that belong to this bucket
+    nodes: list[fx.Node] = field(default_factory=list)
+    # param_ids is just used for unit testing
+    param_ids: list[int] = field(default_factory=list)
+    # keep track of any buckets that were extended for logging purposes
+    opcount_increased_to_capture_external_output: int = 0
+    paramsize_before_opcount_increase: int = 0
+def bucket_has_external_output(bucket: Bucket) -> bool:
+    nodes_in_bucket = set()
+    # we want to iterate in reverse order, but clumsi-luckily the bucket.nodes list was already created backwards
+    # so we don't reverse it here
+    for node in bucket.nodes:
+        # assume node.op != output, since those are filtered in the original iteration
+        nodes_in_bucket.add(node)
+        for user in node.users:
+            if user not in nodes_in_bucket:
+                return True
+    return False
+def pretty_print_buckets(buckets: list[Bucket], bucket_bytes_cap: int) -> None:
+    """Log a summary of the bucket assignments.
+    Emits an info-level summary, a warning when any bucket was extended, and,
+    if ``tabulate`` is importable, debug/warning tables with per-bucket detail.
+    When no bucket has parameters, only a debug message is logged.
+    """
+    headers = ("Index", "Size (b)", "Param Names")
+    rows: list[tuple[Optional[int], Optional[int], str]] = []
+    extended_buckets = []
+    # Indices are assigned over the reversed bucket list.
+    for idx, bucket in enumerate(reversed(buckets)):
+        if len(bucket.params) > 0:
+            # First param carries the index/size; the remaining params get blank cells.
+            rows.append((idx, bucket.size, bucket.params[0]))
+            rows.extend((None, None, param) for param in bucket.params[1:])
+        if bucket.opcount_increased_to_capture_external_output > 0:
+            extended_buckets.append(
+                (
+                    idx,
+                    bucket.opcount_increased_to_capture_external_output,
+                    bucket.size - bucket.paramsize_before_opcount_increase,
+                )
+            )
+    if rows:
+        log.info(
+            "\nDDPOptimizer used bucket cap %s and created %d buckets. Enable debug logs for detailed bucket info.",
+            bucket_bytes_cap,
+            len(buckets),
+        )
+        if extended_buckets:
+            log.warning(
+                "Some buckets were extended beyond their requested parameter capacities"
+                " in order to ensure each subgraph has an output node, required for fx graph partitioning."
+                " This can be the case when a subgraph would have only contained nodes performing inplace mutation,"
+                " and returning no logical outputs. This should not be a problem, unless it results in too few graph"
+                " partitions for optimal DDP performance."
+            )
+        # tabulate is optional; without it only a debug hint is logged.
+        try:
+            from tabulate import tabulate
+            log.debug(
+                "\nDDPOptimizer produced the following bucket assignments:\n%s",
+                tabulate(rows, headers=headers, tablefmt="simple_grid"),
+            )
+            if extended_buckets:
+                log.warning(
+                    "DDPOptimizer extended these buckets to ensure per-subgraph output nodes:\n%s",
+                    tabulate(
+                        extended_buckets,
+                        headers=("Index", "Extra Ops", "Extra Param Size (b)"),
+                        tablefmt="simple_grid",
+                    ),
+                )
+        except ImportError:
+            log.debug(
+                "Please `pip install tabulate` in order to display ddp bucket sizes and diagnostic information."
+            )
+    else:
+        log.debug("DDPOptimizer captured no parameters and did not split this graph.")
+def has_higher_order_op(gm: fx.GraphModule) -> bool:
+    # Check if there is a higher order op in the graph
+    for node in gm.graph.nodes:
+        if node.op == "get_attr":
+            maybe_param = getattr(gm, node.target)
+            if isinstance(maybe_param, torch.fx.GraphModule):
+                return True
+    return False
+def propagate_metadata(orig_gm: fx.GraphModule, split_gm: fx.GraphModule) -> None:
+    for name, module in split_gm.named_modules():
+        if "." not in name and len(name):
+            # TODO: add split id to CompileId: https://github.com/pytorch/tlparse/pull/83/files#r1880649384
+            module.meta = orig_gm.meta
+            module._param_name_to_source = orig_gm._param_name_to_source
+def propagate_dynamo_source(orig_gm: fx.GraphModule, split_gm: fx.GraphModule) -> None:
+    name_to_dynamo_source = {}
+    for node in orig_gm.graph.find_nodes(op="placeholder"):
+        name_to_dynamo_source[node.name] = node._dynamo_source
+    for name, module in split_gm.named_modules():
+        if "." not in name and len(name):
+            for node in module.graph.find_nodes(op="placeholder"):
+                # non-placeholder in original_gm may become placeholder in submodules
+                node._dynamo_source = name_to_dynamo_source.get(node.name, None)
+class DDPOptimizerContext:
+    """Per-trace state: the current bucket index and one metadata entry per bucket."""
+    def __init__(self) -> None:
+        # -1 until the first bucket is recorded
+        self.curr_bucket: int = -1
+        # one metadata entry appended per recorded bucket
+        self.metadata_per_bucket: list[ViewAndMutationMeta] = []
+# compile each of the partitioned submodules using the user-provided compiler
+class SubmodCompiler(torch.fx.interpreter.Interpreter):
+    """Interpreter that compiles each ``call_module`` target as it is executed.
+    Each compiled submodule replaces the original on the interpreted module
+    under a ``compiled_``-prefixed name.
+    """
+    def __init__(
+        self,
+        module: fx.GraphModule,
+        compiler: CompilerFn,
+        fake_mode: torch._subclasses.fake_tensor.FakeTensorMode,
+    ) -> None:
+        super().__init__(module)
+        self.compiler = compiler
+        self.fake_mode = fake_mode
+        # See Note [DDPOptimizer and fw_metadata]
+        # A fresh context is installed only when a tracing context is active.
+        ctx = torch._guards.TracingContext.try_get()
+        if ctx is not None:
+            ctx.ddp_optimizer_ctx = DDPOptimizerContext()
+    def compile_submod(
+        self, input_mod: fx.GraphModule, args: list[torch.Tensor], kwargs: Any
+    ) -> Any:
+        """
+        Compile the submodule,
+        using a wrapper to make sure its output is always a tuple,
+        which is required by AotAutograd based compilers
+        """
+        assert len(kwargs) == 0, "We assume only args for these modules"
+        class WrapperModule(torch.nn.Module):
+            def __init__(
+                self, submod: Callable[..., Any], unwrap_singleton_tuple: bool
+            ) -> None:
+                super().__init__()
+                self.submod = submod
+                self.unwrap_singleton_tuple = unwrap_singleton_tuple
+            def forward(self, *args: Any) -> Any:
+                x = self.submod(*args)
+                # TODO(whc)
+                # for some reason the isinstance check is necessary if I split one node per submod
+                # - even though I supposedly wrapped the output in a tuple in those cases, the real
+                # compiled module was still returning a tensor
+                if self.unwrap_singleton_tuple and isinstance(x, (tuple, list)):
+                    return x[0]
+                return x
+        unwrap_singleton_tuple = False
+        for sn in input_mod.graph.nodes:
+            if sn.op == "output":
+                if not isinstance(sn.args[0], tuple):
+                    # Wrap the existing output args so the recompiled module returns
+                    # a tuple; the wrapper above undoes this for callers.
+                    unwrap_singleton_tuple = True
+                    sn.args = (sn.args,)
+        input_mod.recompile()
+        input_mod.compile_subgraph_reason = GraphCompileReason(  # type: ignore[assignment]
+            "DDPOptimizer intentional graph-break (See Note [DDPOptimizer])."
+            " Set `torch._dynamo.config.optimize_ddp = False` to disable.",
+            [
+                # it's close to useless to get a real stacktrace here, and quite verbose.
+                traceback.FrameSummary(__file__, 0, "DDPOptimizer"),
+            ],
+        )
+        wrapper = WrapperModule(
+            self.compiler(input_mod, args),
+            unwrap_singleton_tuple,
+        )
+        return wrapper
+    # Note:
+    #
+    # The way distributed works today around fake tensors can be somewhat confusing.
+    # Some of these codepaths are shared in both runtime, and compile time. The presence
+    # of a fake_mode, read off of fake tensor inputs, dictates how we will operate.
+    #
+    # A few things to keep in mind:
+    #
+    # 1) We invoke `compile_submod` with a real module. The output of that gets stored
+    # on the graph via `self.module.add_submodule(n.target, compiled_submod_real)`.
+    #
+    # 2) When running a call_module targeted node, if we have a fake_mode, we fakify the
+    # module we got from self.fetch_attr(n.target). Regardless of fake_mode, we then execute it.
+    #
+    # 3) Fake tensors should always be around during compile time.
+    #
+    # 4) Fake tensors should never be around at runtime.
+    #
+    # 5) We end up with a compilation mode that takes a real submodule and fake tensors,
+    # to match what aot_autograd expects. See Note: [Fake Modules and AOTAutograd]
+    def run_node(self, n: Node) -> Any:
+        """Execute node ``n``; ``call_module`` nodes are compiled first.
+        Positional tensor args that are not already fake are converted to fake
+        tensors. For ``call_module`` the target is compiled, swapped in under a
+        ``compiled_`` name, and then executed under ``fake_mode`` to produce the
+        inputs for later nodes. Other ops dispatch to the matching interpreter method.
+        """
+        args, kwargs = self.fetch_args_kwargs_from_env(n)
+        new_args = []
+        assert self.fake_mode
+        for arg in args:
+            if isinstance(arg, torch.Tensor) and not isinstance(
+                arg, torch._subclasses.FakeTensor
+            ):
+                new_args.append(torch._dynamo.utils.to_fake_tensor(arg, self.fake_mode))
+            else:
+                new_args.append(arg)
+        log.debug("run_node %s, %s got args %s", n.op, n.target, args_str(args))
+        assert isinstance(args, tuple)
+        assert isinstance(kwargs, dict)
+        if n.op == "call_module":
+            real_mod = self.fetch_attr(str(n.target))
+            # NOTE(review): fake_mode was asserted truthy above, so the else
+            # branch below is not expected to run.
+            if self.fake_mode:
+                curr_submod = deepcopy_to_fake_tensor(real_mod, self.fake_mode)
+            else:
+                curr_submod = real_mod
+            ddp_graph_log.debug("\n---%s graph---\n%s", n.target, curr_submod.graph)
+            # When calling the compiler on the submod, inputs (new_args) are expected to
+            # be FakeTensors already since Dynamo would have made them FakeTensors in the
+            # non-DDP flow.  However, the parameters are _not_ expected to be FakeTensors,
+            # since this wrapping happens during compilation
+            # Note: Returning Fake Tensors on First AOT Autograd Call
+            #
+            # Inductor will optimize strides of outputs when it deems it profitable.
+            # For instance, converting to channels last. When we split the graph here
+            # into multiple inductor compilations, we need to make sure that the
+            # output strides of one compilation is appropriately passed to the subsequent
+            # compilations. However, the mapping from inductor output to dynamo output
+            # is non-trivial due to aot_autograd's deduping, de-aliasing, mutation, re-writing,
+            # subclass handling, etc. In order to replay all this logic we set a flag such that
+            # the first invocation of inductor in aot_autograd will return Fake Tensors with
+            # appropriate strides. Then, all of aot autograd's runtime logic is replayed.
+            # This gives us the appropriately strided outputs here which will reflect runtime strides.
+            class FakeifyFirstAOTInvocationGuard:
+                # Sets fakify_first_call on construction and clears it in __del__.
+                def __init__(self) -> None:
+                    self.tc = torch._guards.TracingContext.try_get()
+                    assert self.tc
+                    self.tc.fakify_first_call = True
+                def __del__(self) -> None:
+                    self.tc.fakify_first_call = False  # type: ignore[union-attr]
+            # For aot_eager and other backends, tracing context is not set
+            has_tracing_context = torch._guards.TracingContext.try_get() is not None
+            if has_tracing_context:
+                g = FakeifyFirstAOTInvocationGuard()  # noqa: F841
+            from torch._dynamo.utils import counters
+            # Snapshot the counter so a change after compiling reveals an aot_autograd call.
+            init = counters["aot_autograd"]["total"]
+            compiled_submod_real = self.compile_submod(real_mod, new_args, kwargs)
+            # TODO - better way of doing this?
+            # Only aot autograd handles fakifying first call
+            invoked_aot_autograd = init != counters["aot_autograd"]["total"]
+            # We update the original (outer) graph with a call into the compiled module
+            # instead of the uncompiled one.
+            self.module.delete_submodule(n.target)  # type: ignore[operator]
+            n.target = "compiled_" + n.target  # type: ignore[operator]
+            self.module.add_submodule(n.target, compiled_submod_real)  # type: ignore[operator]
+            # Finally, we have to produce inputs for use compiling the next submodule,
+            # and these need to be FakeTensors, so we execute the module under fake_mode
+            # Because parameters are not fake we patch fake tensor mode to allow non fake inputs
+            with (
+                self.fake_mode,
+                mock.patch.object(self.fake_mode, "allow_non_fake_inputs", True),
+            ):
+                if has_tracing_context and invoked_aot_autograd:
+                    tracing_ctx = torch._guards.TracingContext.try_get()
+                    assert tracing_ctx is not None
+                    # DDPOptimizer maintains 1 dynamo graph -> N AOT graphs
+                    # Dynamo only has 1 tracing context, so it needs to maintain all N AOT metadata instances
+                    ddp_ctx = tracing_ctx.ddp_optimizer_ctx
+                    assert ddp_ctx is not None
+                    assert tracing_ctx.fw_metadata is not None
+                    ddp_ctx.curr_bucket += 1
+                    ddp_ctx.metadata_per_bucket.append(tracing_ctx.fw_metadata)
+                    out = compiled_submod_real(*new_args, **kwargs)
+                    # output should be fake or subclass
+                    assert all(
+                        (not isinstance(t, torch.Tensor) or type(t) is not torch.Tensor)
+                        for t in (out if isinstance(out, (list, tuple)) else [out])
+                    )
+                    return out
+                else:
+                    # Otherwise run the fakified copy of the uncompiled submodule.
+                    return curr_submod(*new_args, **kwargs)
+        else:
+            # placeholder or output nodes don't need to get compiled, just executed
+            return getattr(self, n.op)(n.target, new_args, kwargs)
+class DDPOptimizer:
+    """Note [DDPOptimizer]
+    DDPOptimizer applies when dynamo compiles models wrapped in DistributedDataParallel (DDP),
+    breaking the dynamo graph into chunks to compile separately, with the breaks aligning to
+    the boundaries of gradient-allreduce buckets chosen by DDP.
+    Background/Motivation
+     - DDP uses allreduce collectives to synchronize partial gradients computed on different workers
+     - DDP groups gradient allreduces into 'buckets' to optimize communication efficiency of all-reduce
+     - Parameters grouped into buckets are assumed to be adjacent in time, so they become ready
+       at around the same time during backward and thus can share the same allreduce efficiently
+     - Allreduces must overlap with backward compute for optimal training performance
+     - DDP schedules allreduces using 'hooks' fired from the c++ autograd engine in pytorch, which
+       operates when individual grads become 'ready'
+     - Dynamo+AOTAutograd produces a single fused graph that runs 'atomically' from the perspective of the
+       autograd engine, such that all gradients become 'ready' at the same time.  Hooks fire after the whole
+       fused backward function executes, preventing any overlap of compute and communication
+    Algorithm
+     - DDPOptimizer starts off with an FX graph traced by dynamo which represents forward.  It can traverse
+       this graph in reverse order to determine the true order that gradients will become ready during backward.
+     - Parameter sizes are counted in reverse order, up to a bucket size limit, at which point a new bucket is started
+       and a graph break introduced
+     - Each of the subgraphs is compiled by the compiler provided to dynamo by the user, and then fused back together
+       into an outer module that is returned to the user
+    Notes
+     - It would be better to enforce (by adding an API to DDP) that the bucket splits chosen here are used by DDP,
+       and that DDP does not need to detect or optimize bucket order by observing execution at runtime, as it does
+       in eager.
+     - If Dynamo can't capture a whole graph for the portion of the model wrapped by DDP, this algorithm will currently
+       produce splits that do not necessarily align with the buckets used by DDP.  This should result in performance
+       degradation approaching the baseline case where graph-splits are not used, but not worse.
+     - If the backend compiler fails to compile a single subgraph, it will execute eagerly despite the rest of the
+       subgraphs being compiled
+     - DDP has a 'parameters_and_buffers_to_ignore' field, which DDPOptimizer attempts to honor by reading markers
+       left by DDP on individual parameters.  In cases where other transformations, such as reparameterization, are
+       also used, the ignore markers could be lost.  If DDPOptimizer fails to ignore a parameter ignored by DDP,
+       it is not catastrophic but could impact performance by choosing sub-optimal bucket splits.
+     - DDPOptimizer always ignores all buffers, regardless of their ignore flag, since buffers do not require gradients,
+       and therefore aren't allreduced by DDP.  (They are broadcast during forward, but this is not covered by
+       DDPOptimizer)
+    Debugging
+     - Generally, it is easiest to debug DDPOptimizer in a single process program, using pdb.
+     - In many cases, the log messages are helpful (they show bucket size assignments)-
+       just set TORCH_LOGS env to include any of 'dynamo', 'distributed', or 'dist_ddp'.
+     - See `benchmarks/dynamo/distributed.py` for a simple harness that will run a toy model or a torchbench model
+       in a single process (or with torchrun, in multiple processes)
+    Args:
+        bucket_bytes_cap (int): Controls the size of buckets, in bytes, used to determine graphbreaks.  Should be
+            set to match the equivalent parameter on the original DDP module.
+        backend_compile_fn (callable): A dynamo compiler function, to be invoked to compile each subgraph.
+        first_bucket_cap (int): Controls the size of the first bucket.  Should match DDP's first bucket cap.  DDP
+            special-cases the first bucket size since it is sometimes optimal to start a small allreduce early.
+    """
+    def __init__(
+        self,
+        bucket_bytes_cap: int,
+        backend_compile_fn: CompilerFn,
+        first_bucket_cap: Optional[int] = None,
+    ) -> None:
+        if first_bucket_cap is not None:
+            self.first_bucket_cap = first_bucket_cap
+        elif torch.distributed.is_available():
+            # this constant comes from C10D lib which is not always built
+            self.first_bucket_cap = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES
+        else:
+            self.first_bucket_cap = bucket_bytes_cap
+        self.bucket_bytes_cap = bucket_bytes_cap
+        assert self.first_bucket_cap <= self.bucket_bytes_cap, (
+            "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
+        )
+        self.backend_compile_fn = backend_compile_fn
+    def _ignore_parameter(self, parameter: torch.nn.Parameter) -> bool:
+        return hasattr(parameter, "_ddp_ignored") and parameter._ddp_ignored
+    def add_param(self, bucket: Bucket, param: torch.nn.Parameter, name: str) -> None:
+        """Account for ``param`` in ``bucket``.
+
+        Adds the byte size of the parameter's untyped storage to ``bucket.size`` and
+        records both ``name`` and ``id(param)`` on the bucket.
+        """
+        bucket.size += param.untyped_storage().nbytes()
+        bucket.params.append(name)
+        bucket.param_ids.append(id(param))
+    def add_module_params_to_bucket(
+        self,
+        mod: torch.nn.Module,
+        bucket: Bucket,
+        processed_modules: set[torch.nn.Module],
+        prefix: str,
+    ) -> None:
+        """Add every countable parameter of ``mod`` to ``bucket``.
+
+        ``mod`` is inserted into ``processed_modules`` (mutated in place) so callers
+        can avoid counting the same module twice.  Parameters that do not require
+        grad, or that are flagged as ignored, are skipped.  Each recorded name is
+        ``f"{prefix}_{name}"``.
+        """
+        processed_modules.add(mod)
+        for name, param in mod.named_parameters():
+            if param.requires_grad and not self._ignore_parameter(param):
+                self.add_param(bucket, param, f"{prefix}_{name}")
+    def add_param_args(self, bucket: Bucket, node: fx.Node) -> None:
+        for arg in node.args:
+            if not isinstance(arg, torch.fx.node.Node):
+                continue
+            if arg.op != "placeholder":
+                continue
+            param = arg.meta["example_value"]
+            if (
+                isinstance(param, torch.nn.Parameter)
+                and param.requires_grad
+                and not self._ignore_parameter(param)
+            ):
+                self.add_param(bucket, param, str(arg.target))
+    def compile_fn(
+        self, gm: fx.GraphModule, example_inputs: list[torch.Tensor]
+    ) -> CompiledFn:
+        """
+        Implements graph splitting, first determining a set of buckets by counting
+        parameter sizes in reverse graph order, then invoking the user/backend compiler
+        to compile each subgraph. Finally, stitches compiled graphs into one graphmodule
+        and returns its callable.
+
+        Args:
+            gm: the FX graph module to split and compile.
+            example_inputs: inputs used to detect the fake mode and to drive the
+                per-submodule compilation run.
+
+        Returns:
+            The result of ``backend_compile_fn(gm, example_inputs)`` when only one
+            bucket was produced, otherwise the split graph module whose submodules
+            have been replaced by their compiled versions.
+        """
+        # 1: compute the partition map according to DDP bucket logic
+        # buckets[0] is always the bucket currently being filled; new buckets are
+        # inserted at the front because the graph is walked in reverse.
+        buckets = [Bucket()]  # (size, param_names)
+        processed_modules: set[torch.nn.Module] = set()
+        for node in reversed(gm.graph.nodes):
+            if node.op in ("output", "placeholder"):
+                continue
+            # `and` binds tighter than `or`: the first-bucket cap only applies while
+            # there is exactly one bucket.
+            if (
+                buckets[0].size >= self.bucket_bytes_cap
+                or len(buckets) == 1
+                and buckets[0].size >= self.first_bucket_cap
+            ):
+                if bucket_has_external_output(buckets[0]):
+                    buckets.insert(0, Bucket())
+                else:
+                    # continue building this bucket past the point of filling its parameter capacity,
+                    # to increase chances it contains at least one node that is either a global output or
+                    # passed as input to a subsequent graph
+                    if buckets[0].opcount_increased_to_capture_external_output == 0:
+                        buckets[0].paramsize_before_opcount_increase = buckets[0].size
+                    buckets[0].opcount_increased_to_capture_external_output += 1
+            if node.op == "call_function":
+                self.add_param_args(buckets[0], node)
+            elif node.op == "call_module":
+                target_mod = gm.get_submodule(node.target)
+                if target_mod not in processed_modules:
+                    self.add_module_params_to_bucket(
+                        target_mod, buckets[0], processed_modules, node.target
+                    )
+            elif node.op == "call_method":
+                if isinstance(node.args[0].target, str):
+                    target_mod = None
+                    try:
+                        target_mod = gm.get_submodule(node.args[0].target)
+                    except AttributeError:
+                        # the receiver is not a submodule of gm; nothing to count here
+                        pass
+                    if target_mod is not None and target_mod not in processed_modules:
+                        # NOTE(review): the prefix passed here is the method name
+                        # (node.target), not the module path -- confirm this is intended.
+                        self.add_module_params_to_bucket(
+                            target_mod, buckets[0], processed_modules, node.target
+                        )
+                    # This handles situations like  tmp = torch.mm(x, self.weight.t())
+                    # t: "f32[512, 512]" = l_self_seq_2_weight.t();  l_self_seq_2_weight = None
+                    # tmp: "f32[512, 512]" = torch.mm(input_2, t);  input_2 = t = None
+                    self.add_param_args(buckets[0], node)
+            elif node.op == "get_attr":
+                maybe_param = getattr(gm, node.target)
+                if (
+                    isinstance(maybe_param, torch.nn.Parameter)
+                    and maybe_param.requires_grad
+                    and not self._ignore_parameter(maybe_param)
+                ):
+                    self.add_param(buckets[0], maybe_param, node.target)
+            # All nodes have to be mapped to a bucket, even if they don't have their own params
+            # Ignored params still end up in buckets, we just don't count them towards the capacity
+            buckets[0].nodes.append(node)
+        if len(buckets) > 1 and buckets[0].size == 0:
+            # we collected a small preamble graph with ops that don't include parameters, fuse it back
+            buckets[1].nodes.extend(buckets[0].nodes)
+            assert len(buckets[0].params) == 0, "Params should be empty if size is 0"
+            del buckets[0]
+        # stash buckets for testing/debugging purposes
+        self.buckets = buckets
+        pretty_print_buckets(buckets, self.bucket_bytes_cap)
+        if len(buckets) == 1:
+            # bypass split/fuse logic if there is only one bucket
+            return self.backend_compile_fn(gm, example_inputs)
+        # 2: partition the graphmodule according to bucket capacity
+        partition_map = {}
+        for idx, b in enumerate(buckets):
+            for node in b.nodes:
+                partition_map[node] = idx
+        split_gm = fx.passes.split_module.split_module(
+            gm,
+            None,  # type: ignore[arg-type]
+            lambda node: partition_map[node],
+        )
+        # See note [Assumption on Dynamo Metadata]
+        propagate_dynamo_source(gm, split_gm)
+        propagate_metadata(gm, split_gm)
+        debug_str = (
+            f"\n---orig graph---\n{gm.graph}\n"
+            + f"\n---split graph---\n{split_gm.graph}\n"
+        )
+        for name, module in split_gm.named_modules():
+            if "." not in name and len(name):
+                # only print the submod graphs, not their children
+                debug_str += f"\n---{name} graph---\n{module.graph}\n"
+        debug_str += "\n---------------\n"
+        ddp_graph_log.debug(debug_str)
+        trace_structured(
+            "optimize_ddp_split_graph",
+            payload_fn=lambda: split_gm.print_readable(print_output=False),
+        )
+        for name, module in split_gm.named_modules():
+            if "." not in name and len(name):
+                # NOTE(review): these lambdas close over the loop variables; this is
+                # only correct if trace_structured invokes them before the next
+                # iteration -- confirm.
+                trace_structured(
+                    "optimize_ddp_split_child",
+                    lambda: {"name": name},
+                    payload_fn=lambda: module.print_readable(print_output=False),
+                )
+        # 3: compile each submodule, reusing the fake mode of the inputs when one exists
+        fake_mode = detect_fake_mode(example_inputs)
+        if fake_mode is None:
+            fake_mode = torch._subclasses.fake_tensor.FakeTensorMode()
+        submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn, fake_mode)
+        with torch._dynamo.utils._disable_saved_tensors_hooks_during_tracing():
+            submod_compiler.run(*example_inputs)
+        split_gm.recompile()
+        ddp_graph_log.debug(
+            "\n---final graph---\n%s\n---------------\n", split_gm.graph
+        )
+        return split_gm

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/onnxrt.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# This backend is maintained by ONNX team. To direct issues
+# to the right people, please tag related GitHub issues with `module: onnx`.
+#
+# Maintainers' Github IDs: wschin, xadupre
+# from torch.onnx._internal.onnxruntime import (
+#     is_onnxrt_backend_supported,
+#     torch_compile_backend,
+# )
+# from .registry import register_backend
+"""
+Placeholder for onnxruntime backend for dynamo
+"""
+# def has_onnxruntime():
+#     # FIXME: update test/dynamo/test_backends.py to call is_onnxrt_backend_supported()
+#     return is_onnxrt_backend_supported()
+# if is_onnxrt_backend_supported():
+#     register_backend(name="onnxrt", compiler_fn=torch_compile_backend)
+# else:
+#     def information_displaying_backend(*args, **kwargs):
+#         raise ImportError(
+#             "onnxrt is not registered as a backend. "
+#             "Please make sure all dependencies such as "
+#             "numpy, onnx, onnxscript, and onnxruntime-training are installed. "
+#             "Suggested procedure to fix dependency problem:\n"
+#             "  (1) pip or conda install numpy onnx onnxscript onnxruntime-training.\n"
+#             "  (2) Open a new python terminal.\n"
+#             "  (3) Call the API `torch.onnx.is_onnxrt_backend_supported()`:\n"
+#             "  (4)   If it returns `True`, then you can use `onnxrt` backend.\n"
+#             "  (5)   If it returns `False`, please execute the package importing section in "
+#             "torch/onnx/_internal/onnxruntime.py under pdb line-by-line to see which import fails."
+#         )
+#     register_backend(name="onnxrt", compiler_fn=information_displaying_backend)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/registry.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""
+This module implements TorchDynamo's backend registry system for managing compiler backends.
+The registry provides a centralized way to register, discover and manage different compiler
+backends that can be used with torch.compile(). It handles:
+- Backend registration and discovery through decorators and entry points
+- Lazy loading of backend implementations
+- Lookup and validation of backend names
+- Categorization of backends using tags (debug, experimental, etc.)
+Key components:
+- CompilerFn: Type for backend compiler functions that transform FX graphs
+- _BACKENDS: Registry mapping backend names to entry points
+- _COMPILER_FNS: Registry mapping backend names to loaded compiler functions
+Example usage:
+    @register_backend
+    def my_compiler(fx_graph, example_inputs):
+        # Transform FX graph into optimized implementation
+        return compiled_fn
+    # Use registered backend
+    torch.compile(model, backend="my_compiler")
+The registry also supports discovering backends through setuptools entry points
+in the "torch_dynamo_backends" group. Example:
+```
+setup.py
+---
+from setuptools import setup
+setup(
+    name='my_torch_backend',
+    version='0.1',
+    packages=['my_torch_backend'],
+    entry_points={
+        'torch_dynamo_backends': [
+            # name = path to entry point of backend implementation
+            'my_compiler = my_torch_backend.compiler:my_compiler_function',
+        ],
+    },
+)
+```
+```
+my_torch_backend/compiler.py
+---
+def my_compiler_function(fx_graph, example_inputs):
+    # Transform FX graph into optimized implementation
+    return compiled_fn
+```
+Using `my_compiler` backend:
+```
+import torch
+model = ...  # Your PyTorch model
+optimized_model = torch.compile(model, backend="my_compiler")
+```
+"""
+import functools
+import logging
+from collections.abc import Callable, Sequence
+from importlib.metadata import EntryPoint
+from typing import Any, Optional, Protocol, Union
+import torch
+from torch import fx
+log = logging.getLogger(__name__)
+class CompiledFn(Protocol):
+    """Structural type of a compiled graph: called with tensors, returns a tuple of tensors."""
+    def __call__(self, *args: torch.Tensor) -> tuple[torch.Tensor, ...]: ...
+CompilerFn = Callable[[fx.GraphModule, list[torch.Tensor]], CompiledFn]
+_BACKENDS: dict[str, Optional[EntryPoint]] = {}
+_COMPILER_FNS: dict[str, CompilerFn] = {}
+def register_backend(
+    compiler_fn: Optional[CompilerFn] = None,
+    name: Optional[str] = None,
+    tags: Sequence[str] = (),
+) -> Callable[..., Any]:
+    """
+    Decorator to add a given compiler to the registry to allow calling
+    `torch.compile` with string shorthand.  Note: for projects not
+    imported by default, it might be easier to pass a function directly
+    as a backend and not use a string.
+    Args:
+        compiler_fn: Callable taking a FX graph and fake tensor inputs
+        name: Optional name, defaults to `compiler_fn.__name__`
+        tags: Optional set of string tags to categorize backend with
+    """
+    if compiler_fn is None:
+        # @register_backend(name="") syntax
+        return functools.partial(register_backend, name=name, tags=tags)  # type: ignore[return-value]
+    assert callable(compiler_fn)
+    name = name or compiler_fn.__name__
+    assert name not in _COMPILER_FNS, f"duplicate name: {name}"
+    if compiler_fn not in _BACKENDS:
+        _BACKENDS[name] = None
+    _COMPILER_FNS[name] = compiler_fn
+    compiler_fn._tags = tuple(tags)  # type: ignore[attr-defined]
+    return compiler_fn
+register_debug_backend = functools.partial(register_backend, tags=("debug",))
+register_experimental_backend = functools.partial(
+    register_backend, tags=("experimental",)
+)
+def lookup_backend(compiler_fn: Union[str, CompilerFn]) -> CompilerFn:
+    """Expand backend strings to functions"""
+    if isinstance(compiler_fn, str):
+        if compiler_fn not in _BACKENDS:
+            _lazy_import()
+        if compiler_fn not in _BACKENDS:
+            from ..exc import InvalidBackend
+            raise InvalidBackend(name=compiler_fn)
+        if compiler_fn not in _COMPILER_FNS:
+            entry_point = _BACKENDS[compiler_fn]
+            if entry_point is not None:
+                register_backend(compiler_fn=entry_point.load(), name=compiler_fn)
+        compiler_fn = _COMPILER_FNS[compiler_fn]
+    return compiler_fn
+# NOTE: can't type this due to public api mismatch; follow up with dev team
+def list_backends(exclude_tags=("debug", "experimental")) -> list[str]:  # type: ignore[no-untyped-def]
+    """
+    Return valid strings that can be passed to:
+        torch.compile(..., backend="name")
+    """
+    _lazy_import()
+    exclude_tags_set = set(exclude_tags or ())
+    backends = [
+        name
+        for name in _BACKENDS
+        if name not in _COMPILER_FNS
+        or not exclude_tags_set.intersection(_COMPILER_FNS[name]._tags)  # type: ignore[attr-defined]
+    ]
+    return sorted(backends)
+@functools.cache
+def _lazy_import() -> None:
+    """Populate the registries on first use.
+
+    Calls ``import_submodule`` on the ``backends`` package (presumably importing each
+    backend module so its registration code runs -- confirm against
+    ``utils.import_submodule``), imports the dynamo minifier backend, and then
+    discovers entry-point backends.  ``functools.cache`` on this zero-argument
+    function means the body runs at most once.
+    """
+    from .. import backends
+    from ..utils import import_submodule
+    import_submodule(backends)
+    from ..repro.after_dynamo import dynamo_minifier_backend
+    assert dynamo_minifier_backend is not None
+    _discover_entrypoint_backends()
+@functools.cache
+def _discover_entrypoint_backends() -> None:
+    # importing here so it will pick up the mocked version in test_backends.py
+    from importlib.metadata import entry_points
+    group_name = "torch_dynamo_backends"
+    eps = entry_points(group=group_name)
+    eps_dict = {name: eps[name] for name in eps.names}
+    for backend_name in eps_dict:
+        _BACKENDS[backend_name] = eps_dict[backend_name]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dynamo/backends/torchxla.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import logging
+from collections.abc import Callable
+from typing import Any
+import torch
+from functorch.compile import make_boxed_func
+from torch import fx
+from ..backends.common import aot_autograd
+from .registry import CompiledFn, register_backend, register_experimental_backend
+log = logging.getLogger(__name__)
+@register_experimental_backend
+def openxla_eval(
+    model: fx.GraphModule, fake_tensor_inputs: list[torch.Tensor]
+) -> CompiledFn:
+    """Experimental backend: the unboxed callable built by ``xla_backend_helper``."""
+    return xla_backend_helper(model, fake_tensor_inputs, boxed=False)
+def openxla_eval_boxed(
+    model: fx.GraphModule, fake_tensor_inputs: list[torch.Tensor]
+) -> Callable[..., Any]:
+    """Same as ``openxla_eval`` but asks ``xla_backend_helper`` for the boxed variant."""
+    return xla_backend_helper(model, fake_tensor_inputs, boxed=True)
+def xla_backend_helper(
+    model: fx.GraphModule, fake_tensor_inputs: list[torch.Tensor], boxed: bool = False
+) -> Callable[..., Any]:
+    """Build a callable that compiles ``model`` through torch_xla on its first call.
+
+    Args:
+        model: the FX graph module to hand to the XLA dynamo bridge.
+        fake_tensor_inputs: not read by this function.
+        boxed: when True the callable is wrapped with ``make_boxed_func``.
+
+    Raises:
+        ImportError: if ``torch_xla.core.dynamo_bridge`` cannot be imported.
+    """
+    try:
+        import torch_xla.core.dynamo_bridge as bridge
+    except ImportError as e:
+        raise ImportError(
+            "Please follow the instruction in https://github.com/pytorch/xla#pytorchxla to install torch_xla"
+        ) from e
+    compiled_graph = None
+    def fwd(*args: torch.Tensor) -> Any:
+        nonlocal model
+        nonlocal compiled_graph
+        if compiled_graph is None:
+            # Compile on the first call using the real arguments, then drop this
+            # closure's reference to the original graph module.
+            compiled_graph = bridge.extract_compiled_graph(model, args)
+            del model
+        return compiled_graph(*args)
+    return make_boxed_func(fwd) if boxed else fwd
+# Wrap the boxed XLA forward compiler with aot_autograd and register it as "openxla".
+openxla = aot_autograd(
+    fw_compiler=openxla_eval_boxed,
+)
+register_backend(name="openxla", compiler_fn=openxla)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (8.73 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (1.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/converter.cpython-312.pyc ADDED Viewed

Binary file (78.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/error.cpython-312.pyc ADDED Viewed

Binary file (2.42 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-312.pyc ADDED Viewed

Binary file (47.7 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/pass_base.cpython-312.pyc ADDED Viewed

Binary file (26 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/tools.cpython-312.pyc ADDED Viewed

Binary file (6.17 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (73 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/verifier.cpython-312.pyc ADDED Viewed

Binary file (27.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/__pycache__/wrappers.cpython-312.pyc ADDED Viewed

Binary file (16.3 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/case.py ADDED Viewed

	@@ -0,0 +1,175 @@

+# mypy: allow-untyped-defs
+import inspect
+import re
+import string
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Optional
+from types import ModuleType
+import torch
+_TAGS: dict[str, dict[str, Any]] = {
+    "torch": {
+        "cond": {},
+        "dynamic-shape": {},
+        "escape-hatch": {},
+        "map": {},
+        "dynamic-value": {},
+        "operator": {},
+        "mutation": {},
+    },
+    "python": {
+        "assert": {},
+        "builtin": {},
+        "closure": {},
+        "context-manager": {},
+        "control-flow": {},
+        "data-structure": {},
+        "standard-library": {},
+        "object-model": {},
+    },
+}
+class SupportLevel(Enum):
+    """
+    Indicates at what stage the feature
+    used in the example is handled in export.
+    """
+    # NOTE(review): member meanings are taken from their names only.
+    SUPPORTED = 1
+    NOT_SUPPORTED_YET = 0
+ArgsType = tuple[Any, ...]
+def check_inputs_type(args, kwargs):
+    if not isinstance(args, tuple):
+        raise ValueError(
+            f"Expecting args type to be a tuple, got: {type(args)}"
+        )
+    if not isinstance(kwargs, dict):
+        raise ValueError(
+            f"Expecting kwargs type to be a dict, got: {type(kwargs)}"
+        )
+    for key in kwargs:
+        if not isinstance(key, str):
+            raise ValueError(
+                f"Expecting kwargs keys to be a string, got: {type(key)}"
+            )
+def _validate_tag(tag: str):
+    """Check that a dotted ``tag`` names a path in the nested ``_TAGS`` registry.
+
+    Raises:
+        AssertionError: if a component contains characters other than lowercase
+            ASCII letters and "-".
+        ValueError: if a component is not present at its level of ``_TAGS``.
+    """
+    parts = tag.split(".")
+    t = _TAGS
+    for part in parts:
+        assert set(part) <= set(
+            string.ascii_lowercase + "-"
+        ), f"Tag contains invalid characters: {part}"
+        if part in t:
+            # descend one level for the next component
+            t = t[part]
+        else:
+            raise ValueError(f"Tag {tag} is not found in registered tags.")
+@dataclass(frozen=True)
+class ExportCase:
+    """Immutable record describing one example case; validated in ``__post_init__``."""
+    example_args: ArgsType
+    description: str  # A description of the use case.
+    model: torch.nn.Module
+    name: str
+    example_kwargs: dict[str, Any] = field(default_factory=dict)
+    extra_args: Optional[ArgsType] = None  # For testing graph generalization.
+    # Tags associated with the use case. (e.g dynamic-shape, escape-hatch)
+    tags: set[str] = field(default_factory=set)
+    support_level: SupportLevel = SupportLevel.SUPPORTED
+    dynamic_shapes: Optional[dict[str, Any]] = None
+    def __post_init__(self):
+        """Validate argument containers, tags and description.
+
+        Raises:
+            ValueError: for malformed args/kwargs, an unregistered tag, or a
+                description that is not a non-empty string.
+        """
+        check_inputs_type(self.example_args, self.example_kwargs)
+        if self.extra_args is not None:
+            # extra_args carries positional arguments only, hence the empty kwargs
+            check_inputs_type(self.extra_args, {})
+        for tag in self.tags:
+            _validate_tag(tag)
+        if not isinstance(self.description, str) or len(self.description) == 0:
+            raise ValueError(f'Invalid description: "{self.description}"')
+_EXAMPLE_CASES: dict[str, ExportCase] = {}
+_MODULES: set[ModuleType] = set()
+_EXAMPLE_CONFLICT_CASES: dict[str, list[ExportCase]] = {}
+_EXAMPLE_REWRITE_CASES: dict[str, list[ExportCase]] = {}
+def register_db_case(case: ExportCase) -> None:
+    """
+    Registers a user provided ExportCase into example bank.
+    """
+    if case.name in _EXAMPLE_CASES:
+        if case.name not in _EXAMPLE_CONFLICT_CASES:
+            _EXAMPLE_CONFLICT_CASES[case.name] = [_EXAMPLE_CASES[case.name]]
+        _EXAMPLE_CONFLICT_CASES[case.name].append(case)
+        return
+    _EXAMPLE_CASES[case.name] = case
+def to_snake_case(name):
+    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
+def _make_export_case(m, name, configs):
+    """Build an ``ExportCase`` for module instance ``m`` named ``name``.
+
+    ``configs`` supplies the remaining ``ExportCase`` fields; it is not mutated.
+    If it has no "description", the module's docstring is used.
+
+    Raises:
+        TypeError: if ``m`` is not a ``torch.nn.Module`` instance.
+        AssertionError: if neither a description nor a docstring is available.
+    """
+    if not isinstance(m, torch.nn.Module):
+        raise TypeError("Export case class should be a torch.nn.Module.")
+    if "description" not in configs:
+        # Fallback to docstring if description is missing.
+        assert (
+            m.__doc__ is not None
+        ), f"Could not find description or docstring for export case: {m}"
+        configs = {**configs, "description": m.__doc__}
+    # pyrefly: ignore [bad-argument-type]
+    return ExportCase(**{**configs, "model": m, "name": name})
+def export_case(**kwargs):
+    """
+    Decorator for registering a user provided case into example bank.
+    The case is named after the last component of the defining module's name, and
+    the decorated object is replaced by the resulting ``ExportCase``.
+    """
+    def wrapper(m):
+        configs = kwargs
+        module = inspect.getmodule(m)
+        # The case name is derived from the module, so a module may hold only one case.
+        if module in _MODULES:
+            raise RuntimeError("export_case should only be used once per example file.")
+        assert module is not None
+        _MODULES.add(module)
+        module_name = module.__name__.split(".")[-1]
+        case = _make_export_case(m, module_name, configs)
+        register_db_case(case)
+        return case
+    return wrapper
+def export_rewrite_case(**kwargs):
+    def wrapper(m):
+        configs = kwargs
+        parent = configs.pop("parent")
+        assert isinstance(parent, ExportCase)
+        key = parent.name
+        if key not in _EXAMPLE_REWRITE_CASES:
+            _EXAMPLE_REWRITE_CASES[key] = []
+        configs["example_args"] = parent.example_args
+        case = _make_export_case(m, to_snake_case(m.__name__), configs)
+        _EXAMPLE_REWRITE_CASES[key].append(case)
+        return case
+    return wrapper

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/gen_example.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+import sys
+import torch._export.db.examples as examples
+TEMPLATE = '''import torch
+def {case_name}(x):
+    """
+    """
+    return
+'''
+# Usage: run with exactly one argument, the new case name; writes <case_name>.py
+# from TEMPLATE into the examples package directory.
+if __name__ == "__main__":
+    assert len(sys.argv) == 2
+    # Relative path built from the package name ("torch/_export/db/examples"), so
+    # the script has to be run from the directory that contains `torch/`.
+    root_dir = examples.__name__.replace(".", "/")
+    assert os.path.exists(root_dir)
+    with open(os.path.join(root_dir, sys.argv[1] + ".py"), "w") as f:
+        print("Writing to", f.name, "...")
+        f.write(TEMPLATE.format(case_name=sys.argv[1]))

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/db/logging.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import Optional
+def exportdb_error_message(case_name: str) -> str:
+    from .examples import all_examples
+    from torch._utils_internal import log_export_usage
+    ALL_EXAMPLES = all_examples()
+    # Detect whether case_name is really registered in exportdb.
+    if case_name in ALL_EXAMPLES:
+        url_case_name = case_name.replace("_", "-")
+        return f"See {case_name} in exportdb for unsupported case. \
+                https://pytorch.org/docs/main/generated/exportdb/index.html#{url_case_name}"
+    else:
+        log_export_usage(
+            event="export.error.casenotregistered",
+            message=case_name,
+        )
+        return f"{case_name} is unsupported."
+def get_class_if_classified_error(e: Exception) -> Optional[str]:
+    """
+    Returns a string case name if the export error e is classified.
+    Returns None otherwise.
+    The lookup uses the exact type of e, so subclasses of the listed error types
+    are not classified.
+    """
+    from torch._dynamo.exc import TorchRuntimeError, Unsupported, UserError
+    ALWAYS_CLASSIFIED = "always_classified"
+    DEFAULT_CLASS_SIGIL = "case_name"
+    # add error types that should be classified, along with any attribute name
+    # whose presence acts like a sigil to further distinguish which errors of
+    # that type should be classified. If the attribute name is None, then the
+    # error type is always classified.
+    _ALLOW_LIST = {
+        Unsupported: DEFAULT_CLASS_SIGIL,
+        UserError: DEFAULT_CLASS_SIGIL,
+        TorchRuntimeError: None,
+    }
+    if type(e) in _ALLOW_LIST:
+        # pyrefly: ignore [index-error]
+        attr_name = _ALLOW_LIST[type(e)]
+        if attr_name is None:
+            return ALWAYS_CLASSIFIED
+        # The sigil attribute's value is the class; None when the attribute is absent.
+        return getattr(e, attr_name, None)
+    return None

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/node_metadata.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Any
+# Alias for the type of a single metadata value; it is unconstrained (Any).
+NodeMetadataValue = Any
+# Metadata keys that NodeMetadata.__setitem__ refuses to assign.
+PROTECTED_KEYS: set[str] = {
+    "val",
+    "stack_trace",
+    "nn_module_stack",
+    "debug_handle",
+    "tensor_meta",
+}
+class NodeMetadata:
+    """
+    Dict-like wrapper around metadata that rejects item assignment to any key
+    listed in PROTECTED_KEYS.
+    """
+    def __init__(self, data: dict[str, Any]) -> None:
+        # Shallow-copy so later writes do not touch the caller's dict. The
+        # initial contents are not checked against PROTECTED_KEYS.
+        self.data: dict[str, Any] = data.copy()
+    def __getitem__(self, key: str) -> NodeMetadataValue:
+        # A missing key raises KeyError, as with a plain dict.
+        return self.data[key]
+    def __setitem__(self, key: str, value: NodeMetadataValue) -> NodeMetadataValue:
+        # Raises RuntimeError for protected keys.
+        # NOTE(review): annotated as returning NodeMetadataValue, but the
+        # body has no return statement, so the call evaluates to None.
+        if key in PROTECTED_KEYS:
+            raise RuntimeError(f"Could not override node key: {key}")
+        self.data[key] = value
+    def __contains__(self, key: str) -> bool:
+        return key in self.data
+    def copy(self) -> "NodeMetadata":
+        # Returns an independent wrapper; the dict is shallow-copied here and
+        # once more inside __init__.
+        return NodeMetadata(self.data.copy())

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/pass_infra/proxy_value.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# pyre-strict
+from collections.abc import Iterable, Iterator
+from typing import Generic, TypeVar, Union
+import torch
+# Element type produced when iterating over a ProxyValue's data.
+_T = TypeVar("_T")
+class ProxyValue(Generic[_T]):
+    """
+    Pairs a concrete value (`data`) with the torch.fx Proxy or Node that
+    stands for it in a graph.
+    """
+    # pyre-ignore
+    def __init__(self, data: Iterable[_T], proxy: Union[torch.fx.Proxy, torch.fx.Node]):
+        # pyre-ignore
+        self.data = data
+        # Despite the parameter name, this may hold either a Proxy or a Node.
+        self.proxy_or_node = proxy
+    @property
+    def node(self) -> torch.fx.Node:
+        """Return the underlying Node, unwrapping a Proxy if necessary."""
+        if isinstance(self.proxy_or_node, torch.fx.Node):
+            return self.proxy_or_node
+        assert isinstance(self.proxy_or_node, torch.fx.Proxy)
+        return self.proxy_or_node.node
+    @property
+    def proxy(self) -> torch.fx.Proxy:
+        """Return the attached Proxy; raise RuntimeError if only a Node is held."""
+        if not isinstance(self.proxy_or_node, torch.fx.Proxy):
+            raise RuntimeError(
+                f"ProxyValue doesn't have attached Proxy object. Node: {self.proxy_or_node.format_node()}"
+            )
+        return self.proxy_or_node
+    def to_tensor(self) -> torch.Tensor:
+        """Return `data`, asserting that it is a torch.Tensor."""
+        assert isinstance(self.data, torch.Tensor)
+        return self.data
+    def is_tensor(self) -> bool:
+        """Report whether `data` is a torch.Tensor."""
+        return isinstance(self.data, torch.Tensor)
+    # pyre-ignore
+    def __iter__(self) -> Iterator[_T]:
+        # Iteration is delegated to the wrapped data.
+        yield from self.data
+    def __bool__(self) -> bool:
+        # Truthiness is that of the wrapped data.
+        return bool(self.data)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .replace_view_ops_with_view_copy_ops_pass import ReplaceViewOpsWithViewCopyOpsPass

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (327 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-312.pyc ADDED Viewed

Binary file (11.7 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-312.pyc ADDED Viewed

Binary file (6.99 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/constant_folding.cpython-312.pyc ADDED Viewed

Binary file (14.9 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-312.pyc ADDED Viewed

Binary file (4.92 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/insert_custom_op_guards.cpython-312.pyc ADDED Viewed

Binary file (5.09 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-312.pyc ADDED Viewed

Binary file (2.27 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_autocast_with_hop_pass.cpython-312.pyc ADDED Viewed

Binary file (8.49 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-312.pyc ADDED Viewed

Binary file (6.05 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-312.pyc ADDED Viewed

Binary file (3.7 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/__pycache__/replace_with_hop_pass_util.cpython-312.pyc ADDED Viewed

Binary file (8.68 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/_node_metadata_hook.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# mypy: allow-untyped-defs
+import contextlib
+from typing import Any, Optional
+import torch
+import torch.utils._pytree as pytree
+from torch._dispatch.python import enable_python_dispatcher
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.graph_module import GraphModule
+# Placeholder used as both the key and the two tuple entries of the fallback
+# nn_module_stack that _node_metadata_hook assigns when neither the new node
+# nor its first Node argument already carries one.
+_EMPTY_NN_MODULE_STACK_KEY = "_empty_nn_module_stack_from_metadata_hook"
+def _node_metadata_hook(
+    node: torch.fx.Node,
+    metadata: Optional[dict[str, Any]] = None,
+    fake_mode: Optional[FakeTensorMode] = None,
+) -> None:
+    """
+    Hook for adding the appropriate metadata to nodes that are created during a
+    pass using graph.create_node. An example of how to use it:
+    ```
+    with _set_node_metadata_hook(gm,
+        functools.partial(_node_metadata_hook, metadata={"stack_trace": "file"})
+    ):
+        pass(gm)
+    ```
+    This hook does not work for all generic cases -- specifically it assumes
+    that nodes being added are only call_function nodes, and copies over the
+    first argument node's nn_module_stack.
+    Args:
+        node: the newly created node; must be a call_function node with a
+            callable target (asserted below).
+        metadata: extra key/value pairs written into node.meta after "val"
+            has been populated.
+        fake_mode: context entered while evaluating the target on the
+            arguments' "val" entries; a null context is used when omitted.
+    """
+    # pyrefly: ignore [bad-assignment]
+    fake_mode = fake_mode or contextlib.nullcontext()
+    assert node.op == "call_function" and callable(node.target), (
+        f"node: {node}, target: {node.target}"
+    )
+    # An OpOverload whose schema declares no returns gets val=None without
+    # being executed.
+    if (
+        isinstance(node.target, torch._ops.OpOverload)
+        and len(node.target._schema.returns) == 0
+    ):
+        node.meta["val"] = None
+    else:
+        # Replace every Node in args/kwargs by its recorded "val" and run the
+        # target on those values to compute this node's "val".
+        fake_args, fake_kwargs = pytree.tree_map_only(
+            torch.fx.Node, lambda arg: arg.meta["val"], (node.args, node.kwargs)
+        )
+        # pyrefly: ignore [bad-context-manager]
+        with fake_mode, enable_python_dispatcher():
+            fake_res = node.target(*fake_args, **fake_kwargs)
+        node.meta["val"] = fake_res
+    # Caller-supplied metadata is applied after "val", so it can override it.
+    if metadata is not None:
+        for k, v in metadata.items():
+            node.meta[k] = v
+    # Copy over metadata from argument nodes
+    arg_meta = [
+        arg.meta
+        for arg in pytree.tree_flatten((node.args, node.kwargs))[0]
+        if isinstance(arg, torch.fx.Node)
+    ]
+    # Without any Node argument, nn_module_stack and torch_fn are left unset.
+    if len(arg_meta) == 0:
+        return
+    # Only the first Node argument's metadata is consulted.
+    arg_meta = arg_meta[0]
+    # Precedence: value already on the node, then the first argument's value,
+    # then a placeholder stack built from _EMPTY_NN_MODULE_STACK_KEY.
+    node.meta["nn_module_stack"] = node.meta.get(
+        "nn_module_stack",
+        arg_meta.get(
+            "nn_module_stack",
+            {
+                _EMPTY_NN_MODULE_STACK_KEY: (
+                    _EMPTY_NN_MODULE_STACK_KEY,
+                    _EMPTY_NN_MODULE_STACK_KEY,
+                )
+            },
+        ),
+    )
+    # Keep an existing torch_fn; otherwise derive one from the target's name
+    # and the name of the target's class.
+    node.meta["torch_fn"] = node.meta.get(
+        "torch_fn",
+        (
+            f"{node.target.__name__}_0",
+            # pyrefly: ignore [missing-attribute]
+            f"{node.target.__class__.__name__}.{node.target.__name__}",
+        ),
+    )
+@contextlib.contextmanager
+def _set_node_metadata_hook(gm: torch.fx.GraphModule, f):
+    """
+    Takes a callable which will be called after we create a new node. The
+    callable takes the newly created node as input and returns None.
+    The hook is registered on `gm` and every GraphModule among its submodules
+    for the duration of the `with` block, and unregistered afterwards even if
+    the block raises.
+    """
+    assert callable(f), "node_metadata_hook must be a callable."
+    # Add the hook to all submodules
+    for m in gm.modules():
+        if isinstance(m, GraphModule):
+            m._register_create_node_hook(f)
+    try:
+        yield
+    finally:
+        # Restore hook for all submodules
+        # (i.e. unregister `f` from every GraphModule found at exit time).
+        for m in gm.modules():
+            if isinstance(m, GraphModule):
+                m._unregister_create_node_hook(f)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py ADDED Viewed

	@@ -0,0 +1,254 @@

+# mypy: allow-untyped-defs
+import math
+import operator
+import traceback
+from functools import partial
+from typing import NamedTuple, TYPE_CHECKING
+import sympy
+import torch
+import torch.fx
+from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+from torch.utils._sympy.numbers import int_oo
+from torch.utils._sympy.value_ranges import ValueRanges
+if TYPE_CHECKING:
+    from collections.abc import Callable
+# Only InputDim is exported from this module.
+__all__ = ["InputDim"]
+class InputDim(NamedTuple):
+    # Pairs an input's name with a dimension index.
+    # NOTE(review): presumably identifies dimension `dim` of the graph input
+    # called `input_name`; it is not used elsewhere in this module -- confirm
+    # against callers.
+    input_name: str
+    dim: int
+def _convert_to_int(val):
+    """
+    Convert a sympy bound into a plain Python number.
+    Infinite bounds (sympy.oo / int_oo and their negations) map to
+    math.inf / -math.inf, and sympy.Integer maps to int. Any other value
+    raises RuntimeError.
+    """
+    # Convert simple sympy Integers into concrete int
+    if val in (sympy.oo, int_oo):
+        return math.inf
+    if val in (-sympy.oo, -int_oo):
+        return -math.inf
+    if isinstance(val, sympy.Integer):
+        return int(val)
+    raise RuntimeError("Export constraints cannot be non-integer expressions")
+def _convert_range_to_int(range: ValueRanges):
+    """
+    Return (min_val, max_val) for a ValueRanges, with each bound converted by
+    _convert_to_int. Note that the parameter name shadows the builtin `range`
+    inside this function.
+    """
+    assert isinstance(range, ValueRanges)
+    min_val = _convert_to_int(range.lower)
+    max_val = _convert_to_int(range.upper)
+    return min_val, max_val
+class _AddRuntimeAssertionsForInlineConstraintsPass(PassBase):
+    """
+    Pass that inserts runtime range checks (comparison -> scalar_tensor ->
+    _assert_async.msg) after call_function nodes whose "val" metadata involves
+    unbacked symbols listed in `range_constraints`.
+    """
+    def __init__(
+        self,
+        range_constraints: dict[sympy.Symbol, ValueRanges],
+    ):
+        super().__init__()
+        self.range_constraints: dict[sympy.Symbol, ValueRanges] = range_constraints
+        # Symbols for which this pass instance has already emitted assertions.
+        self._asserts_generated_unbacked_symbols: set[sympy.Symbol] = set()
+        # Number of assertion chains inserted so far.
+        self.counter = 0
+    def _assert_range_constraint(self, node, lower, upper, assert_msg):
+        """
+        Insert `node >= lower` and/or `node <= upper` assertions after `node`.
+        A bound is skipped when it is infinite. When both are emitted, the
+        upper-bound check is inserted after the lower-bound assertion.
+        """
+        last_node = node
+        if lower > -math.inf:
+            last_node = self._insert_assert_async(
+                last_node, operator.ge, node, lower, assert_msg
+            )
+        if upper < math.inf:
+            last_node = self._insert_assert_async(
+                last_node, operator.le, node, upper, assert_msg
+            )
+    def _insert_assert_async(self, last_node, op, lower, upper, assert_msg):
+        """
+        Inserts assert_async call_function nodes in the graph. This function is
+        called **during** the interpreter-based pass.
+        Despite their names, `lower` and `upper` are simply the left and right
+        operands passed to the comparison `op`. Three nodes are inserted
+        directly after `last_node`, in this order: the comparison, a
+        scalar_tensor wrapping its result, and the _assert_async.msg call,
+        which is returned.
+        """
+        self.counter += 1
+        graph = last_node.graph
+        with graph.inserting_after(last_node):
+            cmp = graph.call_function(op, (lower, upper), {})
+        with graph.inserting_after(cmp):
+            cmp_tensor = graph.call_function(
+                torch.ops.aten.scalar_tensor.default, (cmp,), {}
+            )
+        with graph.inserting_after(cmp_tensor):
+            assert_async = graph.call_function(
+                torch.ops.aten._assert_async.msg,
+                (cmp_tensor, assert_msg),
+                {},
+            )
+        return assert_async
+    def call(self, graph_module) -> PassResult:
+        """
+        Run the pass over `graph_module` and every GraphModule submodule.
+        Returns PassResult(graph_module, False) when nothing was inserted and
+        the instance is exactly this class, otherwise
+        PassResult(graph_module, True).
+        """
+        # Symbols that already have _assert_scalar-based checks in the graph
+        # are skipped below.
+        self.existing_inline_assertions = _get_existing_inline_assertions(
+            graph_module, self.range_constraints
+        )
+        for module in graph_module.modules():
+            if not isinstance(module, torch.fx.GraphModule):
+                continue
+            for node in module.graph.nodes:
+                if node.op != "call_function":
+                    continue
+                if "val" not in node.meta:
+                    continue
+                val = node.meta["val"]
+                # In general, we may have to deal with the case such as: ret[1].shape[0].
+                # We need first find out what symbols require assertion, then we need to follow the path
+                # from ret to the symbol, construct the proxies along the way and construct the messages
+                # piece-wise at the same time.
+                #
+                # We use post-order traversal to collect all the proxies callbacks needed, construct
+                # the error message callbacks, and at the top-level traversal tree we execute all the callbacks.
+                # We need the callbacks because, in order to call the function to create a proxy for shape[0], we
+                # need the proxy for shape, which further requires the proxy for ret[1], etc.
+                def add_assertions(val):
+                    # Returns two parallel lists: callbacks that insert the
+                    # assertion nodes, and the message suffix for each one.
+                    call_backs: list[Callable] = []
+                    messages: list[str] = []
+                    if isinstance(val, (torch.SymInt, torch.SymFloat, torch.SymBool)):
+                        symbol = val.node.expr
+                        if symbol in self.existing_inline_assertions:
+                            return call_backs, messages
+                        if isinstance(symbol, sympy.Symbol) and free_unbacked_symbols(
+                            symbol
+                        ):
+                            # Emit assertions at most once per symbol.
+                            if symbol in self._asserts_generated_unbacked_symbols:
+                                return call_backs, messages
+                            # We only care about unbacked symints for these inline
+                            # constraints, which are prefixed with 'u'
+                            # A symbol missing from range_constraints raises
+                            # KeyError on the next line.
+                            constraint = self.range_constraints[symbol]
+                            min_val, max_val = _convert_range_to_int(constraint)
+                            assert_msg = f" is outside of inline constraint [{min_val}, {max_val}]."
+                            call_backs.append(
+                                partial(
+                                    self._assert_range_constraint,
+                                    lower=min_val,
+                                    upper=max_val,
+                                )
+                            )
+                            messages.append(assert_msg)
+                            self._asserts_generated_unbacked_symbols.add(symbol)
+                    elif isinstance(val, torch.Tensor):
+                        # Recurse into each dimension; wrap every resulting
+                        # callback so it first materializes sym_size(node, i).
+                        for i, sym in enumerate(val.shape):
+                            cbs, msgs = add_assertions(sym)
+                            for cb, msg in zip(cbs, msgs):
+                                # NOTE(review): `cb` is read from the enclosing
+                                # scope when sym_size_cb runs, not when it is
+                                # defined. If several dims of one tensor yield
+                                # callbacks, every closure sees the last `cb`
+                                # assigned -- confirm this is intended.
+                                def sym_size_cb(node, assert_msg, dim):
+                                    with node.graph.inserting_after(node):
+                                        dim_node = module.graph.call_function(
+                                            torch.ops.aten.sym_size.int,
+                                            (node, dim),
+                                            {},
+                                        )
+                                    cb(node=dim_node, assert_msg=assert_msg)
+                                call_backs.append(partial(sym_size_cb, dim=i))
+                                messages.append(f".shape[{i}]" + msg)
+                    return call_backs, messages
+                callbacks, messages = add_assertions(val)
+                # The full message is the node's string form followed by the
+                # accumulated suffix.
+                for cb, msg in zip(callbacks, messages):
+                    cb(node=node, assert_msg=f"{node}" + msg)
+            module.recompile()
+        # Sometimes this pass would return a wrong graph where we have mismatched
+        # node names in signature. Before we fix it, let's just skip it.
+        if (
+            self.counter == 0
+            and type(self) is _AddRuntimeAssertionsForInlineConstraintsPass
+        ):
+            return PassResult(graph_module, False)
+        # Populate the stack trace with dummy vals to respect IR
+        # (only nodes of the top-level graph are visited here).
+        for node in graph_module.graph.nodes:
+            if not node.meta.get("stack_trace", None) and node.op not in [
+                "placeholder",
+                "output",
+            ]:
+                node.meta["stack_trace"] = "".join(traceback.format_stack(limit=1))
+        return PassResult(graph_module, True)
+def _get_existing_inline_assertions(
+    graph_module: torch.fx.GraphModule,
+    range_constraints: dict[sympy.Symbol, ValueRanges],
+) -> dict[sympy.Symbol, ValueRanges]:
+    """
+    Collect the bounds already enforced by `_assert_scalar` nodes.
+    Scans `graph_module` and its GraphModule submodules for
+    `_assert_scalar.default` nodes whose first argument is an
+    `operator.le` / `operator.ge` comparison between a symbol and an int, and
+    returns, per symbol, the intersection of all bounds found.
+    Raises:
+        RuntimeError: if such a symbol is missing from `range_constraints`.
+    """
+    existing_inline_assertions: dict[sympy.Symbol, ValueRanges] = {}
+    for module in graph_module.modules():
+        if not isinstance(module, torch.fx.GraphModule):
+            continue
+        # Find all the existing inline assertions. They will look something like:
+        # %_local_scalar_dense = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%arg1_1,), kwargs = {})
+        # %ge = call_function[target=operator.ge](args = (%_local_scalar_dense, 0), kwargs = {})
+        # %_assert_scalar = call_function[target=torch.ops.aten._assert_scalar.default](args = (%scalar_tensor, "..."), kwargs = {})
+        for node in module.graph.nodes:
+            if node.target != torch.ops.aten._assert_scalar.default:
+                continue
+            compare_arg = node.args[0]
+            # Only binary le/ge comparisons produced by call_function nodes
+            # are recognized.
+            if not (
+                isinstance(compare_arg, torch.fx.Node)
+                and compare_arg.op == "call_function"
+                and compare_arg.target in (operator.le, operator.ge)
+                and len(compare_arg.args) == 2
+            ):
+                continue
+            compare_op = compare_arg.target
+            lhs, rhs = compare_arg.args
+            def maybe_get_symint(x):
+                # Map a Node whose "val" is a SymInt to its sympy expression;
+                # anything else is returned unchanged.
+                if (
+                    isinstance(x, torch.fx.Node)
+                    and "val" in x.meta
+                    and isinstance(x.meta["val"], torch.SymInt)
+                ):
+                    return x.meta["val"].node.expr
+                return x
+            lhs = maybe_get_symint(lhs)
+            rhs = maybe_get_symint(rhs)
+            # Normalize `a >= b` to `b <= a` so that from here on the
+            # relation is always lhs <= rhs.
+            if compare_op is operator.ge:
+                lhs, rhs = rhs, lhs
+            if isinstance(lhs, sympy.Symbol) and isinstance(rhs, int):
+                symint = lhs
+                scalar = rhs
+            elif isinstance(rhs, sympy.Symbol) and isinstance(lhs, int):
+                symint = rhs
+                scalar = lhs
+            else:
+                continue
+            if symint not in range_constraints:
+                raise RuntimeError(
+                    f"Unable to find symint {symint} in {range_constraints}"
+                )
+            previous_range = existing_inline_assertions.get(
+                symint, ValueRanges(-math.inf, math.inf)
+            )
+            # symbol <= scalar gives an upper bound; scalar <= symbol gives a
+            # lower bound.
+            if symint is lhs:
+                bounds = ValueRanges(-math.inf, scalar)
+            else:
+                bounds = ValueRanges(scalar, math.inf)
+            existing_inline_assertions[symint] = previous_range & bounds
+    return existing_inline_assertions

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/collect_tracepoints_pass.py ADDED Viewed

	@@ -0,0 +1,146 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import operator
+from typing import TYPE_CHECKING
+import torch
+from torch.export.exported_program import ConstantArgument, TensorArgument
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+if TYPE_CHECKING:
+    from torch.export.exported_program import ModuleCallSignature
+    from torch.export.graph_signature import ExportGraphSignature
+# Only CollectTracepointsPass is exported from this module.
+__all__ = ["CollectTracepointsPass"]
+class CollectTracepointsPass(PassBase):
+    """
+    Consumes `_export_tracepoint` marker nodes: records their arguments as
+    inputs/outputs in the module call signatures held in `specs`, redirects
+    the markers' getitem users to the original arguments, and erases the
+    markers from the graph.
+    """
+    def __init__(
+        self, specs: dict[str, ModuleCallSignature], sig: ExportGraphSignature
+    ) -> None:
+        super().__init__()
+        # Per-module call signatures; their inputs/outputs lists are appended
+        # to in call().
+        self.specs = specs
+        # Graph signature whose references are renamed when getitem users are
+        # replaced.
+        self.sig = sig
+    def call(self, gm: torch.fx.GraphModule) -> PassResult | None:
+        def get_arg_spec(arg) -> TensorArgument | ConstantArgument:
+            # Nodes must carry a Tensor "val"; non-Node values become
+            # unnamed constants.
+            if isinstance(arg, torch.fx.Node):
+                if isinstance(arg.meta.get("val"), torch.Tensor):
+                    return TensorArgument(name=arg.name)
+                else:
+                    raise AssertionError(
+                        "Symint input is not implemented yet for submodule call signature."
+                    )
+            else:
+                return ConstantArgument(name="", value=arg)
+        for module in gm.modules():
+            if not isinstance(module, torch.fx.GraphModule):
+                continue
+            # Forward sweep: after a module_call_outputs tracepoint, the
+            # following call_function nodes whose nn_module_stack equals the
+            # tracepoint's have their last stack entry popped (in place),
+            # until a node with a different stack or a module_call_inputs
+            # tracepoint is reached.
+            nn_module_stack = None
+            for node in module.graph.nodes:
+                if node.op != "call_function":
+                    continue
+                if node.target is torch.ops.higher_order._export_tracepoint:
+                    kind = node.kwargs["kind"]
+                    if kind == "module_call_outputs":
+                        nn_module_stack = node.meta["nn_module_stack"]
+                    elif kind == "module_call_inputs":
+                        nn_module_stack = None
+                    else:
+                        raise AssertionError(f"Unknown tracepoint kind: {kind}")
+                elif node.meta["nn_module_stack"] == nn_module_stack:
+                    node.meta["nn_module_stack"].popitem()
+                else:
+                    nn_module_stack = None
+            # Backward sweep: the mirror image of the above, started by
+            # module_call_inputs tracepoints and walking the graph in reverse.
+            nn_module_stack = None
+            for node in reversed(module.graph.nodes):
+                if node.op != "call_function":
+                    continue
+                if node.target is torch.ops.higher_order._export_tracepoint:
+                    kind = node.kwargs["kind"]
+                    if kind == "module_call_inputs":
+                        nn_module_stack = node.meta["nn_module_stack"]
+                    elif kind == "module_call_outputs":
+                        nn_module_stack = None
+                    else:
+                        raise AssertionError(f"Unknown tracepoint kind: {kind}")
+                elif node.meta["nn_module_stack"] == nn_module_stack:
+                    node.meta["nn_module_stack"].popitem()
+                else:
+                    nn_module_stack = None
+        def copy_sig(sig) -> ModuleCallSignature:
+            # New signature sharing in_spec/out_spec with `sig` but with
+            # empty inputs/outputs and no forward_arg_names.
+            from torch.export.exported_program import ModuleCallSignature
+            return ModuleCallSignature(
+                inputs=[],
+                outputs=[],
+                in_spec=sig.in_spec,
+                out_spec=sig.out_spec,
+                forward_arg_names=None,
+            )
+        for module in gm.modules():
+            if not isinstance(module, torch.fx.GraphModule):
+                continue
+            for node in module.graph.nodes:
+                if node.op != "call_function":
+                    continue
+                if node.target is torch.ops.higher_order._export_tracepoint:
+                    # There's some subtlety worth noting. Here fqn corresponds to
+                    # the call name, whereas path corresponds to the module name.
+                    # They are not necessarily the same! When a submodule is shared
+                    # through different aliases, there are as many _export_tracepoint
+                    # markers as there are aliases, since the shared submodule is
+                    # wrapped once for each alias.
+                    path = node.kwargs["path"]
+                    # Both values come from the last entry of nn_module_stack.
+                    fqn, _ = next(reversed(node.meta["nn_module_stack"].values()))
+                    module_key = next(reversed(node.meta["nn_module_stack"]))
+                    # A key containing "@" contributes its last "@"-separated
+                    # part as a suffix to both path and fqn; the suffixed fqn
+                    # gets its own signature, created on first sight.
+                    if "@" in module_key:
+                        suffix = module_key.split("@")[-1]
+                        path = f"{path}@{suffix}"
+                        call_fqn = f"{fqn}@{suffix}"
+                        if call_fqn not in self.specs:
+                            self.specs[call_fqn] = copy_sig(self.specs[fqn])
+                        fqn = call_fqn
+                    kind = node.kwargs["kind"]
+                    for i, arg in enumerate(node.args):
+                        # We only update the signature of the alias used to call
+                        # the submodule. Otherwise the signatures of all aliases
+                        # would get conflated; the inputs/outputs of every call
+                        # would be recorded in every other call as well.
+                        if fqn == path:
+                            if kind == "module_call_inputs":
+                                self.specs[path].inputs.append(get_arg_spec(arg))
+                            elif kind == "module_call_outputs":
+                                self.specs[path].outputs.append(get_arg_spec(arg))
+                            else:
+                                raise AssertionError(f"Unknown tracepoint kind: {kind}")
+                        # Redirect the getitem user that extracts element i of
+                        # the tracepoint to the original argument node.
+                        if isinstance(arg, torch.fx.Node):
+                            for user in node.users:
+                                assert user.op == "call_function"
+                                assert user.target is operator.getitem
+                                assert isinstance(user.args[1], int)
+                                if user.args[1] == i:
+                                    user.replace_all_uses_with(arg)
+                                    self.sig.replace_all_uses(user.name, arg.name)
+                                    break
+                    # All getitem users must be dead by now; erase them and
+                    # then the tracepoint itself.
+                    # NOTE(review): erasure goes through gm.graph although the
+                    # node was found in module.graph -- confirm this is
+                    # intended when module is not gm.
+                    users = list(node.users)
+                    for user in users:
+                        assert len(user.users) == 0
+                        gm.graph.erase_node(user)
+                    gm.graph.erase_node(node)
+            # NOTE(review): this return sits inside the `for module` loop, so
+            # only the first GraphModule yielded by gm.modules() is processed
+            # by this third loop -- confirm this is intended.
+            return PassResult(gm, True)
+        # Reached only when gm.modules() yields no GraphModule.
+        return None

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/constant_folding.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# mypy: allow-untyped-defs
+import collections
+from collections import defaultdict
+from collections.abc import Callable
+from typing import Any, Optional
+import torch
+import torch.utils._pytree as pytree
+# Short alias for the aten operator namespace.
+aten = torch.ops.aten
+# We would like to split modules into two subgraphs for runtime weight updates to work correctly.
+# The use case and more information could be found at:
+# https://docs.google.com/document/d/1inZC-8KarJ6gKB7G9egmYLx1V_dKX_apxon0w4zPC0Q/edit?usp=sharing
+# NOTE(review): presumably META_TAG is the node.meta key and the other two
+# are its possible values; their use is not visible in this part of the file.
+META_TAG = "MODULE_TYPE"
+MODULE_TAG = "_MAIN_MODULE"
+CONST_MODULE_TAG = "_CONST_MODULE"
+def replace_node_with_constant(gm, node, constant, name=None):
+    """
+    Replace `node` in `gm.graph` with a get_attr node that reads `constant`.
+    Args:
+        gm: graph module that owns `node`; the constant is stored on it.
+        node: node to replace; its uses are redirected, its meta is copied to
+            the new node, and it is then erased.
+        constant: value registered on `gm` under the chosen attribute name.
+        name: attribute name to use; when falsy, the first unused
+            `_frozen_param{i}` name is picked.
+    """
+    g = gm.graph
+    if name:
+        qualname = name
+    else:
+        # gm._frozen_param_count remembers where to resume the search so that
+        # earlier indices are not rescanned on every call.
+        if not hasattr(gm, "_frozen_param_count"):
+            gm._frozen_param_count = 0
+        i = gm._frozen_param_count
+        while True:
+            qualname = f"_frozen_param{i}"
+            if not hasattr(gm, qualname):
+                break
+            i += 1
+        gm._frozen_param_count = i + 1
+    with g.inserting_before(node):
+        new_input_node = g.create_node("get_attr", qualname, (), {})
+        node.replace_all_uses_with(new_input_node)
+        new_input_node.meta.update(node.meta)
+        g.erase_node(node)
+    # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
+    gm.register_buffer(qualname, constant)
+    setattr(gm, qualname, constant)
+class ConstantFolder(torch.fx.Interpreter):
+    def __init__(
+        self,
+        gm: torch.fx.GraphModule,
+        skip_constructors: bool = False,
+    ):
+        super().__init__(gm)
+        self.node_replacements: dict[torch.fx.Node, Any] = {}
+        self.replaced_uses: dict[torch.fx.Node, int] = collections.Counter()
+        self.unknown_value = object()
+        self.skip_constructors: bool = skip_constructors
+        # overwrite this to deallocate env values if their only remaining use
+        # is the output
+        self.user_to_last_uses = self.node_to_last_non_output_use()
+    def is_impure(self, node: torch.fx.Node) -> bool:
+        if (
+            node.target is torch.ops.prims.convert_element_type.default
+            and node.args[0].op == "get_attr"  # type: ignore[union-attr]
+            and node.args[0].meta["val"].dtype == torch.int8  # type: ignore[union-attr]
+            and node.args[1] == torch.bfloat16
+        ):
+            # For int8_weight -> dq -> bf16_weight
+            return True
+        if node.target in [
+            torch.ops.quantized_decomposed.dequantize_per_channel.default,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+            torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+            torch.ops.pt2e_quant.dequantize_affine,
+        ]:
+            # For the pattern fp32_weight -> q -> dq
+            # We only folding fp32_weight -> q
+            # int8_weight and leave dq in graph to be fused
+            return True
+        return False
+    def node_to_last_non_output_use(self):
+        last_non_output_use = collections.defaultdict(list)
+        seen_uses = set()
+        output_node = next(iter(reversed(self.module.graph.nodes)))  # type: ignore[arg-type, union-attr]
+        for node in reversed(self.module.graph.nodes):  # type: ignore[arg-type, union-attr]
+            if node.target == "output":
+                continue
+            def add_use(inp):
+                if inp in seen_uses:
+                    return
+                seen_uses.add(inp)
+                last_non_output_use[node].append(inp)
+            # In-place is fine since we don't mutate
+            pytree.tree_map_only_(torch.fx.Node, add_use, (node.args, node.kwargs))
+            # if this node is only used in output, we want to gc it right away
+            if len(node.users) == 1 and output_node in node.users:
+                last_non_output_use[node].append(node)
+        return last_non_output_use
+    def run_node(self, node):
+        """Evaluate a single node, returning ``self.unknown_value`` when it cannot be folded.
+        Tensor results of non-``get_attr`` nodes that pass every check below are
+        recorded through ``add_node_replacement``; the computed value is returned
+        so downstream nodes can consume it.
+        """
+        if node.target == "output":
+            # because we remove nodes from env on last non output use,
+            # re-define them now or we'll get error in interpreter
+            def set_env(arg):
+                self.env[arg] = self.unknown_value
+            # In-place is fine since we don't mutate
+            pytree.tree_map_only_(torch.fx.Node, set_env, node.args)
+            return super().run_node(node)
+        args, kwargs = self.fetch_args_kwargs_from_env(node)
+        flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)
+        # We need to do this weird thing because in cases where flattened_inputs
+        # contains a ScriptObject, equality checking results in a type error if
+        # the types are different.
+        # Any unknown input makes the result of this node unknown as well.
+        if any(
+            type(self.unknown_value) is type(input_) and self.unknown_value == input_
+            for input_ in flattened_inputs
+        ):
+            return self.unknown_value
+        # TODO - fix errors with this
+        if (
+            node.op == "call_function"
+            and node.target is aten._efficientzerotensor.default
+        ):
+            return self.unknown_value
+        # TODO - constant folding triton kernel returns the inputs -- fix this
+        if (
+            node.op == "call_function"
+            and node.name == "triton_kernel_wrapper_functional_proxy"
+        ):
+            return self.unknown_value
+        # skip constructors, since inductor generates optimal code for them already
+        # and turning into tensor would result in an additional global memory read
+        # TODO - more complicated strategy
+        if (
+            self.skip_constructors
+            and node.op != "get_attr"
+            and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
+        ):
+            return self.unknown_value
+        # All mutations should either be removed or on inputs which we did not make constant
+        # Ops tagged nondeterministic_seeded are never folded.
+        if (
+            isinstance(node.target, torch._ops.OpOverload)
+            and torch.Tag.nondeterministic_seeded in node.target.tags
+        ):
+            return self.unknown_value
+        out = super().run_node(node)
+        if node.op != "get_attr" and isinstance(out, torch.Tensor):
+            # Meta tensors are passed along but never recorded as replacements.
+            if out.device.type == "meta":
+                return out
+            if not self.insertable_tensor_check(out):
+                return out
+            # NOTE(review): is_impure is defined outside this view; its exact criteria are not visible here.
+            if self.is_impure(node):
+                return self.unknown_value
+            self.add_node_replacement(node, out)
+            # Count one replaced use for every node feeding this (now folded) node.
+            flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+            for n in flattened_node_inps:
+                if not isinstance(n, torch.fx.Node):
+                    continue
+                self.replaced_uses[n] += 1
+            # Once every user of an input has been replaced, the input's own
+            # replacement entry is dropped.
+            for to_delete in self.user_to_last_uses.get(node, []):
+                if self.replaced_uses[to_delete] == len(to_delete.users):
+                    self.node_replacements.pop(to_delete, None)
+        return out
+    def insertable_tensor_check(self, tensor: torch.Tensor) -> bool:
+        """Return whether ``tensor`` may be recorded as a node replacement.
+        This implementation accepts every tensor; ``run_node`` skips recording
+        when the result is False (presumably an override point for subclasses).
+        """
+        return True
+    def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None:
+        """Record ``tensor`` as the folded constant value for ``node``."""
+        self.node_replacements[node] = tensor
+    def run(self):  # type: ignore[override]
+        env = {}
+        for n in self.module.graph.find_nodes(op="placeholder"):  # type: ignore[operator, union-attr]
+            env[n] = self.unknown_value
+        return super().run(initial_env=env)
+def constant_fold(
+    gm: torch.fx.GraphModule,
+    constraint_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+):
+    with torch.utils._python_dispatch._disable_current_modes():
+        cf = ConstantFolder(gm, skip_constructors=True)
+        cf.run()
+        for node, constant in cf.node_replacements.items():
+            if constraint_fn is not None and not constraint_fn(node):
+                continue
+            replace_node_with_constant(gm, node, constant)
+        erased_params = []
+        # Get all attr users by looking up the graph instead from node.users, because in this case
+        # _tensor_constant0 and _tensor_constant0_1 are actually refereing to the same tensor.
+        #     opcode         name                 target            args                         kwargs
+        # -------------  -------------------  ----------------  ---------------------------  --------
+        # placeholder    arg0_1               arg0              ()                           {}
+        # get_attr       _tensor_constant0    state             ()                           {}
+        # call_function  add                  aten.add.Tensor   (arg0_1, _tensor_constant0)  {}
+        # get_attr       _tensor_constant0_1  state             ()                           {}
+        # call_function  add_                 aten.add_.Tensor  (_tensor_constant0_1, 1)     {}
+        # output         output               output            ([add],)                     {}
+        get_attr_node_users = defaultdict(list)
+        for node in gm.graph.nodes:
+            if node.op == "get_attr":
+                get_attr_node_users[node.target].extend(node.users.keys())
+        for node in gm.graph.find_nodes(op="get_attr"):
+            if node.op == "get_attr" and len(get_attr_node_users[node.target]) == 0:
+                if hasattr(gm, node.target):
+                    delattr(gm, node.target)
+                erased_params.append(node)
+        for node in erased_params:
+            gm.graph.erase_node(node)
+        gm.graph.eliminate_dead_code()
+        gm.graph.lint()
+        gm.recompile()
+def constant_graph_tag(gm: torch.fx.GraphModule) -> None:
+    with torch.utils._python_dispatch._disable_current_modes():
+        cf = ConstantFolder(gm, skip_constructors=True)
+        cf.run()
+        for node in gm.graph.nodes:
+            if (
+                node.op == "get_attr"
+                or node in cf.node_replacements
+                or node in cf.replaced_uses
+            ):
+                node.meta[META_TAG] = CONST_MODULE_TAG
+            else:
+                node.meta[META_TAG] = MODULE_TAG
+def run_and_get_constant_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    Construct a GraphModule which corresponds to the part which could be
+    constant folded in provided gm.
+    """
+    constant_graph_tag(gm)
+    # We rewrite the tags, if it's a constant being directly consumed, without
+    # any folding opportunity, we keep it in main gm.
+    for node in gm.graph.find_nodes(op="get_attr"):
+        used_to_fold = False
+        for u in node.users:
+            if u.meta[META_TAG] == CONST_MODULE_TAG:
+                used_to_fold = True
+                break
+        if not used_to_fold:
+            node.meta[META_TAG] = MODULE_TAG
+    new_graph = torch.fx.Graph()
+    node_remapping: dict[torch.fx.Node, torch.fx.Node] = {}
+    output_nodes = []
+    for node in gm.graph.nodes:
+        if node.meta[META_TAG] == MODULE_TAG:
+            continue
+        new_node = new_graph.node_copy(node, lambda x: node_remapping[x])
+        node_remapping[node] = new_node
+        for user in node.users:
+            if user.meta[META_TAG] == MODULE_TAG:
+                output_nodes.append(new_node)
+                break
+    new_graph.output(tuple(output_nodes))
+    new_graph.lint()
+    new_gm = torch.fx.GraphModule(gm, new_graph)
+    return new_gm

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import copy
+from typing import Optional
+import torch
+from torch._export.pass_base import (
+    _ExportPassBaseDeprecatedDoNotUse,
+    Argument,
+    PassResult,
+)
+from torch._export.pass_infra.node_metadata import NodeMetadata
+from torch._export.pass_infra.proxy_value import ProxyValue
+from torch._ops import OpOverload
+# Shorthand for the aten operator namespace.
+aten = torch.ops.aten
+# Maps each side-effectful op overload to the functional overload that replaces it.
+_NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS: dict[OpOverload, OpOverload] = {
+    aten.sym_constrain_range.default: aten._functional_sym_constrain_range.default,
+    aten._assert_async.msg: aten._functional_assert_async.msg,
+}
+class _FunctionalizeSideEffectfulOpsPass(_ExportPassBaseDeprecatedDoNotUse):
+    """
+    Functionalize ops with side effect in graph module by replacing the op with
+    functional version of it. A new dependency token (`dep_token`) will be
+    created and propagated through functional ops to output.
+    For example:
+    ```
+    def f(x):
+        sym_constrain_range(x.shape[0], min=1, max=3)
+        return x.add(3)
+    ```
+    Will be transformed to:
+    ```
+    def f(x):
+        dep_token0 = _make_dep_token()
+        dep_token1 = _functional_sym_constrain_range(
+            x.shape[0], min=1, max=3, dep_token=dep_token0
+        )
+        return x.add(3), dep_token1
+    ```
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self._dep_token: Optional[ProxyValue] = None
+        self._next_dep_token_index: Optional[int] = None
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        # Early return if no non-functional assertions.
+        if not any(
+            n.target in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS
+            for n in graph_module.graph.nodes
+        ):
+            return PassResult(graph_module=graph_module, modified=False)
+        gm = copy.deepcopy(graph_module)
+        self._dep_token = None
+        self._next_dep_token_index = None
+        return super().call(gm)
+    def call_operator(
+        self,
+        op: OpOverload,
+        args: tuple[Argument, ...],
+        kwargs: dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op not in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS:
+            return super().call_operator(op, args, kwargs, meta)
+        if self._dep_token is None:
+            self._dep_token = super().call_operator(
+                aten._make_dep_token,
+                args=(),
+                kwargs={},
+                meta=self._create_dummy_node_metadata(),
+            )
+            self._dep_token.node.name = "dep_token0"
+            self._next_dep_token_index = 1
+        self._dep_token = super().call_operator(
+            _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS[op],
+            args=args,
+            kwargs={**kwargs, "dep_token": self._dep_token},
+            meta=meta,
+        )
+        assert self._next_dep_token_index is not None
+        self._dep_token.node.name = f"dep_token{self._next_dep_token_index}"
+        self._next_dep_token_index += 1
+        return self._dep_token
+    def output(self, results: list[Argument], meta: NodeMetadata) -> ProxyValue:
+        assert self._dep_token is not None
+        return super().output(results=(*results, self._dep_token), meta=meta)  # type: ignore[arg-type]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/insert_custom_op_guards.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import functools
+from collections import defaultdict
+import torch
+from torch._export.passes._node_metadata_hook import (
+    _node_metadata_hook,
+    _set_node_metadata_hook,
+)
+from torch._library.fake_profile import OpProfile, TensorMetadata
+def insert_custom_op_guards(gm: torch.fx.GraphModule, ops_to_guard: set[str]) -> None:
+    """
+    This is used by draft_export to insert guards in front of calls to custom
+    operators which have a generated fake kernel.
+    """
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and str(node.target) in ops_to_guard:
+            with (
+                _set_node_metadata_hook(
+                    gm,
+                    functools.partial(
+                        _node_metadata_hook,
+                        metadata={"stack_trace": node.meta.get("stack_trace")},
+                    ),
+                ),
+                gm.graph.inserting_before(node),
+            ):
+                for arg in (*node.args, *node.kwargs.values()):
+                    if isinstance(arg, torch.fx.Node) and isinstance(
+                        arg.meta.get("val"), torch.Tensor
+                    ):
+                        val = arg.meta["val"]
+                        gm.graph.call_function(
+                            torch.ops.aten._assert_tensor_metadata.default,
+                            args=(arg,),
+                            kwargs={
+                                "dtype": val.dtype,
+                                "device": val.device,
+                                "layout": val.layout,
+                            },
+                        )
+    gm.recompile()
+def get_op_profiles(
+    gm: torch.fx.GraphModule, ops_to_guard: set[str]
+) -> dict[str, set[OpProfile]]:
+    """
+    This is used by draft_export to get a list of custom operator profiles so
+    that we can generate fake kernels.
+    """
+    def _get_op_profile(node: torch.fx.Node) -> OpProfile:
+        args_profile = tuple(
+            TensorMetadata.maybe_from_tensor(arg.meta.get("val"))
+            if isinstance(arg, torch.fx.Node)
+            else None
+            for arg in (*node.args, *node.kwargs.values())
+        )
+        out_profile = None
+        meta = node.meta.get("val")
+        assert meta is not None
+        if isinstance(meta, torch.Tensor):
+            out_profile = TensorMetadata.maybe_from_tensor(meta)
+        elif isinstance(meta, (list, tuple)):
+            out_profile = tuple(TensorMetadata.maybe_from_tensor(m) for m in meta)  # type: ignore[assignment]
+        assert out_profile is not None
+        return OpProfile(args_profile, out_profile)  # type: ignore[arg-type]
+    op_profiles: dict[str, set[OpProfile]] = defaultdict(set)
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and str(node.target) in ops_to_guard:
+            op_profiles[str(node.target)].add(_get_op_profile(node))
+    return op_profiles

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/lift_constants_pass.py ADDED Viewed

	@@ -0,0 +1,417 @@

+# mypy: allow-untyped-defs
+import collections
+import logging
+from typing import Any, Optional, Union
+import torch
+from torch._export.verifier import SpecViolationError
+from torch._guards import detect_fake_mode
+from torch._library.fake_class_registry import FakeScriptObject
+from torch._library.opaque_object import is_opaque_reference_type
+from torch._subclasses.fake_tensor import unset_fake_temporarily
+from torch.export.exported_program import (
+    ArgumentSpec,
+    CustomObjArgument,
+    ExportGraphSignature,
+    InputKind,
+    InputSpec,
+    TensorArgument,
+)
+from torch.fx._symbolic_trace import _ConstantAttributeType
+from torch.fx.graph_module import _get_attr
+log = logging.getLogger(__name__)
+class ConstantAttrMap(collections.abc.MutableMapping):
+    """A mapping class that understands how to use module constants (tensors,
+    ScriptObjects, FakeScriptObjects) as keys. We store tensors and FakeScriptObjects normally,
+    but ScriptObjects are stored by hash, because different torch.ScriptObjects can point to
+    the same underlying value (but we guarantee that they will `hash()` to the same value
+    if that's the case).
+    """
+    def __init__(self) -> None:
+        # Underlying dict that we use to implement this mapping.
+        self._constant_attrs: dict[
+            Union[int, torch.Tensor, FakeScriptObject, torch.utils._pytree.TreeSpec],
+            list[Any],
+        ] = {}
+        # Map from the hash(ScriptObject) to the ScriptObject itself. Used for
+        # APIs like `__iter__` that should look like they're returning the
+        # original ScriptObjects.
+        self._script_object_map: dict[int, torch.ScriptObject] = {}
+    def __getitem__(self, key: _ConstantAttributeType) -> Any:
+        real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
+        assert isinstance(real_key, (int, torch.Tensor, FakeScriptObject))
+        return self._constant_attrs[real_key]
+    def __setitem__(self, key: _ConstantAttributeType, value):
+        # we shouldn't actually call this, should go to add() instead to handle aliasing
+        raise NotImplementedError(
+            """Directly setting values for ConstantAttrMap is not supported, please use add(key, value) instead.
+The same key can be mapped to multiple values, for handling constant aliasing."""
+        )
+    def add(self, key: _ConstantAttributeType, value: Any) -> None:
+        if isinstance(key, torch.ScriptObject):
+            if hash(key) not in self._constant_attrs:
+                self._constant_attrs[hash(key)] = []
+            self._constant_attrs[hash(key)].append(value)
+            self._script_object_map[hash(key)] = key
+        elif isinstance(key, (torch.Tensor, FakeScriptObject)):
+            if key not in self._constant_attrs:
+                self._constant_attrs[key] = []
+            self._constant_attrs[key].append(value)
+        else:
+            raise TypeError(
+                f"Expected key to be a tensor or ScriptObject, got {type(key)}"
+            )
+    def __delitem__(self, key: _ConstantAttributeType):
+        real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
+        del self._constant_attrs[real_key]
+    def __iter__(self):
+        for key in self._constant_attrs:
+            if isinstance(key, int):
+                yield self._script_object_map[key]
+            else:
+                yield key
+    def __len__(self):
+        return len(self._constant_attrs)
+    def __contains__(self, key: object) -> bool:
+        real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
+        return real_key in self._constant_attrs
+def get_constant_fqn(node: torch.fx.Node, constant_name: str) -> str:
+    # The FQN of the constant tensor in the state dict should
+    # correspond to the module where the constant tensor was
+    # originally used.
+    if len(node.meta["nn_module_stack"]) == 0:
+        return constant_name
+    parent_fqn = list(node.meta["nn_module_stack"].values())[-1][0]
+    if len(parent_fqn) > 0:
+        return f"{parent_fqn}.{constant_name}"
+    else:
+        return constant_name
+def _get_first_fqn(
+    const_attrs: ConstantAttrMap,
+    key: _ConstantAttributeType,
+) -> Any:
+    fqns = const_attrs.get(key)
+    return fqns[0] if fqns else None
+def _unused_constant(node: torch.fx.Node) -> Optional[list[torch.fx.Node]]:
+    """
+    If there is a tensor constant created while tracing, here is how the graph
+    looks like:
+        %_tensor_constant0 : [num_users=1] = get_attr[target=_tensor_constant0]
+        %lift_fresh_copy : [num_users=1] = call_function[target=torch.ops.aten.lift_fresh_copy.default](args = (%_tensor_constant0,))
+        %detach_ : [num_users=?] = call_function[target=torch.ops.aten.detach_.default](args = (%lift_fresh_copy,))
+    To check to see if the tensor constant is being used, we want to traverse to
+    the detach node to see if it's actually being used.
+    This function returns None if this constant is being used, otherwise it returns the
+    lift_fresh and detach node to be removed later.
+    """  # noqa: B950
+    if len(node.users) > 1:
+        return None
+    lift_fresh_node = next(iter(node.users.keys()))
+    if not (
+        lift_fresh_node.op == "call_function"
+        and lift_fresh_node.target
+        in (
+            torch.ops.aten.lift_fresh.default,
+            torch.ops.aten.lift_fresh_copy.default,
+        )
+    ):
+        return None
+    if len(lift_fresh_node.users) > 1:
+        return None
+    # Case 1: lift node is not used anywhere
+    if len(lift_fresh_node.users) == 0:
+        return [lift_fresh_node, node]
+    detach_node = next(iter(lift_fresh_node.users.keys()))
+    if not (
+        detach_node.op == "call_function"
+        and detach_node.target
+        in (
+            torch.ops.aten.detach_.default,
+            torch.ops.aten.detach.default,
+        )
+    ):
+        return None
+    if len(detach_node.users) > 0:
+        return None
+    else:
+        # Case 2: Lift node's child is not used anywhere
+        return [detach_node, lift_fresh_node, node]
+def lift_constants_pass(
+    gm: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
+    constant_attrs: ConstantAttrMap,
+) -> dict[str, _ConstantAttributeType]:
+    """
+    Takes a graph module, graph signature, and modifies them inplace to lift any
+    constants (tensors or custom classes) as inputs to the graph. Returns a
+    dictionary of names to constants.
+    Arguments:
+        gm (torch.fx.GraphModule): The graph module containing the graph and constants to lift.
+        graph_signature (ExportGraphSignature): This graph signature will be
+            mutated to add additional CONSTANT_TENSOR and CUSTOM_OBJ inputs.
+        constant_attrs (ConstantAttr): A mapping from a constant value to its
+            fully-qualified path in `gm`. This is used to maintain consistent
+            location of constants between the original module and the exported
+            version.
+    Returns:
+        A dictionary of fqn => constant value.
+    """
+    all_constants: dict[str, _ConstantAttributeType] = {}
+    input_specs = graph_signature.input_specs
+    num_custom_obj = sum(
+        input_spec.kind == InputKind.CUSTOM_OBJ for input_spec in input_specs
+    )
+    num_tensor_constants = sum(
+        input_spec.kind == InputKind.CONSTANT_TENSOR for input_spec in input_specs
+    )
+    fake_mode = detect_fake_mode(
+        tuple(node.meta["val"] for node in gm.graph.nodes if node.op == "placeholder")
+    )
+    first_user_input_loc, first_user_input = 0, next(iter(gm.graph.nodes))
+    used_target_names = set()
+    input_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"]
+    assert len(input_nodes) == len(input_specs)
+    for i, (node, input_spec) in enumerate(zip(input_nodes, input_specs)):
+        used_target_names.add(input_spec.target)
+        if input_spec.kind == InputKind.USER_INPUT:
+            first_user_input = node
+            first_user_input_loc = i
+            break
+    lifted_objs = ConstantAttrMap()
+    renamed_targets = {}
+    for node in list(gm.graph.nodes):
+        if node.op == "get_attr":
+            if nodes_to_remove := _unused_constant(node):
+                # Remove the node if it's not being used
+                for node_rm in nodes_to_remove:
+                    gm.graph.erase_node(node_rm)
+                continue
+            constant_val = _get_attr(gm, node.target)
+            # These are not hashable and not gonna be lifted
+            # so we can skip them earlier
+            if isinstance(constant_val, torch.fx.GraphModule):
+                continue
+            if "LoweredBackendModule" in type(constant_val).__name__:
+                continue
+            if "AOTInductorRunnerWrapper" in type(constant_val).__name__:
+                continue
+            if isinstance(constant_val, torch.utils._pytree.TreeSpec):
+                continue
+            if constant_val in lifted_objs:
+                # We already lifted this constant elsewhere. Just rewrite uses
+                # of this get_attr to point to the already-existing placeholder
+                # node.
+                const_placeholder_node = _get_first_fqn(lifted_objs, constant_val)
+                node.replace_all_uses_with(const_placeholder_node)
+                gm.graph.erase_node(node)
+                renamed_targets[node.name] = const_placeholder_node.name
+                continue
+            # For ScriptObject, Tensor and FakeScriptObject constants:
+            # First check if the constant was an attribute on some module by
+            # consulting `constant_attrs` map. If it is, use the fqn that keeps
+            # its location consistent with the eager module.
+            #
+            # If it's not in the `constant_attrs` map, that means it's an inline
+            # constant (e.g. x + torch.tensor(0)), and thus did not have a
+            # specific location in the eager module. In that case, just generate
+            # some name and attach it to the module in which it was used.
+            if isinstance(
+                constant_val, (torch.ScriptObject, FakeScriptObject)
+            ) or is_opaque_reference_type(type(constant_val)):
+                constant_kind = InputKind.CUSTOM_OBJ
+                constant_fqn = _get_first_fqn(constant_attrs, constant_val)
+                if constant_fqn is not None:
+                    constant_name = constant_fqn.replace(".", "_")
+                else:
+                    constant_name = f"lifted_custom_{num_custom_obj}"
+                    constant_fqn = get_constant_fqn(node, constant_name)
+                    while constant_fqn in used_target_names:
+                        num_custom_obj += 1
+                        constant_name = f"lifted_custom_{num_custom_obj}"
+                        constant_fqn = get_constant_fqn(node, constant_name)
+                    num_custom_obj += 1
+            elif isinstance(constant_val, torch.Tensor):
+                # Remove the parameterness of constant_val
+                if isinstance(constant_val, torch.nn.Parameter):
+                    log.debug(
+                        "%s created when tracing %s is a parameter. But "
+                        "it's not registered with register_parameter(). export will treat it as a constant tensor",
+                        str(node.target),
+                        str(node.meta.get("stack_trace", "<unknown stack>")),
+                    )
+                    # We get the real data out of the parameter by disabling the surrounding fake mode.
+                    with unset_fake_temporarily():
+                        constant_val = constant_val.data
+                constant_kind = InputKind.CONSTANT_TENSOR
+                constant_fqn = _get_first_fqn(constant_attrs, constant_val)
+                if constant_fqn is not None:
+                    constant_name = constant_fqn.replace(".", "_")
+                else:
+                    constant_name = f"lifted_tensor_{num_tensor_constants}"
+                    constant_fqn = get_constant_fqn(node, constant_name)
+                    while constant_fqn in used_target_names:
+                        num_tensor_constants += 1
+                        constant_name = f"lifted_tensor_{num_tensor_constants}"
+                        constant_fqn = get_constant_fqn(node, constant_name)
+                    num_tensor_constants += 1
+            else:
+                raise SpecViolationError(
+                    f"getattr node {node} referencing unsupported type {type(constant_val)}"
+                )
+            with gm.graph.inserting_before(first_user_input):
+                # Insert the constant node before the first user input
+                const_placeholder_node = gm.graph.placeholder(constant_name)
+                # match target name with its node name in case there is name collision
+                # and suffix is added to node name in fx
+                const_placeholder_node.target = const_placeholder_node.name
+                for k, v in node.meta.items():
+                    const_placeholder_node.meta[k] = v
+                # Once the FQN has been used, remove nn_module_stack, stack_trace
+                const_placeholder_node.meta.pop("nn_module_stack")
+                const_placeholder_node.meta.pop("stack_trace", None)
+                input_spec_arg: ArgumentSpec
+                if isinstance(constant_val, torch.Tensor):
+                    if fake_mode is not None:
+                        const_placeholder_node.meta["val"] = fake_mode.from_tensor(
+                            constant_val, static_shapes=True
+                        )
+                        const_placeholder_node.meta["val"].constant = constant_val
+                    else:
+                        const_placeholder_node.meta["val"] = constant_val
+                    input_spec_arg = TensorArgument(name=const_placeholder_node.name)
+                elif isinstance(constant_val, torch._C.ScriptObject):
+                    class_fqn = constant_val._type().qualified_name()  # type: ignore[attr-defined]
+                    const_placeholder_node.meta["val"] = CustomObjArgument(
+                        constant_fqn, class_fqn
+                    )
+                    input_spec_arg = CustomObjArgument(
+                        name=const_placeholder_node.name, class_fqn=class_fqn
+                    )
+                elif isinstance(constant_val, FakeScriptObject):
+                    class_fqn = constant_val.script_class_name
+                    const_placeholder_node.meta["val"] = CustomObjArgument(
+                        constant_fqn, class_fqn, constant_val
+                    )
+                    input_spec_arg = CustomObjArgument(
+                        name=const_placeholder_node.name,
+                        class_fqn=class_fqn,
+                        fake_val=constant_val,
+                    )
+                else:
+                    raise SpecViolationError(
+                        f"tried to lift unsupported type {type(constant_val)} from node {node.format_node()}"
+                    )
+                lifted_objs.add(constant_val, const_placeholder_node)
+                node.replace_all_uses_with(const_placeholder_node)
+                gm.graph.erase_node(node)
+                renamed_targets[node.name] = const_placeholder_node.name
+                # Add the constant as a buffer to the graph signature
+                graph_signature.input_specs.insert(
+                    first_user_input_loc,
+                    InputSpec(
+                        kind=constant_kind,
+                        arg=input_spec_arg,
+                        target=constant_fqn,
+                    ),
+                )
+                if constant_val in constant_attrs:
+                    for fqn in constant_attrs[constant_val]:
+                        all_constants[fqn] = constant_val
+                else:
+                    all_constants[constant_fqn] = constant_val
+                first_user_input_loc += 1
+    for spec in graph_signature.output_specs:
+        if spec.arg.name in renamed_targets:
+            spec.arg.name = renamed_targets[spec.arg.name]
+    return all_constants
+def rewrite_script_object_meta(
+    gm: torch.fx.GraphModule,
+) -> dict[str, _ConstantAttributeType]:
+    """When tracing, we produce a graph with FakeScriptObject in the
+    meta["val"].
+    For now, we rewrie meta["val"] to be a placeholder CustomObjArgument
+    """
+    constants: dict[
+        str,
+        _ConstantAttributeType,
+    ] = {}
+    for node in gm.graph.nodes:
+        if "val" not in node.meta:
+            continue
+        old_meta = node.meta["val"]
+        if isinstance(old_meta, torch.ScriptObject):
+            class_fqn = old_meta._type().qualified_name()  # type: ignore[attr-defined]
+            new_meta = CustomObjArgument(node.name, class_fqn)
+            constants[node.name] = old_meta
+            node.meta["val"] = new_meta
+        elif isinstance(old_meta, FakeScriptObject):
+            class_fqn = old_meta.script_class_name  # type: ignore[attr-defined]
+            new_meta = CustomObjArgument(node.name, class_fqn, old_meta)
+            constants[node.name] = old_meta
+            node.meta["val"] = new_meta
+    return constants
+def _materialize_and_lift_constants(
+    gm: torch.fx.GraphModule,
+    export_graph_signature: ExportGraphSignature,
+    constant_attrs: ConstantAttrMap,
+) -> dict[str, _ConstantAttributeType]:
+    constants = rewrite_script_object_meta(gm)
+    constants.update(lift_constants_pass(gm, export_graph_signature, constant_attrs))
+    return constants

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/remove_runtime_assertions.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+class _RemoveRuntimeAssertionsPass(PassBase):
+    """
+    Strip the runtime assertion nodes that
+    _AddRuntimeAssertionsForInlineConstraintsPass inserts into a graph.
+    """
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """
+        Erase every user-less assertion / range-constraint node from each
+        ``torch.fx.GraphModule`` yielded by ``graph_module.modules()``.
+        Returns a ``PassResult`` carrying ``graph_module`` and a flag telling
+        whether at least one node was erased.
+        """
+        changed = False
+        for submodule in graph_module.modules():
+            if not isinstance(submodule, torch.fx.GraphModule):
+                continue
+            for candidate in submodule.graph.nodes:
+                is_runtime_assertion = candidate.target in [
+                    torch.ops.aten._assert_async.msg,
+                    torch.ops.aten._assert_scalar.default,
+                    torch.ops.aten.sym_constrain_range_for_size.default,
+                    torch.ops.aten.sym_constrain_range.default,
+                    torch.ops.aten._assert_tensor_metadata.default,
+                ]
+                if not is_runtime_assertion:
+                    continue
+                # A node whose result is still consumed is left in place.
+                if len(candidate.users) > 0:
+                    continue
+                submodule.graph.erase_node(candidate)
+                # The upstream scalar_tensor <- {le, ge} <- sym_size linear
+                # chain of nodes is removed by the downstream dead code
+                # elimination.
+                changed = True
+        # DCE is deliberately not run here: it could affect nodes that are in
+        # the module_call_graph attribute of the exported program. The pass
+        # caller is expected to call DCE.
+        return PassResult(graph_module, changed)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_autocast_with_hop_pass.py ADDED Viewed

	@@ -0,0 +1,189 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import torch
+from torch._higher_order_ops.wrap import wrap_with_autocast
+from ..utils import node_inline_, nodes_filter, nodes_first, sequential_split
+from .replace_with_hop_pass_util import (
+    _replace_with_hop_helper,
+    _replace_with_hop_pass_helper,
+    _sequential_split_and_maybe_inline_subgraphs_helper,
+)
+if TYPE_CHECKING:
+    from torch.export.graph_signature import ExportGraphSignature
+def _is_autocast_node(node: torch.fx.Node) -> torch.fx.Node | bool:
+    """
+    Tell whether ``node`` is a ``call_function`` node whose target is either
+    ``_enter_autocast`` or ``_exit_autocast``.
+    A falsy ``node`` is handed back unchanged.
+    """
+    if not node:
+        return node
+    autocast_targets = [
+        torch.amp.autocast_mode._enter_autocast,
+        torch.amp.autocast_mode._exit_autocast,
+    ]
+    return node.op == "call_function" and node.target in autocast_targets
+def _is_enter_autocast_node(node: torch.fx.Node) -> torch.fx.Node | bool:
+    """
+    Tell whether ``node`` is a ``call_function`` node targeting
+    ``torch.amp.autocast_mode._enter_autocast``.
+    A falsy ``node`` is handed back unchanged.
+    """
+    if not node:
+        return node
+    if node.op != "call_function":
+        return False
+    return node.target is torch.amp.autocast_mode._enter_autocast
+def _is_exit_autocast_node(node: torch.fx.Node) -> torch.fx.Node | bool:
+    """
+    Tell whether ``node`` is a ``call_function`` node targeting
+    ``torch.amp.autocast_mode._exit_autocast``.
+    A falsy ``node`` is handed back unchanged.
+    """
+    if not node:
+        return node
+    if node.op != "call_function":
+        return False
+    return node.target is torch.amp.autocast_mode._exit_autocast
+def _is_autocast_sub_mod(node: torch.fx.Node) -> bool:
+    """
+    Report whether ``node`` is a ``call_module`` node whose submodule graph
+    begins (after its placeholders) with a call to
+    `torch.amp.autocast_mode._enter_autocast`.
+    """
+    if node.op != "call_module":
+        return False
+    assert isinstance(node.target, str)
+    sub_module = getattr(node.graph.owning_module, node.target)
+    leading_node = nodes_first(
+        sub_module.graph.nodes, lambda candidate: candidate.op != "placeholder"
+    )
+    if not leading_node:
+        return False
+    if leading_node.op != "call_function":
+        return False
+    # TODO: check if current auto-cast type is the same as the args of
+    # _enter_autocast. If so, return False, i.e. do not create a submodule.
+    return leading_node.target is torch.amp.autocast_mode._enter_autocast
+def _check_valid_autocast_block(
+    enter_autocast_node: torch.fx.Node, exit_autocast_node: torch.fx.Node
+) -> None:
+    """
+    Assert that the two nodes form a matching autocast pair: the first is an
+    enter node, the second is an exit node, and the exit node's first argument
+    is that enter node.
+    """
+    is_enter = _is_enter_autocast_node(enter_autocast_node)
+    assert is_enter
+    is_exit = _is_exit_autocast_node(exit_autocast_node)
+    assert is_exit
+    paired_enter = exit_autocast_node.args[0]
+    assert paired_enter == enter_autocast_node
+def _replace_with_hop(node: torch.fx.Node) -> None:
+    """
+    Rewrite the ``call_module`` node ``node`` through ``_replace_with_hop_helper``
+    using ``wrap_with_autocast``, then erase the first and last autocast nodes
+    from the submodule's graph. Does nothing when the submodule graph holds no
+    autocast node.
+    """
+    assert node.op == "call_module"
+    parent_graph: torch.fx.Graph = node.graph
+    assert parent_graph.owning_module is not None
+    parent_gm: torch.fx.GraphModule = parent_graph.owning_module
+    assert isinstance(node.target, str)
+    body_graph = getattr(parent_gm, node.target).graph
+    autocast_nodes = nodes_filter(body_graph.nodes, _is_autocast_node)
+    if len(autocast_nodes) == 0:
+        return
+    # need at least an enter node and an exit node
+    assert len(autocast_nodes) > 1
+    first_enter, last_exit = autocast_nodes[0], autocast_nodes[-1]
+    _check_valid_autocast_block(first_enter, last_exit)
+    _replace_with_hop_helper(node, first_enter, wrap_with_autocast)
+    body_graph.erase_node(last_exit)
+    body_graph.erase_node(first_enter)
+def _split_autocast(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    split_autocast creates a new graph module that splits the input graph module into multiple submodules
+    based on the `_enter_autocast` and `_exit_autocast` nodes. It doesn't mutate the input graph module.
+    Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are split
+    into a submodule. Nested autocast regions are not split.
+    `_enter_autocast` and `_exit_autocast(_enter_autocast)` nodes are in the submodule as well.
+    Below is an example of splitting. A, B, C, D, E are blocks of non-autocast nodes in the original graph
+    module. Nodes marked with the same number are grouped into the same submodule.
+    A               # 0
+    enter_autocast  # 1
+    B               # 1
+    exit_autocast   # 1
+    C               # 2
+    enter_autocast  # 3
+    D               # 3
+    exit_autocast   # 3
+    E               # 4
+    The split itself is delegated to `sequential_split`, driven by a per-node callback
+    that returns True for each node that should start a new submodule.
+    """
+    # Stack of currently open `_enter_autocast` nodes; empty means "outside any autocast region".
+    enter_autocast_node_stack: list[torch.fx.Node] = []
+    # Set when the outer-most region has just been closed, so the next node starts a new submodule.
+    first_node_after_outer_most_exit: bool = False
+    def node_call_back(node: torch.fx.Node) -> bool:
+        nonlocal enter_autocast_node_stack, first_node_after_outer_most_exit
+        increment_id = False
+        # A new submodule starts either right after an outer-most exit or at an
+        # outer-most enter node (an enter node seen while the stack is empty).
+        if first_node_after_outer_most_exit or (
+            len(enter_autocast_node_stack) == 0 and _is_enter_autocast_node(node)
+        ):
+            assert len(enter_autocast_node_stack) == 0
+            first_node_after_outer_most_exit = False
+            increment_id = True
+        if _is_enter_autocast_node(node):
+            enter_autocast_node_stack.append(node)
+        elif _is_exit_autocast_node(node):
+            # Every exit node must close the most recently opened enter node.
+            assert len(enter_autocast_node_stack) > 0
+            last_enter_autocast_node = enter_autocast_node_stack.pop()
+            assert node.args[0] == last_enter_autocast_node
+            if len(enter_autocast_node_stack) == 0:
+                # next node should be in the next submodule since
+                # autocast block ends
+                first_node_after_outer_most_exit = True
+        return increment_id
+    return sequential_split(gm, node_call_back)
+def _sequential_split_and_maybe_inline_subgraphs(
+    gm: torch.fx.GraphModule, graph_signature: ExportGraphSignature | None
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Helper function for replace_autocast_with_hop_pass().
+    When the graph contains no autocast node, ``gm`` and ``graph_signature``
+    are returned untouched. Otherwise the graph module is split into
+    subgraphs around the autocast nodes; each subgraph that starts with
+    `_enter_autocast` is replaced through ``_replace_with_hop`` and every
+    other subgraph is inlined back into the parent graph module.
+    Nodes between `_enter_autocast` and `_exit_autocast(_enter_autocast)` are
+    considered as a subgraph.
+    """
+    if not any(_is_autocast_node(candidate) for candidate in gm.graph.nodes):
+        return gm, graph_signature
+    def _maybe_inline_or_replace_with_hop(node: torch.fx.Node) -> None:
+        if _is_autocast_sub_mod(node):
+            _replace_with_hop(node)
+            return
+        assert node.op == "call_module"
+        assert isinstance(node.target, str)
+        node_inline_(node)
+    # split_autocast returns a new graph module that could have different output
+    # args names. The graph signature is fixed up in
+    # `_sequential_split_and_maybe_inline_subgraphs_helper`.
+    split_gm = _split_autocast(gm)
+    return _sequential_split_and_maybe_inline_subgraphs_helper(
+        split_gm, graph_signature, _maybe_inline_or_replace_with_hop
+    )
+def replace_autocast_with_hop_pass(
+    gm: torch.fx.GraphModule, graph_signature: ExportGraphSignature | None
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Entry point of the autocast pass: hands ``gm`` and ``graph_signature`` to
+    ``_replace_with_hop_pass_helper`` together with
+    `_sequential_split_and_maybe_inline_subgraphs` as the splitting routine,
+    and returns whatever the helper returns.
+    """
+    split_routine = _sequential_split_and_maybe_inline_subgraphs
+    return _replace_with_hop_pass_helper(gm, graph_signature, split_routine)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py ADDED Viewed

	@@ -0,0 +1,676 @@

+# mypy: allow-untyped-defs
+import logging
+import operator
+from typing import Optional, Union
+import torch
+import torch.export._trace
+from torch._ops import OpOverload
+from torch.ao.quantization.fx._decomposed import (
+    dequantize_per_channel,
+    dequantize_per_tensor,
+    quantize_per_tensor,
+)
+from torch.ao.quantization.utils import calculate_qmin_qmax
+from torch.fx.graph_module import _assign_attr
+log = logging.getLogger(__name__)
+# Those values will need to be carried over multiple operators.
+_INPUT_Q_DTYPE: Optional[Union[torch.dtype, torch.fx.Node]] = None
+_SCALE: Optional[Union[float, torch.fx.Node]] = None
+_ZERO_POINT: Optional[Union[float, torch.fx.Node]] = None
+def int_to_valid_dtype(val: Union[int, torch.dtype]) -> torch.dtype:
+    """
+    Turn a dtype enum value into a ``torch.dtype``.
+    Args:
+        val: a key of ``_TORCH_ENUM_TO_DTYPE``, or an existing ``torch.dtype``.
+    Returns:
+        ``val`` itself when it already is a ``torch.dtype`` (no remapping is
+        applied in that case). Otherwise the dtype looked up in
+        ``_TORCH_ENUM_TO_DTYPE``, with ``torch.quint8`` replaced by
+        ``torch.uint8`` and ``torch.qint8`` replaced by ``torch.int8``.
+    """
+    from torch._export.converter import _TORCH_ENUM_TO_DTYPE  # No circular import.
+    # The parameter is annotated as a union because dtypes are passed through as-is.
+    if isinstance(val, torch.dtype):
+        return val
+    dtype = _TORCH_ENUM_TO_DTYPE[val]
+    if dtype == torch.quint8:
+        return torch.uint8
+    elif dtype == torch.qint8:
+        return torch.int8
+    return dtype
+def fx_enum_to_dtype(gm: torch.fx.GraphModule, val: int) -> torch.fx.Node:
+    """
+    Add a ``call_function`` node to ``gm.graph`` that calls
+    ``int_to_valid_dtype`` on ``val``, and return that node.
+    """
+    conversion_args = (val,)
+    dtype_node = gm.graph.call_function(int_to_valid_dtype, conversion_args)
+    return dtype_node
+def insert_quantized_node(
+    gm: torch.fx.GraphModule,
+    val_node: torch.fx.Node,
+    scale_node: Union[float, torch.fx.Node],
+    zero_point_node: Union[float, torch.fx.Node],
+    qmin_node: Union[float, int, torch.fx.Node],
+    qmax_node: Union[float, int, torch.fx.Node],
+    dtype_node: Union[torch.dtype, torch.fx.Node],
+    qscheme: Optional[torch.qscheme],
+) -> torch.fx.Node:
+    """
+    Add a ``quantize_per_tensor`` ``call_function`` node to ``gm.graph`` and
+    return it. The node's arguments are, in order: value, scale, zero point,
+    qmin, qmax and dtype. ``qscheme`` is accepted but not read.
+    """
+    quantize_args = (
+        val_node,
+        scale_node,
+        zero_point_node,
+        qmin_node,
+        qmax_node,
+        dtype_node,
+    )
+    return gm.graph.call_function(quantize_per_tensor, quantize_args)
+def get_dequantized(
+    val: torch.Tensor,
+    scale: Union[float, torch.Tensor],
+    zero_point: Union[float, torch.Tensor],
+    qmin: Union[float, int],
+    qmax: Union[float, int],
+    dtype: torch.dtype,
+    axis: Optional[int],
+    qscheme: Optional[torch.qscheme],
+) -> torch.Tensor:
+    """
+    Eagerly dequantize ``val`` with the decomposed dequantize function that
+    matches ``qscheme``.
+    ``torch.per_tensor_affine`` dispatches to ``dequantize_per_tensor`` and
+    ``torch.per_channel_affine`` to ``dequantize_per_channel`` (the only one
+    that receives ``axis``). Any other scheme raises ``RuntimeError``.
+    """
+    per_tensor = qscheme is torch.per_tensor_affine
+    per_channel = qscheme is torch.per_channel_affine
+    if not per_tensor and not per_channel:
+        raise RuntimeError(f"Unsupported dequantization scheme: {qscheme}")
+    if per_tensor:
+        return dequantize_per_tensor(
+            val,
+            scale,  # type: ignore[arg-type]
+            zero_point,  # type: ignore[arg-type]
+            qmin,  # type: ignore[arg-type]
+            qmax,  # type: ignore[arg-type]
+            dtype,
+        )
+    return dequantize_per_channel(
+        val,
+        scale,  # type: ignore[arg-type]
+        zero_point,  # type: ignore[arg-type]
+        axis,  # type: ignore[arg-type]
+        qmin,  # type: ignore[arg-type]
+        qmax,  # type: ignore[arg-type]
+        dtype,
+    )
+def insert_dequantized_node(
+    gm: torch.fx.GraphModule,
+    val_node: torch.fx.Node,
+    scale_node: Union[float, torch.fx.Node],
+    zero_point_node: Union[float, torch.fx.Node],
+    qmin_node: Union[float, int, torch.fx.Node],
+    qmax_node: Union[float, int, torch.fx.Node],
+    dtype_node: Union[torch.dtype, torch.fx.Node],
+    axis_node: Optional[Union[int, torch.fx.Node]],
+    qscheme: Optional[torch.qscheme],
+) -> torch.fx.Node:
+    """
+    Add a dequantize ``call_function`` node to ``gm.graph`` and return it.
+    ``torch.per_tensor_affine`` targets ``dequantize_per_tensor``;
+    ``torch.per_channel_affine`` targets ``dequantize_per_channel`` and passes
+    ``axis_node`` right after the zero point. Any other scheme raises
+    ``RuntimeError`` without touching the graph.
+    """
+    if qscheme is torch.per_tensor_affine:
+        dequant_fn = dequantize_per_tensor
+        dequant_args = (
+            val_node,
+            scale_node,
+            zero_point_node,
+            qmin_node,
+            qmax_node,
+            dtype_node,
+        )
+    elif qscheme is torch.per_channel_affine:
+        dequant_fn = dequantize_per_channel
+        dequant_args = (
+            val_node,
+            scale_node,
+            zero_point_node,
+            axis_node,
+            qmin_node,
+            qmax_node,
+            dtype_node,
+        )
+    else:
+        raise RuntimeError(f"Unsupported dequantization scheme: {qscheme}")
+    return gm.graph.call_function(dequant_fn, dequant_args)
+def get_qmin_qmax(dtype: torch.dtype) -> tuple[Union[int, float], Union[int, float]]:
+    """
+    Return the quantization range reported by ``calculate_qmin_qmax`` for
+    ``dtype``, called as ``calculate_qmin_qmax(None, None, False, dtype, False)``.
+    """
+    quant_range = calculate_qmin_qmax(None, None, False, dtype, False)  # type: ignore[arg-type]
+    return quant_range
+def insert_qmin_qmax_node(
+    gm: torch.fx.GraphModule, dtype_node: Union[torch.dtype, torch.fx.Node]
+) -> tuple[torch.fx.Node, torch.fx.Node]:
+    """
+    Add three nodes to ``gm.graph``: one ``calculate_qmin_qmax`` call for
+    ``dtype_node`` followed by two ``operator.getitem`` nodes selecting its
+    element 0 and element 1. Returns the two getitem nodes as ``(qmin, qmax)``.
+    """
+    range_args = (None, None, False, dtype_node, False)
+    range_node = gm.graph.call_function(calculate_qmin_qmax, range_args)
+    # Created in index order so the qmin node precedes the qmax node in the graph.
+    qmin_node, qmax_node = [
+        gm.graph.call_function(operator.getitem, (range_node, index))
+        for index in (0, 1)
+    ]
+    return qmin_node, qmax_node
+def get_script_object(
+    gm: torch.nn.Module, node: torch.fx.Node
+) -> torch._C.ScriptObject:
+    """
+    Resolve the ``get_attr`` node ``node`` against ``gm`` by walking its
+    dot-separated target one attribute at a time, and return the object found.
+    Asserts that ``node`` is a ``get_attr`` fx node with a string target and
+    that the resolved object is a ``torch._C.ScriptObject``.
+    """
+    assert isinstance(node, torch.fx.Node)
+    assert node.op == "get_attr"
+    qualified_name = node.target
+    assert isinstance(qualified_name, str)
+    current = gm
+    for part in qualified_name.split("."):
+        current = getattr(current, part)
+    assert isinstance(current, torch._C.ScriptObject)
+    return current
+def insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
+    gm: torch.fx.GraphModule,
+    param_node: torch.fx.Node,
+) -> tuple[torch.fx.Node, Optional[torch.fx.Node]]:
+    """
+    Inline the tensors held by the script object a ``get_attr`` fx node points to.
+    The object is unpacked into a weight and a bias, which are forwarded to
+    ``insert_weight_and_bias_get_attr_node`` under the attribute names
+    ``dequantized_<target>_w`` and ``dequantized_<target>_b``.
+    """
+    packed_params = get_script_object(gm, param_node)
+    w_qtensor, b_qtensor = packed_params.unpack()  # type: ignore[attr-defined]
+    name_prefix = f"dequantized_{param_node.target}"
+    return insert_weight_and_bias_get_attr_node(
+        gm, w_qtensor, b_qtensor, f"{name_prefix}_w", f"{name_prefix}_b"
+    )
+def insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
+    gm: torch.fx.GraphModule,
+    get_attr_to_weight_node: torch.fx.Node,
+    get_attr_to_bias_node: Optional[torch.fx.Node],
+) -> tuple[torch.fx.Node, Optional[torch.fx.Node]]:
+    """
+    Read the weight (and, when given, the bias) that the supplied nodes name
+    as attributes of ``gm`` and forward them to
+    ``insert_weight_and_bias_get_attr_node``.
+    The new attribute names are ``dequantized_<target>_w`` and
+    ``dequantized_<target>_b``; without a bias node, ``None`` and an empty
+    name are forwarded instead.
+    """
+    weight_target = get_attr_to_weight_node.target
+    assert isinstance(weight_target, str)
+    w_qtensor = getattr(gm, weight_target)
+    w_attr_name = f"dequantized_{weight_target}_w"
+    b_qtensor, b_attr_name = None, ""
+    if get_attr_to_bias_node is not None:
+        bias_target = get_attr_to_bias_node.target
+        assert isinstance(bias_target, str)
+        b_qtensor = getattr(gm, bias_target)
+        b_attr_name = f"dequantized_{bias_target}_b"
+    return insert_weight_and_bias_get_attr_node(
+        gm, w_qtensor, b_qtensor, w_attr_name, b_attr_name
+    )
+def insert_weight_and_bias_get_attr_node(
+    gm: torch.fx.GraphModule,
+    w_qtensor: torch.Tensor,
+    b_qtensor: Optional[torch.Tensor],
+    w_attr_name: str,
+    b_attr_name: str,
+) -> tuple[torch.fx.Node, Optional[torch.fx.Node]]:
+    """
+    Store the converted weight (and bias, when present) on ``gm`` and create
+    matching ``get_attr`` nodes in ``gm.graph``.
+    The weight goes through ``get_tensor_from_qtensor`` with its default
+    arguments, the bias with ``dequant=False``. Returns the weight node and
+    the bias node, the latter being ``None`` when ``b_qtensor`` is ``None``.
+    """
+    weight_tensor = get_tensor_from_qtensor(w_qtensor)
+    _assign_attr(weight_tensor, gm, w_attr_name)
+    weight_get_attr = gm.graph.get_attr(w_attr_name)
+    if b_qtensor is None:
+        return weight_get_attr, None
+    bias_tensor = get_tensor_from_qtensor(b_qtensor, dequant=False)
+    _assign_attr(bias_tensor, gm, b_attr_name)
+    bias_get_attr = gm.graph.get_attr(b_attr_name)
+    return weight_get_attr, bias_get_attr
+def get_tensor_from_qtensor(
+    qtensor: torch.Tensor, dequant: bool = True
+) -> torch.Tensor:
+    """
+    Convert ``qtensor`` into a plain tensor.
+    A ``torch.qint8`` / ``torch.quint8`` tensor is first replaced by its
+    ``int_repr()``; any other tensor is used as is. With ``dequant=False``
+    that tensor is returned directly. Otherwise it is dequantized through
+    ``get_dequantized`` using the scale, zero point and (for
+    ``torch.per_channel_affine``) axis read from ``qtensor``.
+    """
+    # Manual conversion because qint8 is not used anymore.
+    uses_legacy_qdtype = qtensor.dtype in [torch.qint8, torch.quint8]
+    tensor = qtensor.int_repr() if uses_legacy_qdtype else qtensor
+    # Weights need dequantization with scaling and zero_point adjustment, but
+    # bias does not need that.
+    if not dequant:
+        return tensor
+    qscheme = qtensor.qscheme()
+    if qscheme == torch.per_channel_affine:
+        scale = qtensor.q_per_channel_scales()
+        zero_point = qtensor.q_per_channel_zero_points()
+        axis = qtensor.q_per_channel_axis()
+    else:
+        scale = qtensor.q_scale()  # type: ignore[assignment]
+        zero_point = qtensor.q_zero_point()  # type: ignore[assignment]
+        axis = None
+    dtype = tensor.dtype
+    qmin, qmax = get_qmin_qmax(dtype)
+    return get_dequantized(
+        tensor, scale, zero_point, qmin, qmax, dtype, axis, qscheme
+    )
+def insert_fused_activation_node(
+    gm: torch.fx.GraphModule, opname: str, fx_node: torch.fx.Node
+) -> torch.fx.Node:
+    """
+    For the ``*_relu`` op names listed below, add a ``torch.ops.aten.relu``
+    node consuming ``fx_node`` to ``gm.graph`` and return it. For every other
+    op name ``fx_node`` is returned unchanged.
+    """
+    relu_fused_ops = ["conv1d_relu", "conv2d_relu", "linear_relu", "add_relu", "mul_relu"]
+    if opname not in relu_fused_ops:
+        return fx_node
+    return gm.graph.call_function(torch.ops.aten.relu, (fx_node,))
+def _conv1d_op_with_squeeze(
+    inp: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    stride: list[int],
+    padding: list[int],
+    dilation: list[int],
+    groups: int,
+) -> torch.Tensor:
+    """
+    Run a conv1d as a conv2d: insert a dimension at index 2 of ``inp``, call
+    ``torch.ops.aten.conv2d`` with the given parameters, then squeeze
+    dimension 2 of the result.
+    """
+    # In quantized version, conv1d is emulated using conv2d with squeeze and unsqueeze
+    # operations before and after the conv2d operation to match the dimension of weights.
+    # Reference: https://github.com/pytorch/pytorch/blob/eca0cb0fbe84bb0a34fa94afe261bceecd52c436/aten/src/ATen/native/quantized/cpu/qconv.cpp#L1827  # noqa: B950
+    expanded_inp = torch.ops.aten.unsqueeze(inp, 2)
+    conv2d_out = torch.ops.aten.conv2d(
+        expanded_inp, weight, bias, stride, padding, dilation, groups
+    )
+    return torch.ops.aten.squeeze(conv2d_out, 2)
+def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
+    """
+    Conv specific transformation function.
+    Adds a standard convolution node for the quantized conv ``node`` to
+    ``gm.graph`` and returns ``(conv_node, scale, zero_point)``, where scale
+    and zero point are ``node.args[2]`` and ``node.args[3]``.
+    ``conv2d`` / ``conv2d_relu`` use ``torch.ops.aten.conv2d``; every other op
+    name uses ``_conv1d_op_with_squeeze``.
+    """
+    assert isinstance(node.target, torch._ops.OpOverload)
+    opname = node.target._opname
+    scale_node = node.args[2]
+    zero_point_node = node.args[3]
+    if opname in ["conv2d", "conv2d_relu"]:
+        op_f = torch.ops.aten.conv2d
+    else:
+        op_f = _conv1d_op_with_squeeze
+    inp_node = node.args[0]
+    param_node = node.args[1]
+    assert isinstance(inp_node, torch.fx.Node)
+    assert isinstance(param_node, torch.fx.Node)
+    if param_node.op != "call_function":
+        # Using ConvPrepackedParam.
+        param = get_script_object(gm, param_node)
+        weight_attr, bias_attr = (
+            insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
+                gm, param_node
+            )
+        )  # type: ignore[assignment]
+        conv_args = (
+            inp_node,
+            weight_attr,
+            bias_attr,
+            param.stride(),  # type: ignore[attr-defined]
+            param.padding(),  # type: ignore[attr-defined]
+            param.dilation(),  # type: ignore[attr-defined]
+            param.groups(),  # type: ignore[attr-defined]
+        )
+        op_res_node = gm.graph.call_function(op_f, conv_args)
+    else:
+        # Using Conv2dPrepackParam from conv_prepack.
+        # We directly skip the packing call and inline weights and bias.
+        w_node = param_node.args[0]
+        b_node = param_node.args[1]
+        assert isinstance(w_node, torch.fx.Node)
+        assert b_node is None or isinstance(b_node, torch.fx.Node)
+        weight_attr, bias_attr = (
+            insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
+                gm, w_node, b_node
+            )
+        )
+        # The remaining prepack arguments are forwarded to the conv call as is.
+        op_res_node = gm.graph.call_function(
+            op_f, (inp_node, weight_attr, bias_attr, *param_node.args[2:])
+        )
+    return op_res_node, scale_node, zero_point_node
+def _transform_linear_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
+    """
+    Linear specific transformation function.
+    Adds a ``torch.ops.aten.linear`` node for the quantized linear ``node`` to
+    ``gm.graph`` and returns ``(linear_node, scale, zero_point)``, where scale
+    and zero point are ``node.args[2]`` and ``node.args[3]``.
+    """
+    scale_node = node.args[2]
+    zero_point_node = node.args[3]
+    inp_node = node.args[0]
+    param_node = node.args[1]
+    assert isinstance(inp_node, torch.fx.Node)
+    assert isinstance(param_node, torch.fx.Node)
+    if param_node.op != "call_function":
+        # Using LinearPackedParams.
+        weight_attr, bias_attr = (
+            insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
+                gm, param_node
+            )
+        )  # type: ignore[assignment]
+        linear_args = (inp_node, weight_attr, bias_attr)
+    else:
+        # Using LinearPrepackParam from linear_prepack.
+        # We directly skip the packing call and inline weights and bias.
+        w_node = param_node.args[0]
+        b_node = param_node.args[1]
+        assert isinstance(w_node, torch.fx.Node)
+        assert b_node is None or isinstance(b_node, torch.fx.Node)
+        weight_attr, bias_attr = (
+            insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
+                gm, w_node, b_node
+            )
+        )
+        linear_args = (inp_node, weight_attr, bias_attr, *param_node.args[2:])
+    op_res_node = gm.graph.call_function(torch.ops.aten.linear, linear_args)
+    return op_res_node, scale_node, zero_point_node
+def _transform_op_where_last_two_arguments_are_scale_and_zero_point(
+    gm: torch.fx.GraphModule, node: torch.fx.Node
+):
+    """
+    Transformation for ops whose last two arguments are the scale and the zero
+    point and whose other arguments need no unpacking.
+    The matching standard op is added to ``gm.graph`` with every argument
+    except the last two. Returns ``(op_node, scale, zero_point)``.
+    """
+    assert isinstance(node.target, torch._ops.OpOverload)
+    opname = node.target._opname
+    call_args = node.args
+    scale_node = call_args[-2]
+    zero_point_node = call_args[-1]
+    # The fused *_relu variants map to the plain op here.
+    standard_op_by_name = {
+        "mul": torch.ops.aten.mul,
+        "mul_relu": torch.ops.aten.mul,
+        "add": torch.ops.aten.add,
+        "add_relu": torch.ops.aten.add,
+        "softmax": torch.ops.aten.softmax,
+        "cat": torch.ops.aten.cat,
+        "hardswish": torch.ops.aten.hardswish,
+    }
+    standard_op = standard_op_by_name[opname]
+    op_res_node = gm.graph.call_function(standard_op, tuple(call_args[:-2]))
+    return op_res_node, scale_node, zero_point_node
+def _transform_scalar_arithmetic(gm: torch.fx.GraphModule, node: torch.fx.Node):
+    """
+    Transform scalar overload for basic arithmetic.
+    Adds the ``aten`` ``Scalar`` overload of ``mul`` or ``add`` to ``gm.graph``
+    with the arguments of ``node`` unchanged, and returns it together with the
+    module-level ``_SCALE`` and ``_ZERO_POINT`` values.
+    """
+    assert isinstance(node.target, torch._ops.OpOverload)
+    scalar_op_by_name = {
+        "mul": torch.ops.aten.mul.Scalar,
+        "add": torch.ops.aten.add.Scalar,
+    }
+    scalar_op = scalar_op_by_name[node.target._opname]
+    op_res_node = gm.graph.call_function(scalar_op, node.args)
+    return op_res_node, _SCALE, _ZERO_POINT
+def _transform_prepacked_op(gm: torch.fx.GraphModule, node: torch.fx.Node):
+    """
+    Transformation for functions under prepacked namespace, where they share
+    the same handling logic that [...]OpContext contains all parameters.
+    ``conv2d_clamp_run`` becomes ``torch.ops.aten.conv2d`` and
+    ``linear_clamp_run`` becomes ``torch.ops.aten.linear``; any other op name
+    raises ``RuntimeError``. Returns the new node only.
+    """
+    assert isinstance(node.target, torch._ops.OpOverload)
+    opname = node.target._opname
+    call_args = node.args
+    is_conv = opname == "conv2d_clamp_run"
+    if is_conv:
+        op_f = torch.ops.aten.conv2d
+    elif opname == "linear_clamp_run":
+        op_f = torch.ops.aten.linear
+    else:
+        raise RuntimeError(f"Invalid operator {opname}")
+    context_node = call_args[1]
+    assert isinstance(context_node, torch.fx.Node)
+    so = get_script_object(gm, context_node)
+    # Input first, then the first two entries unpacked from the op context.
+    func_args = [call_args[0]]
+    func_args.extend(so.unpack()[:2])  # type: ignore[attr-defined]
+    if is_conv:
+        func_args.extend(torch.ops.prepacked.unpack_prepacked_sizes_conv2d(so)[2:])
+    return gm.graph.call_function(op_f, tuple(func_args))
+def _transform_batch_norm(gm: torch.fx.GraphModule, node: torch.fx.Node):
+    """
+    Add a ``torch.ops.aten.native_batch_norm`` node for ``node`` to
+    ``gm.graph``, followed by an ``operator.getitem`` node selecting element 0
+    of its result.
+    The last two arguments of ``node`` are returned as scale and zero point.
+    The third-from-last argument is moved to the end of the new call, after
+    the inserted ``False`` and ``0.1``.
+    Returns ``(getitem_node, scale, zero_point)``.
+    """
+    call_args = node.args
+    scale_node = call_args[-2]
+    zero_point_node = call_args[-1]
+    batch_norm_args = (*call_args[:-3], False, 0.1, call_args[-3])
+    batch_norm_node = gm.graph.call_function(
+        torch.ops.aten.native_batch_norm, batch_norm_args
+    )
+    first_output = gm.graph.call_function(operator.getitem, (batch_norm_node, 0))
+    return first_output, scale_node, zero_point_node
+def fx_transform_quantized_op_to_standard_op(
+    gm: torch.fx.GraphModule, node: torch.fx.Node
+) -> torch.fx.Node:
+    """
+    Replace one quantized op by its standard op followed by a
+    quantize -> dequantize pair, and return the dequantize node.
+    The transform function is picked by the ``<opname>.<overload>`` key of
+    ``node.target``; an unknown key raises ``RuntimeError``. A fused relu is
+    appended for the ``*_relu`` ops. The scale and zero point returned by the
+    transform are written to the module-level ``_SCALE`` / ``_ZERO_POINT``,
+    and the module-level ``_INPUT_Q_DTYPE`` (asserted to be set) supplies the
+    dtype of the inserted quantize and dequantize nodes.
+    """
+    global _SCALE, _ZERO_POINT, _INPUT_Q_DTYPE
+    assert isinstance(node.target, torch._ops.OpOverload)
+    opname = node.target._opname
+    overload = node.target._overloadname
+    key = f"{opname}.{overload}"
+    packed_conv = _transform_conv_with_packedparam
+    packed_linear = _transform_linear_with_packedparam
+    trailing_qparams = _transform_op_where_last_two_arguments_are_scale_and_zero_point
+    transform_by_key = {
+        "conv1d.new": packed_conv,
+        "conv1d_relu.new": packed_conv,
+        "conv1d.default": packed_conv,
+        "conv1d_relu.default": packed_conv,
+        "conv2d.new": packed_conv,
+        "conv2d_relu.new": packed_conv,
+        "conv2d.default": packed_conv,
+        "conv2d_relu.default": packed_conv,
+        "linear.default": packed_linear,
+        "linear_relu.default": packed_linear,
+        "add.default": trailing_qparams,
+        "add_relu.default": trailing_qparams,
+        "mul.default": trailing_qparams,
+        "mul_relu.default": trailing_qparams,
+        "softmax.default": trailing_qparams,
+        "cat.default": trailing_qparams,
+        "hardswish.default": trailing_qparams,
+        "batch_norm2d.default": _transform_batch_norm,
+        "mul.Scalar": _transform_scalar_arithmetic,
+        "add.Scalar": _transform_scalar_arithmetic,
+    }
+    if key not in transform_by_key:
+        raise RuntimeError(f"Unsupported quantized op during transformation: {key}")
+    transform = transform_by_key[key]
+    op_res_node, scale_node, zero_point_node = transform(gm, node)
+    # Add fused activation layer.
+    op_res_node = insert_fused_activation_node(gm, opname, op_res_node)
+    _SCALE = scale_node
+    _ZERO_POINT = zero_point_node
+    assert _INPUT_Q_DTYPE is not None
+    qmin_node, qmax_node = insert_qmin_qmax_node(gm, _INPUT_Q_DTYPE)
+    # Quantize the op result, then dequantize it with the same parameters.
+    q_fx_node = insert_quantized_node(
+        gm,
+        op_res_node,
+        scale_node,
+        zero_point_node,
+        qmin_node,
+        qmax_node,
+        _INPUT_Q_DTYPE,
+        torch.per_tensor_affine,
+    )
+    return insert_dequantized_node(
+        gm,
+        q_fx_node,
+        scale_node,
+        zero_point_node,
+        qmin_node,
+        qmax_node,
+        _INPUT_Q_DTYPE,
+        None,
+        torch.per_tensor_affine,
+    )
+def replace_quantized_ops_with_standard_ops(gm: torch.fx.GraphModule):
+    """
+    Replace legacy quantized ops (aten.quantize_per_tensor, quantized.conv) with
+    PT2 ops (quantize_decomposed.quantize_per_tensor, aten.conv).
+    Before:    x || -> aten.q        || -> quantized.conv2d     || -> quantized.linear    || -> aten.dq || -> y
+    After:     x || -> qd.q -> qd.dq || -> aten.conv2d -> qd.q -> qd.dq || aten.linear -> qd.q -> qd.dq || -> y
+    (qd == quantized_decomposed library, q = quantize, dq = dequantize)
+                                          ^
+                                          |
+                getattr(w), getattr(b) from Conv2dParamPrepack
+    During each iteration, the transformation spits out the transformed operator, its quantized output,
+    and its dequantized value together. We did this because dequantization need to use the
+    scale and zero point parameters from the quantization to recover the approximate original value. After each
+    iteration, the new dequantization node will be used as the input to the next node (e.g., dq2 -> linear).
+    For operators like conv2d and linear, their weights and bias are packed in a quantized format in the ScriptObject.
+    During the transformation, we unpack those objects, get their dequantized tensor, populate those
+    as attributes to the module, and use getattr to access them.
+    One exception in the transformation is conv_prepack and linear_prepack. Those calls pack
+    weight and bias constant tensors into ScriptObject, which are then used by subsequent conv2d or linear calls.
+    During transformation, we directly skip transforming conv_prepack or linear_prepack. We check whether ScriptObject to the
+    quantized::conv2d or linear is from conv_prepack or linear_prepack. If it is, we then inline those parameters
+    to the operator by converting them to a getattr fx.node.
+    For prepacked::conv2d_clamp_run and prepacked::linear_clamp_run, we directly convert them to aten.conv2d and aten.linear
+    without the need of doing de/quantization.
+    Three global variables defined are _INPUT_Q_DTYPE, _SCALE, _ZERO_POINT. _INPUT_Q_DTYPE determines the de/quantization
+    data type, which is the same across the entire program, but it only shows up in the very first quantization
+    call. _SCALE and _ZERO_POINT are used only when operators do not have those specified. E.g., mul.Scalar.
+    """
+    global _INPUT_Q_DTYPE
+    quantized = False
+    last_quantized_node = None
+    # pyrefly: ignore [bad-assignment]
+    for node in gm.graph.nodes:
+        if isinstance(node.target, OpOverload):
+            with gm.graph.inserting_before(node):
+                namespace, opname = node.target.namespace, node.target._opname
+                if namespace == "quantized" and opname not in [
+                    "conv_prepack",
+                    "linear_prepack",
+                ]:
+                    quantized = True
+                    fx_node = fx_transform_quantized_op_to_standard_op(gm, node)
+                    node.replace_all_uses_with(fx_node)
+                    last_quantized_node = fx_node
+                elif namespace == "prepacked":
+                    quantized = True
+                    fx_node = _transform_prepacked_op(gm, node)
+                    node.replace_all_uses_with(fx_node)
+                    last_quantized_node = fx_node
+                elif namespace == "aten" and opname == "quantize_per_tensor":
+                    inp_node, scale_node, zero_point_node, dtype_node = node.args
+                    dtype_node = fx_enum_to_dtype(gm, dtype_node)
+                    _INPUT_Q_DTYPE = dtype_node
+                    qmin_node, qmax_node = insert_qmin_qmax_node(gm, dtype_node)
+                    q_fx_node = insert_quantized_node(
+                        gm,
+                        inp_node,
+                        scale_node,
+                        zero_point_node,
+                        qmin_node,
+                        qmax_node,
+                        dtype_node,
+                        torch.per_tensor_affine,
+                    )
+                    dq_fx_node = insert_dequantized_node(
+                        gm,
+                        q_fx_node,
+                        scale_node,
+                        zero_point_node,
+                        qmin_node,
+                        qmax_node,
+                        dtype_node,
+                        None,
+                        torch.per_tensor_affine,
+                    )
+                    node.replace_all_uses_with(dq_fx_node)
+                    last_quantized_node = dq_fx_node
+                elif namespace == "aten" and opname == "dequantize":
+                    assert last_quantized_node is not None
+                    node.replace_all_uses_with(last_quantized_node)
+                else:
+                    last_quantized_node = node
+    # Post-processing again to remove legacy ScriptObjects and quantizated tensors
+    # stored as attributes or in the buffer. This is used to clean up the GraphModule
+    # to not trigger tracing errors like missing __obj_flatten__ functions.
+    def _clean_attr(mod: torch.nn.Module):
+        for submod in mod.modules():
+            attr_names_to_clean = set()
+            for k, v in submod.__dict__.items():
+                if isinstance(v, torch.ScriptObject):
+                    attr_names_to_clean.add(k)
+                if k == "_buffers":
+                    buffer_name_to_clean = set()
+                    # pyrefly: ignore [missing-attribute]
+                    for b_name, b_value in v.items():
+                        if isinstance(b_value, torch.Tensor) and b_value.dtype in [
+                            torch.qint8,
+                            torch.quint8,
+                        ]:
+                            buffer_name_to_clean.add(b_name)
+                    for b_name in buffer_name_to_clean:
+                        # pyrefly: ignore [missing-attribute]
+                        v.pop(b_name, None)
+            for attr_name in attr_names_to_clean:
+                delattr(submod, attr_name)
+    if quantized:
+        """
+        TODO: SetAttr + quantized ops will result incorrect program. This flag is used to temporarily
+        bypass test cases.
+        The deadcode elimination pass is needed to remove legacy quantized ops. Otherwise, retracing
+        will throw errors. However, the current way of SetAttr does inplace update to attributes, so
+        this pass regard them as dead code and remove them. Below is an example of GraphModule before
+        and after the dead code elimination pass.
+        class GraphModule(torch.nn.Module):
+            def forward(self, x_1):
+                # No stacktrace found for following nodes
+                data = self.data;  data = None
+                data_1 = self.data
+                add_tensor = torch.ops.aten.add.Tensor(data_1, x_1, alpha = 1);  data_1 = None
+                data_2 = self.data
+                copy_ = torch_Tensor_copy_(data_2, add_tensor);  data_2 = add_tensor = copy_ = None
+                data_3 = self.data
+                add_tensor_1 = torch.ops.aten.add.Tensor(x_1, data_3, alpha = 1);  x_1 = data_3 = None
+                return add_tensor_1
+        class GraphModule(torch.nn.Module):
+            def forward(self, x_1):
+                # No stacktrace found for following nodes
+                data_3 = self.data
+                add_tensor_1 = torch.ops.aten.add.Tensor(x_1, data_3, alpha = 1);  x_1 = data_3 = None
+                return add_tensor_1
+        """
+        gm.graph.eliminate_dead_code()
+        _clean_attr(gm)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import torch
+from torch._higher_order_ops.wrap import wrap_with_set_grad_enabled
+from ..utils import node_inline_, nodes_filter, nodes_first, nodes_map, sequential_split
+from .replace_with_hop_pass_util import (
+    _replace_with_hop_helper,
+    _replace_with_hop_pass_helper,
+    _sequential_split_and_maybe_inline_subgraphs_helper,
+)
+if TYPE_CHECKING:
+    from torch.export.graph_signature import ExportGraphSignature
+def _is_set_grad_enabled_node(node: torch.fx.Node) -> torch.fx.Node | bool:
+    return (
+        node
+        and node.op == "call_function"
+        and node.target is torch._C._set_grad_enabled
+    )
+def _is_set_grad_enabled_sub_mod(
+    node: torch.fx.Node, omit_if_same_with_ambient: bool = False
+) -> bool | torch.Tensor:
+    if node.op == "call_module":
+        assert isinstance(node.target, str)
+        subgm = getattr(node.graph.owning_module, node.target)
+        first_non_ph = nodes_first(
+            subgm.graph.nodes, lambda node: node.op != "placeholder"
+        )
+        if (
+            first_non_ph
+            and first_non_ph.op == "call_function"
+            and first_non_ph.target is torch._C._set_grad_enabled
+        ):
+            return (
+                first_non_ph.args[0] != torch.is_grad_enabled()
+                if omit_if_same_with_ambient
+                else True
+            )
+    return False
+def _replace_with_hop(node: torch.fx.Node) -> None:
+    assert node.op == "call_module"
+    graph: torch.fx.Graph = node.graph
+    assert graph.owning_module is not None
+    gm: torch.fx.GraphModule = graph.owning_module
+    assert isinstance(node.target, str)
+    sub_gm = getattr(gm, node.target)
+    sub_graph = sub_gm.graph
+    set_grad_nodes = nodes_filter(sub_graph.nodes, _is_set_grad_enabled_node)
+    if len(set_grad_nodes) > 0:
+        assert len(set_grad_nodes) == 1
+        set_grad_node = set_grad_nodes[0]
+        _replace_with_hop_helper(node, set_grad_node, wrap_with_set_grad_enabled)
+        sub_graph.erase_node(set_grad_node)
+def _remove_set_grad_and_inline(node: torch.fx.Node) -> None:
+    assert node.op == "call_module"
+    graph: torch.fx.Graph = node.graph
+    assert graph.owning_module is not None
+    gm: torch.fx.GraphModule = graph.owning_module
+    assert isinstance(node.target, str)
+    sub_gm = getattr(gm, node.target)
+    sub_graph = sub_gm.graph
+    nodes_map(
+        sub_graph.nodes,
+        lambda n: sub_graph.erase_node(n) if _is_set_grad_enabled_node(n) else n,
+    )
+    node_inline_(node)
+def _sequential_split_and_maybe_inline_subgraphs(
+    gm: torch.fx.GraphModule, graph_signature: ExportGraphSignature | None
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Helper function for replace_set_grad_with_hop_pass().
+    Split the graph module into multiple subgraphs based on the set_grad_enabled nodes.
+    For each subgraph, decides whether to construct a HOO subgraph, or inline the calls
+    back into the parent graph module.
+    """
+    need_replacing = any(_is_set_grad_enabled_node(node) for node in gm.graph.nodes)
+    if not need_replacing:
+        return gm, graph_signature
+    # sequential_split returns a new graph module that could have different output
+    # args names. We need to fix the graph signature.
+    new_gm = sequential_split(gm, _is_set_grad_enabled_node)
+    def _maybe_inline_or_replace_with_hop(node: torch.fx.Node):
+        if _is_set_grad_enabled_sub_mod(node, omit_if_same_with_ambient=True):
+            _replace_with_hop(node)
+        else:
+            _remove_set_grad_and_inline(node)
+    return _sequential_split_and_maybe_inline_subgraphs_helper(
+        new_gm, graph_signature, _maybe_inline_or_replace_with_hop
+    )
+def replace_set_grad_with_hop_pass(
+    gm: torch.fx.GraphModule, graph_signature: ExportGraphSignature | None
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Split gm into sub-graph-modules using `sequential_split_and_maybe_inline_subgraphs`, and
+    then recursively call itself on each of the submodules.
+    """
+    return _replace_with_hop_pass_helper(
+        gm,
+        graph_signature,
+        _sequential_split_and_maybe_inline_subgraphs,
+    )

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# mypy: allow-untyped-defs
+from typing import Optional
+import torch
+from torch._export.error import InternalError
+from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse
+from torch._ops import HigherOrderOperator, OpOverload
+__all__ = ["ReplaceViewOpsWithViewCopyOpsPass"]
+_NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS: dict[OpOverload, OpOverload] = {
+    torch.ops.aten._unsafe_view.default: torch.ops.aten.view_copy.default,
+}
+def is_view_op(schema: torch._C.FunctionSchema) -> bool:
+    if len(schema.arguments) == 0:
+        return False
+    alias_info = schema.arguments[0].alias_info
+    return (alias_info is not None) and (not alias_info.is_write)
+def get_view_copy_of_view_op(schema: torch._C.FunctionSchema) -> Optional[OpOverload]:
+    if is_view_op(schema) and schema.name.startswith("aten::"):
+        view_op_name = schema.name.split("::")[1]
+        view_op_overload = (
+            schema.overload_name if schema.overload_name != "" else "default"
+        )
+        view_copy_op_name = view_op_name + "_copy"
+        if not hasattr(torch.ops.aten, view_copy_op_name):
+            raise InternalError(f"{schema.name} is missing a view_copy variant")
+        view_copy_op_overload_packet = getattr(torch.ops.aten, view_copy_op_name)
+        if not hasattr(view_copy_op_overload_packet, view_op_overload):
+            raise InternalError(f"{schema.name} is missing a view_copy variant")
+        return getattr(view_copy_op_overload_packet, view_op_overload)
+    return None
+class ReplaceViewOpsWithViewCopyOpsPass(_ExportPassBaseDeprecatedDoNotUse):
+    """
+    Our backend expects pure functional operators. For efficiency
+    purposes, we keep view ops around while functionalizing the exported
+    program. This pass replaces view ops with view copy ops for backends that
+    need AOT memory planning.
+    """
+    def call_operator(self, op, args, kwargs, meta):
+        if op in _NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS:
+            return super().call_operator(
+                (_NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS[op]), args, kwargs, meta
+            )
+        if isinstance(op, HigherOrderOperator):
+            return super().call_operator(op, args, kwargs, meta)
+        if view_copy_op := get_view_copy_of_view_op(op._schema):
+            return super().call_operator(view_copy_op, args, kwargs, meta)
+        return super().call_operator(op, args, kwargs, meta)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/passes/replace_with_hop_pass_util.py ADDED Viewed

	@@ -0,0 +1,190 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import contextlib
+import copy
+import operator
+from typing import TYPE_CHECKING
+import torch
+from ..utils import node_replace_, nodes_map
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from torch._ops import HigherOrderOperator
+    from torch.export.graph_signature import ExportGraphSignature
+def _replace_with_hop_helper(
+    node: torch.fx.Node,
+    enter_block_node: torch.fx.Node,
+    wrap_hoo: HigherOrderOperator,
+) -> None:
+    """
+    Replace `node`, whose string target names a submodule of the owning graph
+    module, with a call to `wrap_hoo`.
+    The new call receives the args of `enter_block_node`, then a `get_attr` node
+    for the submodule, then the args of `node`. If the submodule's last node is
+    not an output node, `node` is erased instead.
+    Args:
+        node: node to replace; its graph must have an owning module.
+        enter_block_node: node whose args are prepended to the `wrap_hoo` call and
+            whose `nn_module_stack` metadata is copied onto the inserted nodes.
+        wrap_hoo: higher order operator used as the target of the new call.
+    Raises:
+        NotImplementedError: if the submodule output is neither a tuple/list nor a
+            single node.
+    """
+    graph: torch.fx.Graph = node.graph
+    assert graph.owning_module is not None
+    gm: torch.fx.GraphModule = graph.owning_module
+    assert isinstance(node.target, str)
+    sub_gm = getattr(gm, node.target)
+    def set_hoo_node_meta(call_func_node):
+        # Populate nn_module_stack, torch_fn and val metadata on the new HOO call.
+        # `output_args` is read from the enclosing scope; it is assigned below
+        # before this helper is invoked.
+        call_func_node.meta["nn_module_stack"] = copy.copy(
+            enter_block_node.meta.get("nn_module_stack", {})
+        )
+        call_func_node.meta["torch_fn"] = (
+            f"{wrap_hoo.__name__}",
+            # pyrefly: ignore [missing-attribute]
+            f"{wrap_hoo.__class__.__name__}.{wrap_hoo.__name__}",
+        )
+        if isinstance(output_args, (tuple, list)):
+            call_func_node.meta["val"] = tuple(arg.meta["val"] for arg in output_args)
+        elif isinstance(output_args, torch.fx.Node):
+            call_func_node.meta["val"] = (output_args.meta["val"],)
+    with graph.inserting_before(node):
+        # All nodes created in this block are inserted before `node`.
+        get_attr_node = graph.get_attr(node.target)
+        get_attr_node.meta["nn_module_stack"] = copy.copy(
+            enter_block_node.meta.get("nn_module_stack", {})
+        )
+        # Last node of the subgraph, or None when the subgraph has no nodes.
+        output_node = next(iter(reversed(sub_gm.graph.nodes)), None)
+        # Split_module pass intentionally doesn't add output node
+        # if the graph doesn't return anything.
+        # TODO (tmanlaibaatar) Figure out if this is right behaviour
+        # for split_module
+        if isinstance(output_node, torch.fx.Node) and output_node.op != "output":
+            output_node = None
+        if output_node is not None:
+            assert len(output_node.args) == 1
+            output_args = output_node.args[0]
+            enter_block_node_args = enter_block_node.args
+            if isinstance(output_args, (tuple, list)):
+                # Multi-output subgraph: the HOO call directly replaces `node`.
+                call_func_node = graph.call_function(
+                    wrap_hoo,
+                    (*enter_block_node_args, get_attr_node, *node.args),
+                    {},
+                )
+                # Create the metadata
+                set_hoo_node_meta(call_func_node)
+                node_replace_(node, call_func_node)
+                # Rename the name of getitem nodes to the actual name of its contents
+                # for passing verifier and better readability, also propagate metadata
+                for get_item_node in call_func_node.users:
+                    idx: int = get_item_node.args[1]  # type: ignore[assignment]
+                    # NOTE: `output_node` is rebound here to the idx-th subgraph output.
+                    output_node = output_args[idx]
+                    get_item_node._rename(output_node.name)
+                    # The meta dict is assigned by reference, not copied.
+                    get_item_node.meta = output_node.meta
+            elif isinstance(output_args, torch.fx.Node):
+                # Single-output subgraph: the HOO call takes the output's name.
+                call_func_node = graph.create_node(
+                    "call_function",
+                    wrap_hoo,
+                    (*enter_block_node_args, get_attr_node, *node.args),
+                    {},
+                    output_args.name,
+                )
+                # Modify the subgraph to output a singleton list.
+                output_node.args = ((output_args,),)
+                # Add in an extra `getitem(wrap_hoo, 0)` node to the toplevel graph.
+                get_item_node = graph.create_node(
+                    "call_function",
+                    operator.getitem,
+                    (call_func_node, 0),
+                    {},
+                )
+                # Create the metadata
+                get_item_node.meta = output_args.meta
+                set_hoo_node_meta(call_func_node)
+                node_replace_(node, get_item_node)
+            else:
+                raise NotImplementedError(
+                    f"replace_with_hop_pass doesn't support output type {type(output_args)}"
+                )
+        else:
+            # The subgraph returns nothing, so the call is dropped.
+            # TODO (shangdiy): remove this line, since the export graph can be non-functional
+            node.graph.erase_node(node)
+def _sequential_split_and_maybe_inline_subgraphs_helper(
+    new_gm: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature | None,
+    maybe_inline_or_replace_with_hop: Callable[[torch.fx.Node], None],
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Helper function for replacing graph nodes with higher order nodes.
+    For each subgraph in `new_gm`, decides whether to construct a HOO subgraph, or inline the calls
+    back into the parent graph module, depending on `maybe_inline_or_replace_with_hop`.
+    Returns `new_gm` (recompiled and linted) together with a copy of
+    `graph_signature`, or None in its place when no signature was given.
+    """
+    # new_gm is a new graph module that could have different output args names.
+    # We need to fix the graph signature.
+    replace_ctx = contextlib.nullcontext()
+    new_signature = None
+    if graph_signature is not None:
+        # Cannot deep copy a real ScriptObject, which is referenced
+        # in the FakeScriptObject. Copy should be good enough to guard
+        # against accidental mutation to original graph_signature.
+        # NOTE(review): copy.copy is shallow, so the output specs renamed below
+        # appear to be shared with the original graph_signature - confirm intended.
+        new_signature = copy.copy(graph_signature)
+        new_gm_out_node = next(reversed(new_gm.graph.find_nodes(op="output")))
+        assert new_gm_out_node.op == "output" and len(new_gm_out_node.args[0]) == len(
+            new_signature.output_specs
+        )
+        # Align each output spec's name with the corresponding output node's name.
+        for arg_node, out_spec in zip(
+            new_gm_out_node.args[0], new_signature.output_specs
+        ):
+            if arg_node is None:
+                assert out_spec.arg.value is None  # type: ignore[union-attr]
+            elif (
+                isinstance(arg_node, torch.fx.Node)
+                and out_spec.arg.name != arg_node.name
+            ):
+                out_spec.arg.name = arg_node.name
+        # Install the signature's replace hook on new_gm for the rewrite below.
+        replace_ctx = new_gm._set_replace_hook(new_signature.get_replace_hook())  # type: ignore[assignment]
+    with replace_ctx:
+        # Iterate over a snapshot of the node list; the callback is only applied
+        # to call_module nodes.
+        nodes_map(
+            list(new_gm.graph.nodes),
+            lambda node: (
+                maybe_inline_or_replace_with_hop(node)
+                if node.op == "call_module"
+                else node
+            ),
+        )
+    new_gm.recompile()
+    new_gm.graph.lint()
+    return new_gm, new_signature
+def _replace_with_hop_pass_helper(
+    gm: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature | None,
+    sequential_split_and_maybe_inline_subgraphs: Callable[
+        [torch.fx.GraphModule, ExportGraphSignature | None],
+        tuple[torch.fx.GraphModule, ExportGraphSignature | None],
+    ],
+) -> tuple[torch.fx.GraphModule, ExportGraphSignature | None]:
+    """
+    Split gm into sub-graph-modules using `sequential_split_and_maybe_inline_subgraphs`, and
+    then recursively call itself on each of the submodules.
+    """
+    new_gm, new_signature = sequential_split_and_maybe_inline_subgraphs(
+        gm, graph_signature
+    )
+    # recursively call
+    for node in new_gm.graph.nodes:
+        if node.op == "get_attr":
+            subgm = getattr(new_gm, node.target)
+            if not isinstance(subgm, torch.fx.GraphModule):
+                continue
+            new_subgm, _ = _replace_with_hop_pass_helper(
+                subgm,
+                None,
+                sequential_split_and_maybe_inline_subgraphs,
+            )
+            setattr(new_gm, node.target, new_subgm)
+    new_gm.recompile()
+    new_gm.graph.lint()
+    return new_gm, new_signature

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/dynamic_shapes.py ADDED Viewed

	@@ -0,0 +1,324 @@

+import dataclasses
+from typing import Any, Optional, Union
+import torch
+from torch._dynamo.exc import UserError, UserErrorType
+from torch.export.dynamic_shapes import (
+    _check_dynamic_shapes,
+    _DerivedDim,
+    _DimHint,
+    _tree_map_with_path,
+    Dim,
+)
+from torch.utils._pytree import tree_map
+from .serialize import _dataclass_to_dict
+@dataclasses.dataclass
+class RootDim:
+    """
+    This represents a Dim object.
+    """
+    min: int
+    max: Union[int, None]
+    derived: list[str]
+@dataclasses.dataclass
+class DynamicShapesSpec:
+    """
+    This stores a dynamic_shapes spec for de/serialization.
+    """
+    dynamic_shapes: Union[dict[str, Any], tuple[Any], list[Any], None]
+    dims: dict[str, RootDim]
+def _postprocess_serialized_shapes(
+    dynamic_shapes: Union[dict[str, Any], tuple[Any], list[Any], None],
+    dims: dict[str, dict[str, Union[int, list[str], None]]],
+    to_dict: Optional[bool] = False,
+) -> Union[DynamicShapesSpec, dict[str, Any]]:
+    """
+    Sorts dims and dumps to dictionary format.
+    """
+    from torch.utils._sympy.numbers import int_oo
+    dims = {
+        k: RootDim(
+            min=v["min"],  # type: ignore[arg-type]
+            max=None if v["max"] is int_oo else v["max"],  # type: ignore[arg-type]
+            derived=sorted(v["derived"]),  # type: ignore[arg-type]
+        )
+        for k, v in sorted(dims.items())
+    }
+    # pyrefly: ignore [bad-argument-type]
+    spec = DynamicShapesSpec(dynamic_shapes=dynamic_shapes, dims=dims)
+    if to_dict:
+        return _dataclass_to_dict(spec)
+    else:
+        return spec
+def _dump_dynamic_shapes(
+    dynamic_shapes: Union[dict[str, Any], tuple[Any], list[Any], None],
+    args: tuple[Any],
+    kwargs: Optional[dict[str, Any]] = None,
+    to_dict: Optional[bool] = False,
+) -> Union[DynamicShapesSpec, dict[str, Any]]:
+    """
+    Utility function for dynamic shapes serialization, serializing a dynamic_shapes spec.
+    Returns a DynamicShapesSpec dataclass containing 2 fields, "dynamic_shapes" and "dims".
+    Uses args & kwargs to distinguish between tensor-level and dim-level specs (only for Nones).
+    dynamic_shapes: A pytree structure mirroring the dynamic_shapes input to export():
+        - Each tensor input is represented with a list of values, non-tensor inputs with None.
+        - dynamic dimensions (i.e. symbols) in tensors and Dim enums are represented with strings.
+        - static dimensions are represented with ints.
+    dims: A dictionary mapping each symbol name to the min/max range and derived dim names.
+    For example:
+    ```
+    dx = Dim("dx", min=4, max=16)
+    dy = dx + 1
+    inputs = (
+        [
+            torch.randn(4, 4),
+            torch.randn(5, 4),
+        ],
+        torch.randn(4),
+        torch.randn(4, 4),
+        "hello",
+    )
+    dynamic_shapes = {
+        "a": [
+            (dx, 4),
+            (dy, 4),
+        ],
+        "b": (Dim.STATIC,),
+        "c": None,
+        "d": None,
+    }
+    out = _dump_dynamic_shapes(dynamic_shapes, inputs, to_dict=True)
+    ```
+    would generate the following output:
+    ```
+    {
+        "dynamic_shapes": (
+            [
+                ["dx", 4],
+                ["dx + 1", 4],
+            ],
+            ["_DimHint.STATIC"],
+            ["_DimHint.STATIC", "_DimHint.STATIC"],
+            None,
+        ),
+        "dims": {
+            "dx": {
+                "min": 4,
+                "max": 16,
+                "derived": ["dx + 1"],
+            },
+        },
+    }
+    ```
+    Note: when `dynamic_shapes` is None, a plain dict
+    `{"dynamic_shapes": None, "dims": {}}` is returned regardless of `to_dict`.
+    """
+    # Root dims seen so far, keyed by name; filled in by _track_dim_from_dims.
+    dims: dict[str, dict[str, Any]] = {}
+    def _standardize_shapes(path, tensor, shape):  # type: ignore[no-untyped-def]
+        """
+        Helps standardize the dynamic_shapes tree structure we serialize,
+        returning lists for each tensor shape, handling tensor-level Nones.
+        """
+        if not isinstance(tensor, torch.Tensor):
+            return None
+        if shape is None:
+            return [Dim.STATIC] * len(tensor.shape)
+        out = []
+        if isinstance(shape, dict):
+            # Dict specs may omit dims; missing or None entries fall back to the
+            # concrete size.
+            for i, s in enumerate(tensor.shape):
+                out.append(s if shape.get(i) is None else shape.get(i))
+        else:
+            assert isinstance(shape, (tuple, list))
+            for i, s in enumerate(tensor.shape):
+                out.append(s if shape[i] is None else shape[i])
+        return out
+    def _track_dim_from_dims(
+        val: Union[None, int, _DimHint, Dim],
+    ) -> Union[None, int, str]:
+        """
+        Tracks dims, ranges, derived dims from the standardized dynamic_shapes spec.
+        """
+        if val is None or isinstance(val, int):  # non-tensor input or static
+            return val
+        if isinstance(val, _DimHint):  # store enum as string
+            return val.__class__.__name__ + "." + val.type.name
+        assert isinstance(val, Dim)
+        # track root dim
+        root = val.root if isinstance(val, _DerivedDim) else val  # type: ignore[attr-defined]
+        if root.__name__ not in dims:
+            dims[root.__name__] = {
+                "min": root.min,  # type: ignore[attr-defined,union-attr]
+                "max": root.max,  # type: ignore[attr-defined,union-attr]
+                "derived": set(),
+            }
+        # track derived dims
+        if isinstance(val, _DerivedDim):
+            dims[root.__name__]["derived"].add(val.__name__)
+        return val.__name__
+    # NOTE(review): this early return ignores `to_dict` and always yields a dict,
+    # which `_load_dynamic_shapes` only accepts with from_dict=True - confirm intended.
+    if dynamic_shapes is None:
+        return {"dynamic_shapes": None, "dims": {}}
+    # convert to tuple of specs, for each arg/kwarg
+    kwargs = kwargs or {}
+    if isinstance(dynamic_shapes, dict):
+        dynamic_shapes = dynamic_shapes.values()  # type: ignore[assignment]
+    # pyrefly: ignore [bad-assignment, bad-argument-type]
+    dynamic_shapes = tuple(dynamic_shapes)
+    combined_args = tuple(args) + tuple(kwargs.values())
+    # run same check when we're processing shapes for export - is this too lazy?
+    _check_dynamic_shapes(dict(enumerate(combined_args)), dynamic_shapes)  # type: ignore[arg-type]
+    # First pass: normalize every tensor's spec to a list (None for non-tensors).
+    tree_shapes = _tree_map_with_path(
+        _standardize_shapes, combined_args, dynamic_shapes, tree_name="inputs"
+    )
+    # Second pass: turn each leaf into None/int/str while recording root dims.
+    serialized_shapes = tree_map(_track_dim_from_dims, tree_shapes)
+    return _postprocess_serialized_shapes(serialized_shapes, dims, to_dict=to_dict)
+def _load_dynamic_shapes(
+    spec: Union[DynamicShapesSpec, dict[str, Any]],
+    from_dict: Optional[bool] = False,
+) -> Union[dict[str, Any], tuple[Any], list[Any], None]:
+    """
+    Utility function for dynamic shapes serialization.
+    Deserializes a DynamicShapesSpec or corresponding dictionary into a dynamic_shapes input to export().
+    """
+    import sympy
+    from torch.fx.experimental.symbolic_shapes import _is_supported_equivalence
+    if from_dict:
+        if not isinstance(spec, dict):
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                f"With from_dict=True, expected `spec` to be a dict, got {type(spec)}",
+            )
+        if sorted(spec.keys()) != ["dims", "dynamic_shapes"]:
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                "With from_dict=True, expected `spec` to have keys `dims` and `dynamic_shapes`, "
+                f"instead found {spec.keys()}",
+            )
+        dims = {}
+        for k, v in spec["dims"].items():
+            if not isinstance(k, str):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected `spec['dims']` keys to be strings for symbols, got key {type(k)}",
+                )
+            if sorted(v.keys()) != ["derived", "max", "min"]:
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected `spec['dims']` values to have keys `derived`, `max`, and `min`, "
+                    f"instead found {v.keys()}",
+                )
+            if not isinstance(v["min"], int):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dims in `spec['dims']` to map `min` to an int, got {k}: {v['min']}",
+                )
+            if not isinstance(v["max"], int) or v["max"] is None:
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dims in `spec['dims']` to map `max` to an int or None, got {k}: {v['max']}",
+                )
+            if not isinstance(v["derived"], list) or any(
+                not isinstance(d, str) for d in v["derived"]
+            ):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    "Expected dims in `spec['dims']` to map `derived` to a list of derived expressions, "
+                    f"got {k}: {v['derived']}",
+                )
+            dims[k] = RootDim(**v)
+        dynamic_shapes = spec["dynamic_shapes"]
+    else:
+        if not isinstance(spec, DynamicShapesSpec):
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                f"Expected `spec` to be a DynamicShapesSpec, got {type(spec)}",
+            )
+        dims = spec.dims
+        dynamic_shapes = spec.dynamic_shapes
+    if dynamic_shapes is None:
+        return None
+    dim_cache = {}
+    for name, info in dims.items():
+        symbol = sympy.sympify(name)
+        if not isinstance(symbol, sympy.Symbol):
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                f"Expected `spec['dims']` keys to be symbols, got {name}",
+            )
+        dim_cache[name] = Dim(name, min=info.min, max=info.max)  # cache root dim
+        for _expr in info.derived:
+            expr = sympy.sympify(_expr)
+            if len(expr.free_symbols) != 1 or symbol not in expr.free_symbols:
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected derived expressions in to have {name} as the only free symbol, got {expr}",
+                )
+            if not _is_supported_equivalence(expr):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected derived expressions to be linear expressions, got {expr}",
+                )
+            modulus, remainder = sympy.polys.polytools.div(expr, symbol)
+            ddim = dim_cache[name]
+            if modulus != 1:
+                ddim = int(modulus) * ddim  # type: ignore[assignment, operator]
+            if remainder != 0:
+                ddim = ddim + int(remainder)  # type: ignore[assignment, operator]
+            dim_cache[_expr] = ddim  # cache derived dims
+    def deserialize_shape(
+        val: Union[None, int, str],
+    ) -> Union[None, int, Dim, _DimHint]:
+        if val is None or isinstance(val, int):
+            return val
+        elif val == "_DimHint.AUTO":
+            return _DimHint.AUTO()
+        elif val == "_DimHint.DYNAMIC":
+            return _DimHint.DYNAMIC()
+        elif val == "_DimHint.STATIC":
+            return _DimHint.STATIC()
+        if not isinstance(val, str):
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                "Expected leaves in `spec['dynamic_shapes']` to be ints, None, Dim.AUTO/STATIC, symbols, "
+                f" or derived expressions, got {val}",
+            )
+        if val not in dim_cache:
+            raise UserError(
+                UserErrorType.INVALID_INPUT,
+                "Expected dims in `spec['dynamic_shapes']` to be tracked in `spec['dims']`, "
+                f"got {val} which is not in {dims.keys()}",
+            )
+        return dim_cache[val]  # type: ignore[return-value]
+    return tree_map(deserialize_shape, dynamic_shapes)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/serde/export_schema.thrift ADDED Viewed

	@@ -0,0 +1,377 @@

+// @generated by update_schema.py
+// checksum<<0e870e558fb4362f69b825842ab606cf0becd10a008003ac676156becf20b65b>>
+namespace py3 torch._export
+namespace cpp2 torch._export.schema
+enum ArgumentKind {
+  UNKNOWN = 0,
+  POSITIONAL = 1,
+  KEYWORD = 2,
+}
+enum Layout {
+  Unknown = 0,
+  SparseCoo = 1,
+  SparseCsr = 2,
+  SparseCsc = 3,
+  SparseBsr = 4,
+  SparseBsc = 5,
+  _mkldnn = 6,
+  Strided = 7,
+}
+enum MemoryFormat {
+  Unknown = 0,
+  ContiguousFormat = 1,
+  ChannelsLast = 2,
+  ChannelsLast3d = 3,
+  PreserveFormat = 4,
+}
+enum ScalarType {
+  UNKNOWN = 0,
+  BYTE = 1,
+  CHAR = 2,
+  SHORT = 3,
+  INT = 4,
+  LONG = 5,
+  HALF = 6,
+  FLOAT = 7,
+  DOUBLE = 8,
+  COMPLEXHALF = 9,
+  COMPLEXFLOAT = 10,
+  COMPLEXDOUBLE = 11,
+  BOOL = 12,
+  BFLOAT16 = 13,
+  UINT16 = 28,
+  FLOAT8E4M3FN = 29,
+  FLOAT8E5M2 = 30,
+  FLOAT8E4M3FNUZ = 31,
+  FLOAT8E5M2FNUZ = 32,
+}
+struct Device {
+  10: string type;
+  20: optional i64 index;
+}
+union SymExprHint {
+  10: i64 as_int;
+  20: bool as_bool;
+  30: double as_float;
+}
+struct SymExpr {
+  10: string expr_str;
+  20: optional SymExprHint hint;
+}
+union SymInt {
+  10: SymExpr as_expr;
+  20: i64 as_int;
+}
+union SymFloat {
+  10: SymExpr as_expr;
+  20: double as_float;
+}
+union SymBool {
+  10: SymExpr as_expr;
+  20: bool as_bool;
+}
+struct TensorMeta {
+  10: ScalarType dtype;
+  20: list<SymInt> sizes;
+  30: bool requires_grad;
+  40: Device device;
+  50: list<SymInt> strides;
+  60: SymInt storage_offset;
+  70: Layout layout;
+}
+union SymIntArgument {
+  10: string as_name;
+  20: i64 as_int;
+}
+union SymFloatArgument {
+  10: string as_name;
+  20: double as_float;
+}
+union SymBoolArgument {
+  10: string as_name;
+  20: bool as_bool;
+}
+struct TensorArgument {
+  10: string name;
+}
+struct TokenArgument {
+  10: string name;
+}
+union OptionalTensorArgument {
+  20: TensorArgument as_tensor;
+  10: bool as_none;
+}
+struct GraphArgument {
+  10: string name;
+  20: Graph graph;
+}
+struct CustomObjArgument {
+  10: string name;
+  20: string class_fqn;
+}
+struct ComplexValue {
+  10: double real;
+  20: double imag;
+}
+union Argument {  // Union of every argument form: scalars, lists, symbolic values, tensors, graphs, custom objects, and more.
+  10: bool as_none;
+  20: TensorArgument as_tensor;
+  30: list<TensorArgument> as_tensors;
+  50: i64 as_int;  // Field ids 40 and 60 are not used in this union.
+  70: list<i64> as_ints;
+  80: double as_float;
+  90: list<double> as_floats;
+  100: string as_string;
+  101: list<string> as_strings;
+  110: SymIntArgument as_sym_int;
+  120: list<SymIntArgument> as_sym_ints;
+  130: ScalarType as_scalar_type;
+  140: MemoryFormat as_memory_format;  // MemoryFormat is declared elsewhere in the file.
+  150: Layout as_layout;  // Layout is declared elsewhere in the file.
+  160: Device as_device;
+  170: bool as_bool;
+  180: list<bool> as_bools;
+  182: SymBoolArgument as_sym_bool;
+  184: list<SymBoolArgument> as_sym_bools;
+  200: GraphArgument as_graph;  // Declared before id 190; ids, not declaration order, identify fields.
+  190: list<OptionalTensorArgument> as_optional_tensors;
+  210: CustomObjArgument as_custom_obj;
+  220: string as_operator;
+  230: SymFloatArgument as_sym_float;
+  240: list<SymFloatArgument> as_sym_floats;
+  250: OptionalTensorArgument as_optional_tensor;
+  260: ComplexValue as_complex;
+  280: list<list<i64>> as_int_lists;  // Field id 270 is not used in this union.
+  290: map<string, Argument> as_string_to_argument;  // Recursive: map values are themselves Argument.
+}
+struct NamedArgument {  // Pairs an Argument with a name and an optional ArgumentKind.
+  10: string name;
+  20: Argument arg;
+  30: optional ArgumentKind kind;  // ArgumentKind is declared elsewhere in the file; the field may be absent.
+}
+struct Node {  // One graph node: a target string, named inputs, outputs, and string metadata.
+  10: string target;
+  20: list<NamedArgument> inputs;  // Inputs carry names; outputs below are plain Arguments.
+  30: list<Argument> outputs;
+  40: map<string, string> metadata;
+  50: optional bool is_hop_single_tensor_return;  // NOTE(review): "hop" presumably means higher-order op - confirm.
+}
+struct Graph {  // A graph: input/output arguments, a node list, and name-keyed value tables.
+  10: list<Argument> inputs;
+  20: list<Argument> outputs;
+  30: list<Node> nodes;
+  40: map<string, TensorMeta> tensor_values;  // The maps below are keyed by string; presumably value names - confirm.
+  50: map<string, SymInt> sym_int_values;
+  60: map<string, SymBool> sym_bool_values;
+  70: bool is_single_tensor_return;
+  80: map<string, CustomObjArgument> custom_obj_values;
+  90: map<string, SymFloat> sym_float_values;
+}
+struct UserInputSpec {  // Input spec wrapping a single Argument.
+  10: Argument arg;
+}
+union ConstantValue {  // Union: a constant held as none, int, float, string, or bool.
+  10: bool as_none;
+  20: i64 as_int;
+  30: double as_float;
+  40: string as_string;
+  50: bool as_bool;
+}
+struct InputToConstantInputSpec {  // Input spec associating a name with a ConstantValue.
+  10: string name;
+  20: ConstantValue value;
+}
+struct InputToParameterSpec {  // Input spec associating a TensorArgument with a parameter name.
+  10: TensorArgument arg;
+  20: string parameter_name;
+}
+struct InputToBufferSpec {  // Input spec associating a TensorArgument with a buffer name and a persistent flag.
+  10: TensorArgument arg;
+  20: string buffer_name;
+  30: bool persistent;  // NOTE(review): presumably mirrors the module buffer's persistence setting - confirm.
+}
+struct InputToTensorConstantSpec {  // Input spec associating a TensorArgument with a tensor-constant name.
+  10: TensorArgument arg;
+  20: string tensor_constant_name;
+}
+struct InputToCustomObjSpec {  // Input spec associating a CustomObjArgument with a custom-object name.
+  10: CustomObjArgument arg;
+  20: string custom_obj_name;
+}
+struct InputTokenSpec {  // Input spec wrapping a single TokenArgument.
+  10: TokenArgument arg;
+}
+union InputSpec {  // Union over all input spec variants declared in this file.
+  10: UserInputSpec user_input;
+  20: InputToParameterSpec parameter;
+  30: InputToBufferSpec buffer;
+  40: InputToTensorConstantSpec tensor_constant;
+  50: InputToCustomObjSpec custom_obj;
+  70: InputTokenSpec token;  // Declared before id 60; ids, not declaration order, identify fields.
+  60: InputToConstantInputSpec constant_input;
+}
+struct UserOutputSpec {  // Output spec wrapping a single Argument.
+  10: Argument arg;
+}
+struct LossOutputSpec {  // Output spec wrapping a single TensorArgument.
+  10: TensorArgument arg;
+}
+struct BufferMutationSpec {  // Output spec associating a TensorArgument with a buffer name.
+  10: TensorArgument arg;
+  20: string buffer_name;
+}
+struct ParameterMutationSpec {  // Output spec associating a TensorArgument with a parameter name.
+  10: TensorArgument arg;
+  20: string parameter_name;
+}
+struct GradientToParameterSpec {  // Output spec associating a TensorArgument with a parameter name.
+  10: TensorArgument arg;
+  20: string parameter_name;
+}
+struct GradientToUserInputSpec {  // Output spec associating a TensorArgument with a user-input name.
+  10: TensorArgument arg;
+  20: string user_input_name;
+}
+struct UserInputMutationSpec {  // Output spec associating a TensorArgument with a user-input name.
+  10: TensorArgument arg;
+  20: string user_input_name;
+}
+struct OutputTokenSpec {  // Output spec wrapping a single TokenArgument.
+  10: TokenArgument arg;
+}
+union OutputSpec {  // Union over all output spec variants declared in this file.
+  10: UserOutputSpec user_output;
+  20: LossOutputSpec loss_output;
+  30: BufferMutationSpec buffer_mutation;
+  40: GradientToParameterSpec gradient_to_parameter;
+  50: GradientToUserInputSpec gradient_to_user_input;
+  60: UserInputMutationSpec user_input_mutation;
+  70: OutputTokenSpec token;
+  80: ParameterMutationSpec parameter_mutation;
+}
+struct GraphSignature {  // A signature made of a list of InputSpec and a list of OutputSpec.
+  10: list<InputSpec> input_specs;
+  20: list<OutputSpec> output_specs;
+}
+struct RangeConstraint {  // A range with optional minimum and maximum i64 bounds.
+  10: optional i64 min_val;  // Either bound may be absent (both are declared optional).
+  20: optional i64 max_val;  // NOTE(review): whether bounds are inclusive is not stated here - confirm.
+}
+struct ModuleCallSignature {  // A module call's inputs and outputs, two spec strings, and optional forward argument names.
+  10: list<Argument> inputs;
+  20: list<Argument> outputs;
+  30: string in_spec;  // NOTE(review): the encoding of in_spec/out_spec is not defined in this file - confirm.
+  40: string out_spec;
+  50: optional list<string> forward_arg_names;  // Declared optional, so it may be absent.
+}
+struct ModuleCallEntry {  // One module-call-graph entry: an fqn string and an optional signature.
+  10: string fqn;
+  30: optional ModuleCallSignature signature;  // Field id 20 is not used in this struct; the signature may be absent.
+}
+struct NamedTupleDef {  // A named-tuple definition given as its list of field names.
+  10: list<string> field_names;
+}
+struct GraphModule {  // Bundles a Graph with its signature, module call graph, metadata, and named-tuple definitions.
+  10: Graph graph;
+  50: GraphSignature signature;  // Ids are declared as 10, 50, 60, 40, 70; ids 20 and 30 are not used.
+  60: list<ModuleCallEntry> module_call_graph;
+  40: map<string, string> metadata;
+  70: map<string, NamedTupleDef> treespec_namedtuple_fields;
+}
+struct SchemaVersion {  // A two-part version number: major and minor.
+  10: i64 major;
+  20: i64 minor;
+}
+struct ExportedProgram {  // Top-level program record: a GraphModule plus opset, range-constraint, and version data.
+  10: GraphModule graph_module;
+  20: map<string, i64> opset_version;
+  30: map<string, RangeConstraint> range_constraints;
+  60: SchemaVersion schema_version;  // Field ids 40 and 50 are not used in this struct.
+  70: list<string> verifiers;
+  80: string torch_version;
+  90: list<string> guards_code;
+}
+struct PayloadMeta {  // Per-payload metadata: a path name, two bool flags, and optional tensor metadata.
+  10: string path_name;
+  20: bool is_param;
+  30: bool use_pickle;
+  40: optional TensorMeta tensor_meta;  // Declared optional, so it may be absent.
+}
+struct PayloadConfig {  // A string-keyed table of PayloadMeta entries.
+  10: map<string, PayloadMeta> config;
+}
+struct AOTInductorModelPickleData {  // Library base name, input/output names, and optional dtype and CPU fields.
+  1: string library_basename;  // This struct numbers its fields 1-6, not in steps of 10 like the others.
+  2: list<string> input_names;
+  3: list<string> output_names;
+  4: optional i64 floating_point_input_dtype;  // Typed i64, not ScalarType; NOTE(review): confirm the integer encoding.
+  5: optional i64 floating_point_output_dtype;
+  6: optional bool aot_inductor_model_is_cpu;
+}
+struct ExternKernelNode {  // Pairs a name with a Node.
+  10: string name;
+  20: Node node;
+}
+struct ExternKernelNodes {  // Container holding a list of ExternKernelNode.
+  10: list<ExternKernelNode> nodes;
+}