BryanW committed on Mar 23

Commit

175af23

verified ·

1 Parent(s): a1d17d3

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_VF.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__config__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__future__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_appdirs.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_classes.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_compile.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_custom_ops.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_environment.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_guards.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_jit_internal.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_linalg_utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lobpcg.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lowrank.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_namedtensor_internals.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_ops.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_python_dispatcher.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_size_docs.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_sources.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_storage_docs.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_streambase.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor_str.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_thread_safe_fork.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils_internal.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_vmap_internals.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_weights_only_unpickler.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/functional.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/hub.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/library.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/quasirandom.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/random.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/return_types.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/serialization.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/storage.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/torch_version.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/types.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/version.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_awaits/__init__.py +53 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/__init__.py +549 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_jvp.py +336 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_rng.py +266 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/python.py +192 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/config.py +45 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/error.py +56 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/verifier.py +531 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__autotune_main__.py +33 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__init__.py +447 -0

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_VF.cpython-312.pyc ADDED Viewed

Binary file (1.56 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__config__.cpython-312.pyc ADDED Viewed

Binary file (1.08 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__future__.cpython-312.pyc ADDED Viewed

Binary file (3.58 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_appdirs.cpython-312.pyc ADDED Viewed

Binary file (29.6 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_classes.cpython-312.pyc ADDED Viewed

Binary file (3.34 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_compile.cpython-312.pyc ADDED Viewed

Binary file (2.54 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_custom_ops.cpython-312.pyc ADDED Viewed

Binary file (13.9 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_environment.cpython-312.pyc ADDED Viewed

Binary file (344 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_guards.cpython-312.pyc ADDED Viewed

Binary file (57 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_jit_internal.cpython-312.pyc ADDED Viewed

Binary file (52.6 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_linalg_utils.cpython-312.pyc ADDED Viewed

Binary file (6.66 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lobpcg.cpython-312.pyc ADDED Viewed

Binary file (49.5 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lowrank.cpython-312.pyc ADDED Viewed

Binary file (12.8 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_namedtensor_internals.cpython-312.pyc ADDED Viewed

Binary file (6.34 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_ops.cpython-312.pyc ADDED Viewed

Binary file (63.1 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_python_dispatcher.cpython-312.pyc ADDED Viewed

Binary file (5.34 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_size_docs.cpython-312.pyc ADDED Viewed

Binary file (1.35 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_sources.cpython-312.pyc ADDED Viewed

Binary file (5.69 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_storage_docs.cpython-312.pyc ADDED Viewed

Binary file (1.75 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_streambase.cpython-312.pyc ADDED Viewed

Binary file (968 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor.cpython-312.pyc ADDED Viewed

Binary file (78.1 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor_str.cpython-312.pyc ADDED Viewed

Binary file (32.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_thread_safe_fork.cpython-312.pyc ADDED Viewed

Binary file (220 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils.cpython-312.pyc ADDED Viewed

Binary file (44.5 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils_internal.cpython-312.pyc ADDED Viewed

Binary file (14.3 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_vmap_internals.cpython-312.pyc ADDED Viewed

Binary file (10.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_weights_only_unpickler.cpython-312.pyc ADDED Viewed

Binary file (26 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/functional.cpython-312.pyc ADDED Viewed

Binary file (85.1 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/hub.cpython-312.pyc ADDED Viewed

Binary file (37.9 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/library.cpython-312.pyc ADDED Viewed

Binary file (75 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/quasirandom.cpython-312.pyc ADDED Viewed

Binary file (10.3 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/random.cpython-312.pyc ADDED Viewed

Binary file (8.97 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/return_types.cpython-312.pyc ADDED Viewed

Binary file (2.04 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/serialization.cpython-312.pyc ADDED Viewed

Binary file (87.8 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/storage.cpython-312.pyc ADDED Viewed

Binary file (73.7 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/torch_version.cpython-312.pyc ADDED Viewed

Binary file (3.27 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/types.cpython-312.pyc ADDED Viewed

Binary file (4.21 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/version.cpython-312.pyc ADDED Viewed

Binary file (636 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_awaits/__init__.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from __future__ import annotations
+from typing import Generic, TypeVar
+import torch
+__all__ = ['Await']
+W = TypeVar("W")
+# Metaclass that derives from both the metaclass of ``torch._C._Await`` and the
+# metaclass of ``typing.Generic``. ``_Await`` uses it so that it can list both
+# of those classes as bases (presumably to avoid a metaclass conflict).
+class _PyAwaitMeta(type(torch._C._Await), type(Generic)):  # type: ignore[misc, no-redef]
+    pass
+class _Await(torch._C._Await, Generic[W], metaclass=_PyAwaitMeta):
+    r"""
+    Wrapper around a ``torch._C._Await`` which encapsulates delayed execution
+    of a callable. All manipulations happen with the functions ``torch.jit._awaitable``,
+    ``torch.jit._awaitable_wait`` and ``torch.jit._awaitable_nowait``.
+    Torch scriptable manipulations:
+    ``torch.jit._awaitable(func, *args)``
+    Creates an ``Await[W]`` object, where ``W`` is the return type of ``func``.
+    Returns:
+        The created ``Await[W]`` object.
+    ``torch.jit._awaitable_wait(Await[W])``
+    Returns the result of the function specified at ``_awaitable``, called with the specified arguments.
+    Returns:
+        The result of type ``W`` of the function call. The result is owned by ``Await[W]``
+        and returned on all following ``_awaitable_wait`` calls.
+    ``torch.jit._awaitable_nowait(W)``
+    Returns:
+        Trivial ``Await[W]`` with the specified result.
+    Only in eager mode:
+    ``fn() -> Callable[Tuple[Any], W]``
+    Returns:
+        The python function ``func`` specified at ``_awaitable``.
+    ``args() -> Tuple[Any]``
+    Returns:
+        The python args specified at ``_awaitable``.
+    ``is_nowait() -> _bool``
+    Returns:
+        ``True`` if this object was created via an ``_awaitable_nowait`` call (trivial ``Await[W]``).
+    In eager mode ``Await[W]`` can be used as ``W``, i.e. attributes of ``W`` can be accessed on ``Await[W]``;
+    the ``_awaitable_wait()`` call will be transparently added.
+    """

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/__init__.py ADDED Viewed

	@@ -0,0 +1,549 @@

+# mypy: allow-untyped-defs
+import inspect
+from collections import defaultdict
+from collections.abc import Callable, Sequence
+from functools import lru_cache, partial, wraps
+from itertools import chain
+from typing import Optional, TYPE_CHECKING, TypeVar, Union
+from typing_extensions import ParamSpec
+if TYPE_CHECKING:
+    from torch.export.decomp_utils import CustomDecompTable
+import torch
+import torch.library
+from torch._ops import HigherOrderOperator, OperatorBase, OpOverload, OpOverloadPacket
+from torch._prims_common import CustomOutParamAnnotation
+from torch._subclasses.functional_tensor import FunctionalTensor
+from torch.utils import _pytree as pytree
+__all__ = [
+    "decomposition_table",
+    "pre_autograd_decomposition_table",
+    "meta_table",
+    "register_decomposition",
+    "get_decompositions",
+    "core_aten_decompositions",
+    "_should_decompose_because_unsafe_op",
+]
+# Type variables used in the annotations of ``register_decomposition`` so that
+# the decorator is typed as returning a callable with the same signature as
+# the function it receives.
+_T = TypeVar("_T")
+_P = ParamSpec("_P")
+# TODO: relax key type here; torch registrations should be possible too; but
+# right now this type is accurate
+# Maps a table name to its registry of ``operator -> decomposition callable``.
+# Because this is a ``defaultdict(dict)``, indexing it with a name that is not
+# present yet creates (and stores) an empty registry for that name.
+global_decomposition_table: dict[str, dict[torch._ops.OperatorBase, Callable]] = (
+    defaultdict(dict)
+)
+# Module-level aliases for three registries. Each alias is the very dict object
+# stored in ``global_decomposition_table``, so writes through either name are
+# visible through the other.
+decomposition_table = global_decomposition_table["post_autograd"]
+pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"]
+meta_table = global_decomposition_table["meta"]
+def _should_decompose_because_unsafe_op(op: torch._ops.OperatorBase) -> bool:
+    """
+    Returns True if the op must always decompose in export/compile tracing system.
+    In export, we always decompose certain CIA ops that are tagged with
+    maybe_aliasing_or_mutating because we statically need to know if the op is
+    mutating or not. But these CIA ops could have different behaviour in runtime.
+    native_batch_norm is a prim op which has a wrong schema and it needs to be replaced
+    with correct schema. But until then, we will force decompose it via this function.
+    Args:
+        op: The operator to check.
+    Returns:
+        False for anything that is not an ``OpOverload``; True if the overload
+        carries ``torch.Tag.maybe_aliasing_or_mutating`` or is exactly
+        ``torch.ops.aten.native_batch_norm.default``; False otherwise.
+    """
+    # Only concrete overloads are considered; every other operator kind is
+    # reported as safe.
+    if not isinstance(op, torch._ops.OpOverload):
+        return False
+    if torch.Tag.maybe_aliasing_or_mutating in op.tags:
+        return True
+    # Identity check against the single special-cased overload.
+    return op is torch.ops.aten.native_batch_norm.default
+def _add_op_to_registry(registry, op, fn):
+    """
+    This is an internal API for adding an op to the decomposition table.
+    If op is HigherOrderOperator, it will be stored in the registry directly,
+    overwriting any existing entry (no duplicate check is performed).
+    If op is OpOverload, it will be added to the registry directly.
+    If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry.
+    Args:
+        registry: Mutable mapping from operator to decomposition; updated in place.
+        op: A HigherOrderOperator, OpOverload or OpOverloadPacket.
+        fn: The callable to store for the operator(s).
+    Raises:
+        AssertionError: If op is none of the three supported kinds.
+        RuntimeError: If one of the overloads is already present in registry.
+    """
+    overloads: list[Union[torch._ops.OperatorBase]] = []
+    if isinstance(op, HigherOrderOperator):
+        # There's no concept of overloads for HigherOrderOperator
+        registry[op] = fn
+        return
+    elif isinstance(op, OpOverload):
+        overloads.append(op)
+    else:
+        assert isinstance(op, OpOverloadPacket)
+        # ``op.overloads()`` yields overload names; resolve each name to the
+        # overload object through attribute access on the packet.
+        for ol in op.overloads():
+            overloads.append(getattr(op, ol))
+    for op_overload in overloads:
+        # The duplicate check runs before the kernel check below, so a repeated
+        # registration raises even when earlier overloads were already stored.
+        if op_overload in registry:
+            raise RuntimeError(f"duplicate registrations for {op_overload}")
+        # TorchScript dumps a bunch of extra nonsense overloads
+        # which don't have corresponding dispatcher entries, we need
+        # to filter those out, e.g aten.add.float_int
+        if torch._C._dispatch_has_kernel(op_overload.name()):
+            registry[op_overload] = fn
+def _convert_out_params(f):
+    """
+    Wrap ``f`` so that its single ``out`` parameter is exposed under the
+    keyword names that callers actually pass.
+    Three cases are handled, based on ``f.__annotations__``:
+    * No (or falsy) ``out`` annotation: ``f`` is returned unchanged.
+    * ``out`` annotated as a tuple: the wrapper accepts one keyword-only
+      argument per field name of ``f``'s return annotation, and forwards them
+      to ``f`` packed as ``out=(...)`` (or ``out=None`` when none were given).
+    * A ``CustomOutParamAnnotation`` entry is present: the wrapper accepts a
+      keyword-only argument with that custom name and forwards it as ``out``.
+    In the two wrapping cases the wrapper's ``__signature__`` and
+    ``__annotations__`` are rewritten to drop ``out`` and list the new
+    keyword-only parameter(s) instead.
+    Args:
+        f: The decomposition function to adapt.
+    Returns:
+        Either ``f`` itself or a wrapper around it as described above.
+    """
+    out_annotation = f.__annotations__.get("out")
+    # If there are no out params, do not wrap the function.
+    if not out_annotation:
+        return f
+    # Hack to detect when out is a Tuple. There seems to be no pretty way of doing this
+    if getattr(out_annotation, "__origin__", None) is tuple:
+        sig = inspect.signature(f)
+        # Relies on the return annotation exposing ``_fields`` (as namedtuple
+        # classes do); those field names become the new keyword names.
+        out_names = sig.return_annotation._fields
+        # If out is a tuple, we need to register a function that unpacks all the out
+        # elements as this is what native_functions.yaml expects
+        @wraps(f)
+        def _fn(*args, **kwargs):
+            out_kwargs = tuple(kwargs.pop(o, None) for o in out_names)
+            # Either all of the out kwargs are set or none of them
+            is_none = out_kwargs[0] is None
+            assert all((o is None) == is_none for o in out_kwargs)
+            return f(*args, **kwargs, out=None if is_none else out_kwargs)
+        # One keyword-only parameter per output, typed with the matching
+        # element of the tuple annotation.
+        out_params = [
+            inspect.Parameter(
+                o,
+                kind=inspect.Parameter.KEYWORD_ONLY,
+                default=None,
+                annotation=t,
+            )
+            for o, t in zip(out_names, out_annotation.__args__)
+        ]
+        # Drop the out parameter and concatenate the new kwargs in the signature
+        params = chain((v for k, v in sig.parameters.items() if k != "out"), out_params)
+        _fn.__signature__ = inspect.Signature(  # type: ignore[attr-defined]
+            parameters=params,  # type: ignore[arg-type]
+            return_annotation=sig.return_annotation,
+        )
+        # Drop the out parameter and concatenate the new kwargs in the annotations
+        _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
+        for o in out_params:
+            _fn.__annotations__[o.name] = o.annotation
+        # Propagate that this function is wrapped by `out_wrapper`
+        # (this attribute access requires ``f`` to already carry the marker).
+        _fn._torch_decompositions_out_wrapper = f._torch_decompositions_out_wrapper  # type: ignore[attr-defined]
+        return _fn
+    # Alternatively, there may be a single tensor out parameter with a name
+    # other than "out". This will need special treatment and is indicated by an
+    # annotation, which we will remove here so it is not exposed after wrapping.
+    # Note that ``pop`` mutates ``f.__annotations__`` in place.
+    custom_out_param_name = f.__annotations__.pop(CustomOutParamAnnotation, None)
+    if custom_out_param_name:
+        @wraps(f)
+        def _fn(*args, **kwargs):
+            out_kwarg = kwargs.pop(custom_out_param_name, None)
+            return f(*args, **kwargs, out=out_kwarg)
+        out_param = inspect.Parameter(
+            custom_out_param_name,
+            kind=inspect.Parameter.KEYWORD_ONLY,
+            default=None,
+            annotation=out_annotation,
+        )
+        # Drop the out parameter and concatenate the new kwarg in the signature
+        sig = inspect.signature(f)
+        params = chain(
+            (v for k, v in sig.parameters.items() if k != "out"), (out_param,)
+        )
+        _fn.__signature__ = inspect.Signature(  # type: ignore[attr-defined]
+            parameters=params,  # type: ignore[arg-type]
+            return_annotation=sig.return_annotation,
+        )
+        # Drop the out parameter and concatenate the new kwargs in the annotations
+        _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
+        _fn.__annotations__[out_param.name] = out_param.annotation
+        return _fn
+    # ``out`` is annotated, but neither as a tuple nor with a custom name:
+    # nothing to adapt.
+    return f
+def register_decomposition(
+    aten_op, registry=None, *, type="post_autograd", unsafe=False
+) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+    """
+    A decorator to register a function as a decomposition to the Python
+    decomposition table.  Use it like this::
+        @register_decomposition(torch.ops.aten.clamp_min)
+        def clamp_min(self, min):
+            return torch.clamp(self, min=min)
+    If you are writing a new decomposition, consider contributing it
+    directly to PyTorch in torch._decomp.decompositions.
+    This API is experimental; we are almost certainly going to extend
+    the API when we make decompositions eligible for use in transforms (e.g.,
+    autograd) and not just backend tracing, where we then need to know if a
+    decomposition can be used to simulate a transform.
+    By default, we also will register it to the Meta key of dispatcher,
+    and replace the c++ Meta implementation if there is already one.
+    unsafe kwarg is for reuse of this function for registering non-function
+    things
+    Args:
+        aten_op: The operator to register for, or a pytree (e.g. a list) of
+            operators; the decomposition is registered for every leaf.
+        registry: The table to register into. When None, the table named by
+            ``type`` in ``global_decomposition_table`` is used.
+        type: One of "post_autograd", "pre_autograd" or "meta".
+        unsafe: When True, the function is stored as is, without being passed
+            through ``_convert_out_params``.
+    Returns:
+        A decorator. The decorator returns the function it was given,
+        unmodified; the (possibly wrapped) version is what is stored in the
+        registry.
+    """
+    assert type in {"post_autograd", "pre_autograd", "meta"}
+    def decomposition_decorator(fn: Callable[_P, _T]) -> Callable[_P, _T]:
+        # Keep the original so that decorating does not change what the
+        # caller's name is bound to.
+        orig_fn = fn
+        if not unsafe:
+            fn = _convert_out_params(fn)
+        nonlocal registry
+        if registry is None:
+            registry = global_decomposition_table[type]
+        def register(op):
+            _add_op_to_registry(registry, op, fn)
+        # To handle allowing multiple aten_ops at once
+        pytree.tree_map_(register, aten_op)
+        return orig_fn
+    return decomposition_decorator
+def get_decompositions(
+    aten_ops: Sequence[Union[torch._ops.OperatorBase, OpOverloadPacket]],
+    type: str = "post_autograd",
+) -> dict[torch._ops.OperatorBase, Callable]:
+    """
+    Retrieve a dictionary of decompositions corresponding to the list of
+    operator overloads and overload packets passed as input.  Overload
+    packets will include all decomposed overloads in the packet.  If there is
+    no decomposition for a requested operator, it is silently ignored.
+    This API is experimental; we are almost certainly going to give an alternate,
+    more recommended formulation, where a user provides the set of operators
+    they know how to implement, and we provide decompositions for everything
+    not in this set.
+    Args:
+        aten_ops: The operators and/or overload packets to look up.
+        type: Which table to read: "post_autograd", "pre_autograd" or "meta".
+    Returns:
+        A new dict mapping each found operator to its registered decomposition.
+    """
+    assert type in {"post_autograd", "pre_autograd", "meta"}
+    registry = global_decomposition_table[type]
+    # Index the registered entries by their overload packet so that a requested
+    # packet can be expanded to the overloads that actually have an entry.
+    packets_to_overloads = defaultdict(list)
+    for opo in registry:
+        if isinstance(opo, (OpOverload, OpOverloadPacket)):
+            packets_to_overloads[opo.overloadpacket].append(opo)
+    decompositions: dict[torch._ops.OperatorBase, Callable] = {}
+    for op in aten_ops:
+        # The membership test does not insert into the defaultdict, so packets
+        # without any registered overload fall through and are ignored.
+        if isinstance(op, OpOverloadPacket) and op in packets_to_overloads:
+            for op_overload in packets_to_overloads[op]:
+                decompositions[op_overload] = registry[op_overload]
+        elif isinstance(op, (torch._ops.OperatorBase)) and op in registry:
+            decompositions[op] = registry[op]
+    return decompositions
+def remove_decompositions(
+    decompositions: dict[torch._ops.OperatorBase, Callable],
+    aten_ops: Sequence[Union[OpOverload, OpOverloadPacket]],
+) -> None:
+    """
+    Given a dictionary of decompositions obtained from get_decompositions(), removes
+    operators associated with a list of operator overloads and overload packets passed
+    as input. If the decomposition dictionary does not contain a decomposition that is
+    specified to be removed, it is silently ignored.
+    Args:
+        decompositions: The dictionary to prune; it is modified in place.
+        aten_ops: The overloads and/or overload packets whose entries should
+            be removed. Items of any other kind are skipped.
+    """
+    for op in aten_ops:
+        if isinstance(op, OpOverloadPacket):
+            # A packet stands for every overload it lists.
+            for overload_name in op.overloads():
+                opo = getattr(op, overload_name)
+                decompositions.pop(opo, None)
+        elif isinstance(op, OpOverload):
+            decompositions.pop(op, None)
+# populate the table
+import torch._decomp.decompositions
+import torch._refs
+def core_aten_decompositions() -> "CustomDecompTable":
+    """
+    Return the result of ``torch.export.exported_program.default_decompositions()``.
+    """
+    # Imported inside the function rather than at module level
+    # (presumably to avoid a circular import -- confirm).
+    from torch.export.exported_program import default_decompositions
+    return default_decompositions()
+# See NOTE [Core ATen Ops]
+#
+# list was copied from torch/_inductor/decomposition.py
+# excluding decompositions that results in prim ops
+# Resulting opset of decomposition is core aten ops
+def _core_aten_decompositions_post_autograd() -> dict[
+    torch._ops.OperatorBase, Callable
+]:
+    aten = torch.ops.aten
+    return get_decompositions(
+        [
+            aten.addcdiv,
+            aten.addcdiv_,
+            aten.addcmul,
+            aten.addcmul_,
+            aten.addr,
+            aten.affine_grid_generator,
+            aten.alias_copy,
+            aten.all,
+            aten.aminmax,
+            aten.arange.default,
+            aten.arange.start,
+            aten.avg_pool2d_backward,
+            aten.baddbmm,
+            aten.binary_cross_entropy,
+            aten.binary_cross_entropy_backward,
+            aten.binary_cross_entropy_with_logits,
+            aten.block_diag,
+            aten.bernoulli.p,
+            aten.bernoulli.default,
+            aten.celu,
+            aten.celu_,
+            aten.channel_shuffle,
+            aten.clamp_max,
+            aten.clamp_min,
+            aten.col2im,
+            aten.count_nonzero,
+            aten.linalg_cross,
+            aten.cudnn_batch_norm,
+            aten.cudnn_batch_norm_backward,
+            aten.miopen_batch_norm_backward,
+            aten.deg2rad,
+            aten.deg2rad_,
+            aten.detach,
+            aten.diag_embed,
+            aten.diagonal_backward,
+            aten.diagonal_copy,
+            aten.dot,
+            aten.vdot,
+            aten.elu_,
+            aten.elu_backward,
+            aten._embedding_bag,
+            aten.embedding_dense_backward,
+            aten.empty_like,
+            aten._euclidean_dist.default,
+            aten.expand_as,
+            aten.expand_copy,
+            aten.eye,
+            aten.fill,
+            aten.fill_,
+            aten.floor_divide,
+            aten.frac,
+            aten.frac_,
+            aten._fused_moving_avg_obs_fq_helper,
+            aten.gelu_,
+            aten.gelu_backward,
+            aten.glu,
+            aten.glu_backward,
+            aten.hardshrink,
+            aten.hardsigmoid,
+            aten.hardsigmoid_,
+            aten.hardsigmoid_backward,
+            aten.hardswish,
+            aten.hardswish_,
+            aten.hardswish_backward,
+            aten.hardtanh_,
+            aten.hardtanh_backward,
+            aten.heaviside,
+            aten.heaviside_,
+            aten.huber_loss,
+            aten.huber_loss_backward,
+            aten.im2col,
+            aten.index_add.out,
+            aten.index_add.default,
+            aten.index_add_,
+            aten.index_copy.out,
+            aten.index_copy.default,
+            aten.index_copy_,
+            aten.index_fill.int_Scalar,
+            aten.index_fill.int_Tensor,
+            aten.index_fill.int_Scalar_out,
+            aten.index_fill.int_Tensor_out,
+            aten.index_fill_,
+            aten.isin,
+            aten.isneginf,
+            aten.isposinf,
+            aten.l1_loss,
+            aten._lazy_clone,
+            aten._test_parallel_materialize,
+            aten.leaky_relu_,
+            aten.leaky_relu_backward,
+            aten.lerp,
+            aten.lerp_,
+            aten.linspace,
+            aten.logaddexp,
+            aten.logaddexp2,
+            aten.logit,
+            aten.logit_,
+            aten.logit_backward,
+            aten.log_sigmoid_backward,
+            aten.log_sigmoid_forward,
+            aten._log_softmax_backward_data,
+            aten.logspace,
+            aten.logsumexp.default,
+            aten.masked_fill,
+            aten.masked_fill_,
+            aten.max_unpool2d,
+            aten.max_unpool3d,
+            aten.mish,
+            aten.mish_,
+            aten.mish_backward,
+            aten.mse_loss,
+            aten.mse_loss_backward,
+            aten.multi_margin_loss,
+            aten.multilabel_margin_loss_forward,
+            aten.mv,
+            aten.mvlgamma,
+            aten.mvlgamma_,
+            aten.nansum,
+            aten.nan_to_num,
+            aten.nan_to_num_,
+            aten.narrow,
+            aten.native_batch_norm_backward,
+            aten.native_dropout_backward,
+            aten.native_group_norm_backward,
+            aten.native_layer_norm_backward,
+            aten._fused_rms_norm,
+            aten._fused_rms_norm_backward,
+            aten.new_empty,
+            aten.new_full,
+            aten.new_ones,
+            aten.new_zeros,
+            aten.nll_loss2d_forward,
+            aten.nll_loss2d_backward,
+            aten.nll_loss_backward,
+            aten.nll_loss_forward,
+            aten.norm.ScalarOpt_dtype,
+            aten.norm.Scalar,
+            aten.norm.ScalarOpt_dim_dtype,
+            aten.norm.ScalarOpt_dim,
+            aten.norm.dtype_out,
+            aten.norm.out,
+            aten.norm.names_dtype_out,
+            aten.norm.names_out,
+            aten.norm.ScalarOpt_dtype_out,
+            aten.norm.Scalar_out,
+            aten.ones,
+            aten.ones_like,
+            aten.pixel_shuffle,
+            aten.pixel_unshuffle,
+            aten._prelu_kernel,
+            aten._prelu_kernel_backward,
+            aten._reshape_alias,
+            aten.rad2deg,
+            aten.rad2deg_,
+            aten.reflection_pad1d,
+            aten.reflection_pad1d_backward,
+            aten.reflection_pad2d,
+            aten.reflection_pad2d_backward,
+            aten.reflection_pad3d,
+            aten.reflection_pad3d_backward,
+            aten.replication_pad1d,
+            aten.replication_pad2d,
+            aten.replication_pad3d,
+            aten.renorm,
+            aten.renorm_,
+            aten.replication_pad2d,
+            aten.resize_as,
+            aten.roll,
+            aten.rot90,
+            aten.rrelu_with_noise,
+            aten.rrelu_with_noise_,
+            aten.rsub,
+            aten._safe_softmax,
+            aten._scaled_dot_product_flash_attention_for_cpu.default,
+            aten.select_backward,
+            aten.select_scatter,
+            aten.sgn,
+            aten.sgn_,
+            aten.sigmoid_backward,
+            aten.silu,
+            aten.silu_,
+            aten.silu_backward.grad_input,
+            aten.silu_backward,
+            aten.sinc,
+            aten.sinc_,
+            aten.slice_backward,
+            aten.smooth_l1_loss,
+            aten.smooth_l1_loss_backward,
+            aten.soft_margin_loss,
+            aten.soft_margin_loss_backward,
+            aten._softmax_backward_data,
+            aten.softplus,
+            aten.softplus_backward,
+            aten.softshrink,
+            aten.special_entr,
+            aten.special_log_ndtr,
+            aten.special_xlog1py,
+            aten.split.Tensor,
+            aten.split_with_sizes_copy,
+            aten.squeeze_copy,
+            aten.squeeze.default,
+            aten.squeeze.dim,
+            aten.std.correction,
+            aten.std.out,
+            aten.std.correction_out,
+            aten.std.names_out,
+            aten.std.correction_names_out,
+            aten.std_mean.correction,
+            aten.std_mean.correction_out,
+            aten.stack,
+            aten.sum.default,
+            aten.sum.out,
+            aten.t,
+            aten.t_copy,
+            aten.take,
+            aten.tanh_backward,
+            aten.threshold,
+            aten.threshold_,
+            aten.threshold_backward,
+            aten.trace,
+            aten.transpose.int,
+            aten.transpose_copy,
+            aten.tril,
+            aten.tril_,
+            aten.triu,
+            aten.triu_,
+            aten.unbind,
+            aten.unfold_backward,
+            aten.unfold_copy,
+            aten._unsafe_index,
+            aten._unsafe_index_put,
+            aten._unsafe_masked_index,
+            aten._unsafe_masked_index_put_accumulate,
+            aten.unsafe_split.Tensor,
+            aten.unsafe_split_with_sizes,
+            aten.unsqueeze_copy,
+            aten._unsafe_view,
+            aten.upsample_linear1d,
+            aten.upsample_bilinear2d.out,
+            aten.upsample_trilinear3d.out,
+            aten.upsample_nearest2d_backward,
+            aten.view_as_complex,
+            aten.xlogy,
+            aten.xlogy_,
+            aten.zero,
+            aten.zero_,
+            aten.zeros,
+            aten.zeros_like,
+            aten._chunk_cat,
+            aten._weight_norm_interface,
+        ]
+    )

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions.py ADDED Viewed

The diff for this file is too large to render. See raw diff

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_jvp.py ADDED Viewed

	@@ -0,0 +1,336 @@

+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import inspect
+from collections.abc import Callable
+from typing import Optional
+import torch
+import torch._decomp
+from torch import Tensor
+from torch._prims_common.wrappers import _maybe_remove_out_wrapper
+decomposition_table = torch._decomp.decomposition_table
+decomposition_table_for_jvp: dict[torch._ops.OperatorBase, Callable] = {}
+register_decomposition = torch._decomp.register_decomposition
+aten = torch.ops.aten
+# NOTE: [forward-mode AD decompositions mechanism]
+#
+# The mechanism is in VariableType,
+#   IF any inputs have forward grad
+#      AND there is no forward AD formula implemented
+#      AND the functions are actually differentiable
+#   run the decomposition
+#      See run_jit_decomposition_with_args_for_jvp
+#      We currently use python decompositions that we torchscript.
+#
+# Note that we would be building the backward graph at the decomposed level
+# too, but that is OK, because we would've errored out otherwise anyway.
+#
+# TODO: The mechanism we are using to register decompositions doesn't
+# seem to be exclusively used for jvp. So open question here is whether
+# torch/csrc/jit/runtime/decomposition_registry.cpp is being used for other things.
+# If that is the case, we may go down the decomposition path unexpectedly
+# (and possibly produce an unintelligible error) vs erroring out earlier and
+# printing that the forward AD formula is not implemented.
+#
+# The solution to this may be to have an explicit allowlist that controls when
+# to enable the decomposition.
+def maybe_register_decomposition(op):
+    """Build a decorator that registers a decomposition for ``op`` on a best-effort basis.
+    If ``register_decomposition(op)(f)`` raises any ``Exception``, the error is
+    swallowed and the undecorated function ``f`` is returned unchanged.
+    """
+    def decorator(f):
+        try:
+            registered = register_decomposition(op)(f)
+        except Exception:
+            # Registration failed; hand back the plain function instead of propagating.
+            return f
+        return registered
+    return decorator
+# Functions where we need a special decomposition for jvp but there's another version that
+# should be used more generally (ex. for jvp we need to recompute the mean and variance for
+# the backwards of a normalization function. Without jvp, it should use the saved value)
+decomposition_table_for_jvp = {}
+def register_decomposition_for_jvp(fn):
+    # Forwards `fn` to register_decomposition with decomposition_table_for_jvp
+    # passed as the registry and returns whatever register_decomposition returns.
+    # Call sites in this file pass an aten op here and use the result as a decorator.
+    return register_decomposition(fn, registry=decomposition_table_for_jvp)
+def _register_jit_decomposition_for_jvp(decomp, use_python=False):
+    """Look up the decomposition for ``decomp`` and register its graph with ``torch.jit``.
+    ``decomposition_table_for_jvp`` is consulted first, then ``decomposition_table``;
+    a ``RuntimeError`` is raised when neither contains ``decomp``. With
+    ``use_python=True`` the function is marked ``torch.jit.ignore`` and called from
+    a generated wrapper; otherwise it is compiled with ``torch.jit.script``.
+    """
+    for candidate_table in (decomposition_table_for_jvp, decomposition_table):
+        if decomp in candidate_table:
+            decomp_fn = candidate_table[decomp]
+            break
+    else:
+        raise RuntimeError(f"could not find decomposition for {decomp}")
+    # `out_wrapper` extends a decomposition's signature with an `out` parameter.
+    # However jit will use the unwrapped function's signature instead, so we
+    # need to unwrap here to prevent an error.
+    decomp_fn = _maybe_remove_out_wrapper(decomp_fn)
+    if not use_python:
+        graph = torch.jit.script(decomp_fn).graph
+    else:
+        decomp_fn = torch.jit.ignore(decomp_fn)
+        signature = inspect.signature(decomp_fn)
+        # Generate source for a wrapper mirroring the signature, for example:
+        # def wrapped_decomp(x: torch.Tensor, y: int, z: int):
+        #   return decomp_fn(x, y, z)
+        # NOTE(review): the generated source refers to `decomp_fn` by name;
+        # presumably it is resolved from this function's scope, so keep that
+        # local name as is - TODO confirm.
+        declared_params = ", ".join(str(param) for param in signature.parameters.values())
+        forwarded_args = ", ".join(signature.parameters)
+        f_str = f"def wrapped_decomp({declared_params}):\n  return decomp_fn({forwarded_args})\n"
+        graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph
+    torch.jit._register_decomposition(decomp, graph)
+# The only decompositions here are temporary or hacks for the purposes of jvp
+# TODO: do these also belong here?
+@maybe_register_decomposition(aten.trace.default)
+def trace(self: Tensor) -> Tensor:
+    # Trace expressed as the sum of the entries returned by torch.diag(self).
+    return torch.sum(torch.diag(self))
+@maybe_register_decomposition(aten.log_sigmoid_forward.default)
+def log_sigmoid_forward(self: Tensor) -> tuple[Tensor, Tensor]:
+    # Returns (output, buffer) where output = min(0, x) - log1p(exp(-|x|)).
+    min = torch.minimum(self.new_zeros(()), self)
+    z = torch.exp(-torch.abs(self))
+    # For CUDA/XPU inputs the buffer is an empty tensor; otherwise z itself is
+    # returned as the buffer.
+    if self.is_cuda or self.is_xpu:
+        buffer = self.new_zeros((0,))
+    else:
+        buffer = z
+    return min - torch.log1p(z), buffer
+def recompute_mean_var(
+    input: Tensor, rstd: Tensor, inner_dim_indices: list[int], keepdim: bool
+):
+    # Returns (mean, rstd) recomputed from `input` over `inner_dim_indices`.
+    # for most norm decompositions, it will be the same as the core version except for here.
+    # We recompute the mean and variance so that they track gradients through input
+    mean = torch.mean(input, dim=inner_dim_indices, keepdim=keepdim)
+    var = torch.var(input, dim=inner_dim_indices, unbiased=False, keepdim=keepdim)
+    # eps is not passed in, so it is recovered from the supplied rstd as
+    # (1 / rstd)^2 - var and then detached so it is treated as a constant.
+    eps = torch.pow(1 / rstd, 2) - var  # this makes me so sad inside
+    eps = eps.detach()
+    # Rebuild rstd from the freshly computed var plus the recovered eps.
+    rstd = 1 / torch.sqrt(var + eps)
+    return mean, rstd
+@register_decomposition_for_jvp(aten.native_layer_norm_backward)
+def native_layer_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    normalized_shape: list[int],
+    mean: Tensor,
+    rstd: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    output_mask: list[bool],
+) -> tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
+    # jvp-specific layer norm backward; returns (d_input, d_weight, d_bias).
+    # The `mean` argument is not read: mean and rstd are recomputed from `input`
+    # through recompute_mean_var. Outputs that are masked off, or whose
+    # parameter is None, come back as zero tensors rather than None.
+    input_shape = input.shape
+    input_ndim = input.dim()
+    # The trailing len(normalized_shape) dims are normalized ("inner");
+    # everything before `axis` is "outer".
+    axis = input_ndim - len(normalized_shape)
+    inner_dims = input_shape[axis:]
+    outer_dims = input_shape[:axis]
+    inner_dim_indices = list(range(axis, input_ndim))
+    outer_dim_indices = list(range(axis))
+    # N = number of elements in the inner dims, M = number in the outer dims.
+    N = 1
+    for i in inner_dims:
+        N *= i
+    M = 1
+    for i in outer_dims:
+        M *= i
+    # Degenerate (empty) input: return zeros of the matching shapes.
+    if M <= 0 or N <= 0:
+        return (
+            input.new_zeros(input_shape),
+            input.new_zeros(input_shape[axis:]),
+            input.new_zeros(input_shape[axis:]),
+        )
+    mean_, rstd_ = recompute_mean_var(input, rstd, inner_dim_indices, keepdim=True)
+    x_hat = (input - mean_) * rstd_
+    if weight is not None:
+        grad_x_hat = grad_out * weight
+    else:
+        grad_x_hat = grad_out
+    # inner = N * g - sum(g) - x_hat * sum(g * x_hat), with g = grad_x_hat and
+    # sums taken over the inner dims (keepdim=True).
+    a = grad_x_hat * N
+    b = torch.sum(grad_x_hat, inner_dim_indices, True)
+    c1 = torch.mul(grad_x_hat, x_hat)
+    c2 = torch.sum(c1, inner_dim_indices, True)
+    c3 = torch.mul(x_hat, c2)
+    inner = a - b - c3
+    if output_mask[0]:
+        d_input: Optional[Tensor] = (rstd_ / N) * inner
+    else:
+        d_input = torch.zeros_like(input)  # should be None but doesn't work with vjp
+    # Weight gradient: grad_out * x_hat reduced over the outer dims (if any).
+    if output_mask[1] and weight is not None:
+        if len(outer_dim_indices) > 0:
+            d_weight: Optional[Tensor] = torch.sum(
+                grad_out * x_hat, outer_dim_indices, False
+            )
+        else:
+            d_weight = grad_out * x_hat
+    elif weight is not None:
+        d_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
+    else:
+        d_weight = torch.zeros(())  # should be None but doesn't work with vjp
+    # Bias gradient: grad_out reduced over the outer dims (if any).
+    if output_mask[2] and bias is not None:
+        if len(outer_dim_indices) > 0:
+            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False)
+        else:
+            d_bias = grad_out.clone()
+    elif bias is not None:
+        d_bias = torch.zeros_like(bias)  # should be None but doesn't work with vjp
+    else:
+        d_bias = torch.zeros(())  # should be None but doesn't work with vjp
+    return (d_input, d_weight, d_bias)
+def prod(x: list[int]):
+    # Product of all integers in x; returns 1 for an empty list.
+    r = 1
+    for i in x:
+        r *= i
+    return r
+@register_decomposition_for_jvp(aten.native_batch_norm_backward)
+def native_batch_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    weight: Optional[Tensor],
+    running_mean: Optional[Tensor],
+    running_var: Optional[Tensor],
+    save_mean: Optional[Tensor],
+    save_invstd: Optional[Tensor],
+    train: bool,
+    eps: float,
+    output_mask: list[bool],
+) -> tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+    # jvp-specific batch norm backward; returns (grad_input, grad_weight, grad_bias).
+    # Dim 1 is treated as the feature axis. Masked-off outputs are returned as
+    # zero tensors rather than None.
+    input_shape = input.shape
+    input_rank = input.dim()
+    assert input_rank >= 2, "rank of the input must be at least 2"
+    axis = 1
+    # Despite the name, this is the total element count divided by the size of
+    # dim 1, i.e. the number of elements reduced per feature.
+    num_features = prod(input_shape) / input_shape[axis]  # type: ignore[arg-type]
+    mean = save_mean
+    invstd = save_invstd
+    if train:
+        assert save_mean is not None and save_invstd is not None, (
+            "when train=True, save_mean and save_invstd are required"
+        )
+        # Training: recompute mean/invstd from input over every dim except dim 1.
+        reduciton_dims = [0] + list(range(2, input.dim()))
+        assert invstd is not None  # for typing
+        mean, invstd = recompute_mean_var(input, invstd, reduciton_dims, keepdim=False)
+    else:
+        # Eval: use the running statistics.
+        assert running_mean is not None and running_var is not None
+        mean = running_mean
+        invstd = torch.rsqrt(running_var + eps)
+    assert invstd is not None and mean is not None
+    # Shape with 1 everywhere except dim 1, used to broadcast per-feature values.
+    broadcast_mask = [1] * input_rank
+    broadcast_mask[axis] = input_shape[axis]
+    reduction_axes: list[int] = []
+    for i in range(input_rank):
+        if i != axis:
+            reduction_axes.append(i)
+    mean = torch.reshape(mean, broadcast_mask)
+    norm = 1.0 / num_features
+    grad_output_sum = torch.sum(grad_out, reduction_axes)
+    dot_p = torch.sum(grad_out * (input - mean), reduction_axes)
+    grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask)
+    proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd), broadcast_mask)
+    if weight is None:
+        grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0
+    else:
+        grad_scale = torch.reshape(invstd * weight, broadcast_mask)
+    if train:
+        # Training: subtract the projection term and the mean gradient before scaling.
+        proj = (input - mean) * proj_scale
+        grad_input = ((grad_out - proj) - grad_mean) * grad_scale
+    else:
+        grad_input = grad_out * grad_scale
+    if output_mask[1]:
+        grad_weight = dot_p * invstd
+    elif weight is not None:
+        grad_weight = torch.zeros_like(
+            weight
+        )  # should be None but doesn't work with vjp
+    else:
+        grad_weight = torch.zeros(())  # should be None but doesn't work with vjp
+    if output_mask[2]:
+        grad_bias = grad_output_sum
+    else:
+        grad_bias = torch.zeros_like(
+            grad_output_sum
+        )  # should be None but doesn't work with vjp
+    return (grad_input, grad_weight, grad_bias)
+@register_decomposition_for_jvp(aten.batch_norm_backward)
+def batch_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    weight: Tensor,
+    running_mean: Optional[Tensor],
+    running_var: Optional[Tensor],
+    save_mean: Optional[Tensor],
+    save_var: Optional[Tensor],
+    update: bool,
+    eps: float,
+    output_mask: list[bool],
+    reserve: Tensor,
+) -> tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+    # Delegates to native_batch_norm_backward: `save_var` is passed in the
+    # save_invstd position and `update` in the train position. `reserve` is
+    # accepted but not used.
+    return native_batch_norm_backward(
+        grad_out,
+        input,
+        weight,
+        running_mean,
+        running_var,
+        save_mean,
+        save_var,
+        update,
+        eps,
+        output_mask,
+    )
+_register_jit_decomposition_for_jvp(torch.ops.aten.trace.default, use_python=True)
+_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss2d_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten._log_softmax_backward_data.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten._softmax_backward_data.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.log_sigmoid_forward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.native_layer_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.batch_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.miopen_batch_norm_backward.default)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_rng.py ADDED Viewed

	@@ -0,0 +1,266 @@

+# mypy: allow-untyped-decorators
+# mypy: allow-untyped-defs
+import functools
+from collections import defaultdict
+from collections.abc import Callable
+import torch
+import torch._decomp as decomp
+from torch._decomp import get_decompositions
+from torch._ops import OpOverload
+aten = torch.ops.aten
+rng_decompositions: dict[str, dict[OpOverload, Callable]] = defaultdict(dict)
+def register_rng_decomposition(aten_op):
+    # Returns the decorator from decomp.register_decomposition, with
+    # rng_decompositions passed as the registry argument.
+    return decomp.register_decomposition(aten_op, rng_decompositions)
+def throw_on_non_cuda(device):
+    """Raise ``RuntimeError`` stating that RNG functionalization is unsupported for ``device.type``.
+    This function always raises; it never returns normally.
+    """
+    message = (
+        f"You are trying to functionalize a {device.type} RNG operator but {device.type} does not "
+        f"use Philox/counter-based RNG. Therefore, functionalizing a {device.type} RNG operator is "
+        "not supported. We are discussing the possibility of a Philox-based RNG implementation for CPU."
+    )
+    raise RuntimeError(message)
+# TODO - We have to register many more distributions here, and also higher level
+# ops like dropout which have fused implementation and can hide the rand inside.
+@register_rng_decomposition(aten.rand)
+def rand(shape, dtype=None, layout=torch.strided, device=None, pin_memory=False):
+    """Functional ``rand``: draw values via ``rngprims.philox_rand`` using the tracked Philox state.
+    Raises through ``throw_on_non_cuda`` when a non-CUDA ``device`` is given.
+    ``dtype`` falls back to ``torch.float32``; ``layout`` and ``pin_memory`` are not read.
+    """
+    if device and device.type != "cuda":
+        throw_on_non_cuda(device)
+    if not dtype:
+        dtype = torch.float32
+    seed, offset = PhiloxStateTracker.get_state_as_tuple()
+    result, consumed = torch.ops.rngprims.philox_rand(
+        shape, seed, offset, None, device, dtype
+    )
+    # Record how much of the offset this call used so the next op starts after it.
+    PhiloxStateTracker.advance_offset(consumed)
+    return result
+@register_rng_decomposition(aten.rand_like)
+def rand_like(
+    x: torch.Tensor,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=False,
+    memory_format=torch.preserve_format,
+):
+    """Functional ``rand_like``: draw values shaped like ``x`` via ``rngprims.philox_rand``.
+    ``device`` and ``dtype`` fall back to those of ``x``. Raises through
+    ``throw_on_non_cuda`` when the resolved device is not CUDA. ``layout``,
+    ``pin_memory`` and ``memory_format`` are not read.
+    """
+    if not device:
+        device = x.device
+    if device.type != "cuda":
+        throw_on_non_cuda(device)
+    if not dtype:
+        dtype = x.dtype
+    seed, offset = PhiloxStateTracker.get_state_as_tuple()
+    result, consumed = torch.ops.rngprims.philox_rand(
+        x.shape, seed, offset, None, device, dtype
+    )
+    # Record how much of the offset this call used so the next op starts after it.
+    PhiloxStateTracker.advance_offset(consumed)
+    return result
+class PhiloxState:
+    """
+    Holds a Philox RNG state as (seed, offset), where offset = base_offset +
+    relative_offset. seed and base_offset describe the rng state just before
+    tracing starts; relative_offset accumulates the offset consumed at trace
+    time.
+    """
+    def __init__(self) -> None:
+        self.reset()
+    def reset(self):
+        # Empty tensors mark the state as "not set yet"; validate_state checks for this.
+        self.relative_offset = 0
+        self.offset_advanced_alteast_once = False
+        self.seed = torch.tensor(())
+        self.base_offset = torch.tensor(())
+    def validate_state(self):
+        assert not (self.seed.numel() == 0 or self.base_offset.numel() == 0)
+    def _total_offset(self):
+        # Current absolute offset: base plus everything consumed so far.
+        return self.base_offset + self.relative_offset
+    def advance_offset(self, consumed_offset):
+        self.offset_advanced_alteast_once = True
+        # Deliberately out-of-place: builds a new value instead of mutating the old one.
+        self.relative_offset = self.relative_offset + consumed_offset
+    def set_state(self, seed, base_offset, relative_offset=0):
+        self.seed = seed
+        self.base_offset = base_offset
+        self.relative_offset = relative_offset
+    def get_state_as_tuple(self):
+        self.validate_state()
+        return (self.seed, self._total_offset())
+    def get_state_as_tensor(self):
+        # Only needed because we override get_rng_state.
+        self.validate_state()
+        return torch.stack([self.seed, self._total_offset()])
+    def set_state_from_tensor(self, state):
+        # Only needed because we override set_rng_state.
+        new_seed, new_base_offset = torch.unbind(state)
+        self.seed = new_seed
+        self.base_offset = new_base_offset
+        self.relative_offset = 0
+class PhiloxStateTracker:
+    """
+    Singleton class to track the philox rng state during AOT Autograd tracing.
+    For each aot tracing instance, AOT Autograd resets this tracker and keeps
+    track of both forward and backward offsets. At runtime, we only care about
+    the total consumed forward and backward offsets. For dynamic shapes, these
+    offsets are a function of input shapes. Therefore, the AOT generated graphs
+    have additional outputs that compute total consumed forward and backward
+    offsets.
+    """
+    # All state lives on the class itself: these attributes are assigned in the
+    # reset() classmethod, and every accessor below is a classmethod or staticmethod.
+    running_state: PhiloxState
+    fwd_state: PhiloxState
+    bwd_state: PhiloxState
+    def __enter__(self):
+        # Entering the context resets the class-level state.
+        PhiloxStateTracker.reset()
+        return self
+    def __exit__(self, exc_type, exc_cal, exc_tb):
+        # Leaving the context resets the class-level state again.
+        PhiloxStateTracker.reset()
+    @classmethod
+    def reset(cls):
+        cls.running_state = PhiloxState()
+        cls.fwd_state = PhiloxState()
+        cls.bwd_state = PhiloxState()
+    @classmethod
+    def mark_beginning_of_forward(cls):
+        # Tells the tracker to use fwd_state as the running state
+        cls.running_state = cls.fwd_state
+    @classmethod
+    def mark_beginning_of_backward(cls):
+        # Tells the tracker to use bwd_state as the running state
+        cls.running_state = cls.bwd_state
+    @classmethod
+    def record_state(cls, seed, offset, mode):
+        # Records the seed and offset tensors. These tensors are used to invoke
+        # the philox_rand functional primitives.
+        # Note: recording the forward state also switches the running state to
+        # fwd_state; recording the backward state does not switch it.
+        if mode == "forward":
+            cls.fwd_state.set_state(seed, offset)
+            cls.mark_beginning_of_forward()
+        else:
+            assert mode == "backward"
+            cls.bwd_state.set_state(seed, offset)
+    @classmethod
+    def get_state_as_tensor(cls):
+        # The only reason this exists is because we override get_rng_state and
+        # set_rng_state during tracing. get_rng_state expects a tensor output,
+        # so returning a (seed, offset) tuple would upset other parts of the
+        # program like ctx.saved_tensors.
+        # A bad consequence is that if user saves and restores rng state, we
+        # have little bit of ugliness in the generated code, where we first
+        # concat the (seed, offset) to create a tensor for get_rng_state, and
+        # then split it back to get (seed, offset) tuple in set_rng_state.
+        # TODO: Investigate if there is a better way to wrap the tuple in a
+        # false Tensor object, and then desugar it later on.
+        return cls.running_state.get_state_as_tensor()
+    @classmethod
+    def get_state_as_tuple(cls):
+        return cls.running_state.get_state_as_tuple()
+    @classmethod
+    def set_state_from_tensor(cls, x):
+        # This is only needed because we override set_rng_state. Look at the
+        # comment in the get_state_as_tensor method.
+        cls.running_state.set_state_from_tensor(x)
+    @classmethod
+    def advance_offset(cls, consumed_offset):
+        cls.running_state.advance_offset(consumed_offset)
+    @classmethod
+    def get_current_relative_offset(cls):
+        return cls.running_state.relative_offset
+    @staticmethod
+    def multiple_of_4(offset):
+        # torch cuda rng state offset must be a multiple of 4. For inductor, as
+        # we sum up all the numel, the result might not be a multiple of 4. This
+        # method achieves that.
+        # Rounds up to the next multiple of 4 (values already divisible by 4 are unchanged).
+        return (offset + 3) // 4 * 4
+    @classmethod
+    def get_updated_fwd_offset(cls):
+        # Short circuit if no rand ops were observed
+        if not cls.fwd_state.offset_advanced_alteast_once:
+            return cls.fwd_state.base_offset
+        return cls.multiple_of_4(
+            cls.fwd_state.base_offset + cls.fwd_state.relative_offset
+        )
+    @classmethod
+    def get_updated_bwd_offset(cls):
+        # Short circuit if no rand ops were observed
+        if not cls.bwd_state.offset_advanced_alteast_once:
+            return cls.bwd_state.base_offset
+        return cls.multiple_of_4(
+            cls.bwd_state.base_offset + cls.bwd_state.relative_offset
+        )
+# Adding more decompositions which eventually use rand_like inside decomps.
+# Adding these in rng_decompositions ensures the functionalization of rand_like
+# ops used in these decomps. The list is copied from inductor codebase, which
+# uses it for similar purpose.
+#
+# Caution - These decomps do not have same accuracy as that of eager. However,
+# we can't just disable them with a config flag like fallback_random, because
+# for functionalization of rng ops, we have to decompose these ops.
+extra_random_decomps = get_decompositions(
+    [
+        aten.cauchy,
+        aten.cauchy_,
+        aten.exponential,
+        aten.exponential_,
+        aten.geometric,
+        aten.geometric_,
+        aten.native_dropout,
+        aten.normal,
+        aten.normal_,
+        aten.normal_functional,
+        aten.log_normal,
+        aten.log_normal_,
+        aten.rrelu_with_noise,
+        aten.rrelu_with_noise_,
+        aten.uniform_,
+    ]
+)
+# Decorator factory that registers a decomposition into extra_random_decomps
+# (instead of whatever registry decomp.register_decomposition uses by default).
+register_extra_random_decomp = functools.partial(
+    decomp.register_decomposition, registry=extra_random_decomps
+)
+@register_extra_random_decomp([aten.bernoulli_])
+def bernoulli_(self, p=0.5):
+    """In-place Bernoulli sampling expressed through ``torch.rand_like``.
+    Returns ``NotImplemented`` for tensors on the CPU device; otherwise copies
+    the boolean mask ``rand < p`` into ``self`` and returns the result of ``copy_``.
+    """
+    on_cpu = self.device == torch.device("cpu")
+    if on_cpu:
+        return NotImplemented
+    mask = torch.rand_like(self, dtype=torch.float32) < p
+    return self.copy_(mask)
+@register_extra_random_decomp([aten.bernoulli.p])
+def bernoulli_p(self, p=0.5, *, generator=None):
+    """Out-of-place Bernoulli sampling expressed through ``torch.rand_like``.
+    Returns ``NotImplemented`` for tensors on the CPU device. For other devices
+    a ``generator`` is not supported (asserted to be ``None``) and the boolean
+    mask ``rand < p`` is returned.
+    """
+    on_cpu = self.device == torch.device("cpu")
+    if on_cpu:
+        return NotImplemented
+    assert generator is None
+    uniform_samples = torch.rand_like(self, dtype=torch.float32)
+    return uniform_samples < p
+rng_decompositions.update(extra_random_decomps)  # type: ignore[arg-type]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/python.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# mypy: allow-untyped-defs
+import itertools
+import unittest.mock
+from collections.abc import Callable, Iterator
+from contextlib import contextmanager
+from typing import TypeVar, Union
+from typing_extensions import ParamSpec
+import torch
+import torch._C
+import torch._ops
+import torch.utils._python_dispatch
+import torch.utils._pytree as pytree
+from torch._C import DispatchKey
+__all__ = ["enable_python_dispatcher", "no_python_dispatcher", "enable_pre_dispatch"]
+no_python_dispatcher = torch._C._DisablePythonDispatcher
+enable_python_dispatcher = torch._C._EnablePythonDispatcher
+enable_pre_dispatch = torch._C._EnablePreDispatch
+CROSSREF_FUNCTIONALIZE = False
+_P = ParamSpec("_P")
+_T = TypeVar("_T")
+def all_py_loaded_overloads() -> Iterator[torch._ops.OpOverload]:
+    """
+    Yield every overload object that is currently reachable by iterating
+    torch.ops, its namespaces, and their overload packets.
+    Warning: the set of overloads this reports is very subtle.  It is precisely
+    the set of torch.ops functions that have actually been accessed from Python
+    (e.g., we actually called torch.ops.aten.blah at some point).  This is
+    DIFFERENT from the set of registered operators, which will in general be a
+    larger set, as that would include all operators which we ran C++ static
+    initializers or Python operator registration on.  This does not eagerly
+    populate the list on torch.ops.aten; this list is lazy!
+    In other words, this is good for traversing over everything that has an
+    OpOverload object allocated in Python.  We use it for cache invalidation, but
+    don't rely on this list being complete.
+    Note that even if we did report all C++ registered overloads, this isn't
+    guaranteed to be complete either, as a subsequent lazy load of a library
+    which triggers more registrations could add more things to the set.
+    """
+    for namespace_name in torch.ops:
+        namespace = getattr(torch.ops, namespace_name)
+        for packet_name in namespace:
+            packet = getattr(namespace, packet_name)
+            yield from (getattr(packet, overload_name) for overload_name in packet)
+@contextmanager
+def suspend_functionalization():
+    # Context manager: if the Functionalize dispatch key is currently included
+    # in TLS, disable functionalization for the duration of the block and
+    # re-enable it on exit with the reapply_views setting captured up front.
+    # If the key is not included, this is a no-op.
+    f_tls = torch._C._dispatch_tls_is_dispatch_key_included(
+        torch._C.DispatchKey.Functionalize
+    )
+    f_rv = torch._C._functionalization_reapply_views_tls()
+    if f_tls:
+        torch._disable_functionalization()
+    try:
+        yield
+    finally:
+        # Restore even when the body raised.
+        if f_tls:
+            torch._enable_functionalization(reapply_views=f_rv)
+def check_tensor_metadata_matches(nv, rv, desc):
+    # Asserts that nv and rv agree on size, dtype and significant strides.
+    # `desc` must be a zero-argument callable; it is only invoked to build the
+    # message when one of the assertions fails.
+    assert callable(desc)
+    assert nv.size() == rv.size(), f"{desc()}: sizes {nv.size()} != {rv.size()}"
+    assert nv.dtype == rv.dtype, f"{desc()}: dtype {nv.dtype} != {rv.dtype}"
+    same_strides, idx = torch._prims_common.check_significant_strides(
+        nv, rv, only_cuda=False
+    )
+    assert same_strides, (
+        f"{desc()}: strides {nv.stride()} != {rv.stride()} (mismatch at index {idx})"
+    )
+def check_metadata_matches(n, r, desc):
+    """Flatten ``n`` and ``r`` as pytrees and compare tensor metadata leaf by leaf.
+    Asserts that both flatten to the same number of leaves, then calls
+    ``check_tensor_metadata_matches`` for each position where the leaf of ``r``
+    is a tensor. ``desc`` must be a zero-argument callable used in failure messages.
+    """
+    assert callable(desc)
+    n_leaves, _ = pytree.tree_flatten(n)
+    r_leaves, _ = pytree.tree_flatten(r)
+    # TODO: test the specs match; empirically  sometimes we have a tuple
+    # on one side and a list on the other
+    assert len(n_leaves) == len(r_leaves), f"{len(n_leaves)} != {len(r_leaves)}"
+    for i, (nv, rv) in enumerate(zip(n_leaves, r_leaves)):
+        if isinstance(rv, torch.Tensor):
+            check_tensor_metadata_matches(nv, rv, lambda: f"{desc()} output {i}")
+class Lit:
+    # Wrapper whose repr() is exactly the wrapped string `s` (no quotes added).
+    def __init__(self, s):
+        self.s = s
+    def __repr__(self):
+        return self.s
+def _fmt(a: object) -> object:
+    """Return a printable stand-in for ``a``.
+    Tensors become a ``Lit`` whose repr is a ``torch.empty_strided(...)`` call
+    with the tensor's size, stride and dtype; any other value is returned as is.
+    """
+    if not isinstance(a, torch.Tensor):
+        return a
+    sizes = tuple(a.size())
+    return Lit(f"torch.empty_strided({sizes}, {a.stride()}, dtype={a.dtype})")
+def make_crossref_functionalize(
+    op: torch._ops.OpOverload[_P, _T], final_key: DispatchKey
+) -> Union[Callable[_P, _T], DispatchKey]:
+    # Builds a handler for `op` that runs the op twice - once on fake tensors
+    # and once for real via `op._op_dk(final_key, ...)` - and asserts that the
+    # output metadata of the two runs match. The real result is returned.
+    # For aten.lift_fresh.default no handler is built; `final_key` is returned.
+    from torch._subclasses.fake_tensor import FakeTensorMode
+    # This case is pretty weird, suppress it for now
+    if op is torch.ops.aten.lift_fresh.default:
+        return final_key
+    def handler(*args: _P.args, **kwargs: _P.kwargs) -> _T:
+        fake_mode = FakeTensorMode()
+        def fakeify_defun(t):
+            # Tensors: unwrap functional tensors, then convert to a fake tensor.
+            # Non-tensors pass through unchanged.
+            if isinstance(t, torch.Tensor):
+                if torch._is_functional_tensor(t):
+                    r = torch._from_functional_tensor(t)
+                    # NB: This assumes that the inner tensor sizes/strides match
+                    # the outer tensor sizes/strides.  This doesn't necessarily have to
+                    # be the case, see discussion at
+                    # https://github.com/pytorch/pytorch/pull/87610/files/401ddeda1d769bedc88a12de332c7357b60e51a4#r1007264456
+                    assert t.size() == r.size()
+                    assert t.stride() == r.stride()
+                else:
+                    r = t
+                # TODO: suppress guards
+                return fake_mode.from_tensor(r)
+            return t
+        def maybe_detach(t):
+            if isinstance(t, torch.Tensor):
+                return t.detach()
+            else:
+                return t
+        # TODO: This probably does the wrong thing if you're running other
+        # substantive modes with the normal op outside here
+        with (
+            torch.utils._python_dispatch._disable_current_modes(),
+            suspend_functionalization(),
+        ):
+            f_args, f_kwargs = pytree.tree_map(fakeify_defun, (args, kwargs))
+            # Detached copies of the fake inputs, taken before the op runs;
+            # they are only used by desc() to format the failure message.
+            orig_f_args, orig_f_kwargs = pytree.tree_map(
+                maybe_detach, (f_args, f_kwargs)
+            )
+            with fake_mode:
+                f_r = op(*f_args, **f_kwargs)  # pyrefly: ignore [invalid-param-spec]
+        # Real run, outside the block that disabled modes/functionalization.
+        r = op._op_dk(final_key, *args, **kwargs)
+        def desc():
+            # Renders the call as "op(arg, ..., k=v, ...)" with tensors shown
+            # through _fmt.
+            fmt_args = ", ".join(
+                itertools.chain(
+                    (repr(pytree.tree_map(_fmt, a)) for a in orig_f_args),
+                    (
+                        f"{k}={pytree.tree_map(_fmt, v)}"
+                        for k, v in orig_f_kwargs.items()
+                    ),
+                )
+            )
+            return f"{op}({fmt_args})"
+        check_metadata_matches(f_r, r, desc)
+        return r
+    return handler
+# NB: enabling this is slow, don't do it in a hot loop.  This is purely
+# for debugging purposes.
+@contextmanager
+def enable_crossref_functionalize():
+    # Context manager that enables the Python dispatcher and patches
+    # torch._dispatch.python.CROSSREF_FUNCTIONALIZE to True for the duration
+    # of the block. The cached Functionalize dispatch entry of every overload
+    # returned by all_py_loaded_overloads() is dropped both on entry and on exit.
+    for op in all_py_loaded_overloads():
+        op._uncache_dispatch(torch._C.DispatchKey.Functionalize)
+    try:
+        with (
+            enable_python_dispatcher(),
+            unittest.mock.patch("torch._dispatch.python.CROSSREF_FUNCTIONALIZE", True),
+        ):
+            yield
+    finally:
+        for op in all_py_loaded_overloads():
+            op._uncache_dispatch(torch._C.DispatchKey.Functionalize)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/config.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""
+Configuration module for torch.export.export.
+This module contains various configuration flags and settings that control torch.export's
+behavior, including:
+- Runtime behavior flags
+- Debugging and development options
+"""
+import sys
+from typing import Any, TYPE_CHECKING
+from torch._environment import is_fbcode
+from torch.utils._config_module import install_config_module
+# this flag controls whether we use new functional tracer. It
+# should be True in the long term.
+use_new_tracer_experimental = True
+# this flag is used to control whether we want to instrument
+# fake tensor creation to track potential leaks. It is off
+# by default, but user can turn it on to debug leaks.
+detect_non_strict_fake_tensor_leaks = False
+# error on potentially pre-dispatch/non-strict tracing limitation
+# this type of error usually happens when we encounter an op
+# that we don't know how to proxy, resulting in untracked fake tensors
+error_on_lifted_constant_tensors = True
+# enable auto_functionalized_v2 in export
+# We turn this off in fbcode due to downstream users not
+# being ready to handle auto_functionalized_v2.
+enable_auto_functionalized_v2_for_export = not is_fbcode()
+use_legacy_dynamo_graph_capture = True
+if TYPE_CHECKING:
+    from torch.utils._config_typing import *  # noqa: F401, F403
+    def _make_closure_patcher(**changes: Any) -> Any: ...
+install_config_module(sys.modules[__name__])

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/error.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from enum import Enum
+class ExportErrorType(Enum):
+    """Error categories passed to ExportError as its error code."""
+    # User providing invalid inputs to either tracer, or other public facing APIs
+    INVALID_INPUT_TYPE = 1
+    # User returning values from their models that we don't support.
+    INVALID_OUTPUT_TYPE = 2
+    # Generated IR does not conform to Export IR Specification.
+    VIOLATION_OF_SPEC = 3
+    # User's code contains types and functionalities we don't support.
+    NOT_SUPPORTED = 4
+    # User's code didn't provide necessary details for us to successfully trace and export.
+    # For example, we use a lot of decorators and ask users to annotate their model.
+    MISSING_PROPERTY = 5
+    # User is using an API without proper initialization step.
+    UNINITIALIZED = 6
+def internal_assert(pred: bool, assert_msg: str) -> None:
+    """
+    This is exir's custom assert method. It internally just throws InternalError.
+    Note that the sole purpose is to throw our own error while maintaining similar syntax
+    as python assert.
+    """
+    if not pred:
+        raise InternalError(assert_msg)
+class InternalError(Exception):
+    """
+    Raised when an internal invariant is violated in the EXIR stack.
+    Should hint users to report a bug to dev and expose the original
+    error message.
+    """
+    # The constructor adds no state of its own; it only forwards ``message``
+    # to ``Exception``.
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
+class ExportError(Exception):
+    """
+    This type of exception is raised for errors that are directly caused by the user
+    code. In general, user errors happen during model authoring, tracing, using our public
+    facing APIs, and writing graph passes.
+    """
+    def __init__(self, error_code: ExportErrorType, message: str) -> None:
+        prefix = f"[{error_code}]: "
+        super().__init__(prefix + message)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/verifier.py ADDED Viewed

	@@ -0,0 +1,531 @@

+# mypy: allow-untyped-defs
+import inspect
+import math
+import operator
+from collections.abc import Iterable
+from typing import Any, final, TYPE_CHECKING
+import torch
+from torch._library.opaque_object import is_opaque_type
+from torch._ops import HigherOrderOperator, OpOverload
+from torch._subclasses.fake_tensor import FakeTensor
+from torch.export.graph_signature import (
+    CustomObjArgument,
+    InputKind,
+    SymBoolArgument,
+    SymFloatArgument,
+    SymIntArgument,
+    TensorArgument,
+    TokenArgument,
+)
+from torch.fx import GraphModule
+if TYPE_CHECKING:
+    from torch.export.exported_program import ExportedProgram
+class SpecViolationError(Exception):
+    pass
+def is_functional(op: OpOverload) -> bool:
+    return not op._schema.is_mutable
+def _check_has_fake_tensor(node: torch.fx.Node) -> None:
+    # TODO(angelayi): remove this in favor of _check_val
+    return _check_val(node)
+def _check_val(node: torch.fx.Node) -> None:
+    from torch.fx.experimental.symbolic_shapes import SymBool, SymFloat, SymInt
+    def _check_correct_val(val):
+        if val is None:
+            return True
+        elif isinstance(val, (int, bool, str, float)):
+            return True
+        elif isinstance(
+            val, (torch.memory_format, torch.dtype, torch.device, torch.layout)
+        ):
+            return True
+        elif isinstance(
+            val, (FakeTensor, torch.Tensor)
+        ):  # TODO(zhxchen17) Remove Tensor.
+            return True
+        elif isinstance(val, (SymInt, SymFloat, SymBool)):
+            return True
+        elif isinstance(val, CustomObjArgument):
+            return True
+        elif isinstance(val, Iterable):
+            return all(_check_correct_val(x) for x in val)
+        elif is_opaque_type(type(val)):
+            return True
+        return False
+    def _no_returns(op):
+        if not isinstance(op, OpOverload):
+            return False
+        return len(op._schema.returns) == 0
+    if "val" not in node.meta:
+        if node.op == "call_function" and _no_returns(node.target):
+            return
+        raise SpecViolationError(f"Node.meta {node.name} is missing val field.")
+    val = node.meta["val"]
+    if not _check_correct_val(val):
+        raise SpecViolationError(f"Node.meta {node.name} has invalid val field {val}")
+def _check_torch_fn(node: torch.fx.Node) -> None:
+    torch_fn = node.meta.get("torch_fn")
+    if torch_fn is None:
+        raise SpecViolationError(
+            f"Unable to find torch_fn metadata for node {node.name}"
+        )
+    if (
+        not isinstance(torch_fn, tuple)
+        and isinstance(torch_fn[0], str)
+        and isinstance(torch_fn[1], str)
+    ):
+        raise SpecViolationError(
+            f"Node.meta {node.name} has invalid torch_fn field {torch_fn}"
+        )
+class _VerifierMeta(type):
+    """
+    Metaclass that validates verifier class definitions and records every class it
+    builds in ``_registry`` under that class's ``dialect`` string.
+    """
+    # Maps dialect name -> verifier class; shared by all classes built by this metaclass.
+    _registry: dict[str, type["Verifier"]] = {}
+    def __new__(metacls, name, bases, attrs):
+        if bases:
+            # Subclass: must not redefine the two entry points and must declare its
+            # own, non-"ATEN" dialect directly in the class body.
+            if "check" in attrs or "_check_graph_module" in attrs:
+                raise SyntaxError("Overriding method check is not allowed.")
+            assert "dialect" in attrs and attrs["dialect"] != "ATEN"
+        else:
+            # Root class (no bases): must define both entry points and use "ATEN".
+            assert "check" in attrs
+            assert "_check_graph_module" in attrs
+            assert attrs["dialect"] == "ATEN"
+        assert isinstance(attrs["dialect"], str)
+        ret = type.__new__(metacls, name, bases, attrs)
+        # A later class declaring the same dialect replaces the earlier entry.
+        metacls._registry[attrs["dialect"]] = ret  # type: ignore[assignment]
+        return ret
+def getattr_recursive(obj: Any, target: str) -> Any:
+    target_atoms = target.split(".")
+    attr_itr = obj
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}"
+            )
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+class Verifier(metaclass=_VerifierMeta):
+    """
+    Base verifier, registered under the "ATEN" dialect.
+    ``check`` and ``_check_graph_module`` are the fixed entry points (marked ``@final``
+    and protected by the metaclass); the ``allowed_*`` methods, ``check_valid_op`` and
+    ``check_additional`` are the hooks consulted by ``_check_graph_module``.
+    """
+    dialect = "ATEN"
+    def allowed_builtin_ops(self) -> list:
+        """Return the builtin callables accepted as ``call_function`` targets."""
+        return [
+            operator.getitem,
+            operator.add,
+            operator.mul,
+            operator.sub,
+            operator.truediv,
+            operator.ge,
+            operator.le,
+            operator.gt,
+            operator.lt,
+            operator.eq,
+            operator.ne,
+            operator.floordiv,
+            operator.mod,
+            operator.and_,
+            operator.or_,
+            operator.not_,
+            operator.pow,
+            operator.neg,
+            operator.abs,
+            operator.lshift,
+            operator.rshift,
+            math.ceil,
+            math.floor,
+            math.trunc,
+            round,
+        ]
+    def allowed_op_types(self) -> tuple[type[Any], ...]:
+        """Return the operator classes accepted as ``call_function`` targets."""
+        return (OpOverload, HigherOrderOperator)
+    def allowed_getattr_types(self) -> tuple[type[Any], ...]:
+        """Return the attribute types a ``get_attr`` node may reference in the top-level module."""
+        return (torch.fx.GraphModule, torch.utils._pytree.TreeSpec)
+    def allowed_getattr_types_for_subgm(self) -> tuple[type[Any], ...]:
+        """Return the attribute types a ``get_attr`` node may reference in a nested graph module."""
+        # A subgm passed as a HOP argument can have getattr(weight) nodes, and is thus stateful
+        return (
+            torch.fx.GraphModule,
+            torch.nn.parameter.Parameter,
+            torch.Tensor,  # for buffer and constant tensor
+            torch.utils._pytree.TreeSpec,
+        )
+    def check_valid_op(self, op):
+        """Hook called for every ``call_function`` target after the built-in checks; no-op here."""
+        pass
+    def check_additional(self, gm: GraphModule) -> None:
+        """
+        Additional checks that are specific to some dialects.
+        """
+    @final
+    def check(self, ep: "ExportedProgram") -> None:
+        """Verify the program's graph module, then its module call graph, then its signature."""
+        self._check_graph_module(ep.graph_module)
+        _verify_exported_program_module_call_graph(ep)
+        _verify_exported_program_signature(ep)
+    @final
+    def _check_graph_module(self, gm: torch.fx.GraphModule) -> None:
+        """
+        Lint and validate every ``torch.fx.GraphModule`` reachable through ``gm.modules()``.
+        Raises ``SpecViolationError`` on the first offending node; calls
+        ``check_additional(gm)`` once all nodes passed.
+        """
+        def _allowed_getattr_types(is_toplevel_gm) -> tuple[type[Any], ...]:
+            if is_toplevel_gm:
+                ret = self.allowed_getattr_types()
+            else:
+                ret = self.allowed_getattr_types_for_subgm()
+            # ``object`` would make the isinstance check below accept everything.
+            assert not any(t is object for t in ret)
+            return ret
+        def _check_valid_op(op) -> None:
+            def _allowed_builtin_ops() -> list:
+                ret = self.allowed_builtin_ops()
+                # The generator's ``op`` is scoped to the generator expression and
+                # does not rebind the enclosing ``op`` parameter.
+                assert all(inspect.isbuiltin(op) for op in ret)
+                return ret
+            def _allowed_op_types() -> tuple[type[Any], ...]:
+                ret = self.allowed_op_types()
+                assert not any(t is object for t in ret)
+                return ret
+            # TODO Remove this allowlist.
+            _allowed_torch_functions = (
+                torch.autograd.grad_mode.set_grad_enabled,
+                torch.sym_int,
+                torch.sym_float,
+                torch.sym_ite,
+                torch.sym_max,
+                torch.sym_min,
+                torch.sym_not,
+                torch.sym_sqrt,
+                torch.sym_sum,
+                torch.export.custom_ops._call_custom_autograd_function_in_pre_dispatch,
+                # TODO (tmanlaibaatar)
+                # Predispatch export is able to contain autograd ops.
+                # These will be modeled as HOO later
+                torch._C._set_grad_enabled,
+                torch.amp.autocast_mode._enter_autocast,
+                torch.amp.autocast_mode._exit_autocast,
+                torch.fx.experimental.symbolic_shapes.cast_symbool_to_symint_guardless,
+                torch._functorch.predispatch._add_batch_dim,
+                torch._functorch.predispatch._remove_batch_dim,
+                torch._functorch.predispatch._vmap_increment_nesting,
+                torch._functorch.predispatch._vmap_decrement_nesting,
+                torch._functorch.predispatch.lazy_load_decompositions,
+            )
+            # A target is accepted if it is an allowed operator type, an allowed
+            # builtin, or one of the allowlisted torch functions above.
+            if not isinstance(op, _allowed_op_types()):
+                if (
+                    op not in _allowed_builtin_ops()
+                    and op not in _allowed_torch_functions
+                ):
+                    # NOTE(review): there is no newline between the "Valid builtin ops"
+                    # and "Valid torch functions" parts, so they run together in the message.
+                    raise SpecViolationError(
+                        f"Operator '{op}' is not an allowed operator type: {_allowed_op_types()}\n"
+                        f"Valid builtin ops: {_allowed_builtin_ops()}"
+                        f"Valid torch functions: {_allowed_torch_functions}"
+                    )
+            if isinstance(op, OpOverload):
+                # All ops functional
+                # TODO (tmanlaibaatar) more proper way is needed here
+                if self.dialect != "TRAINING" and not is_functional(op):
+                    raise SpecViolationError(f"operator '{op}' is not functional")
+            self.check_valid_op(op)
+        for mod in gm.modules():
+            is_toplevel_gm = mod is gm
+            # Plain (non-fx) submodules are not inspected.
+            if not isinstance(mod, torch.fx.GraphModule):
+                continue
+            mod.graph.lint()
+            for node in mod.graph.nodes:
+                # TODO(T140410192): should have fake tensor for all dialects
+                if node.op in {"call_module", "call_method"}:
+                    # NOTE(review): the message mentions only call_module although this
+                    # branch rejects call_method nodes as well. The trailing comma is
+                    # inside the call's argument list and does not create a tuple.
+                    raise SpecViolationError(
+                        f"call_module is not valid: got a class '{node.target}' ",
+                    )
+                elif node.op == "call_function":
+                    _check_val(node)
+                    _check_valid_op(node.target)
+                elif node.op == "get_attr":
+                    if not isinstance(node.target, str):
+                        raise SpecViolationError(
+                            f"Expected get_attr target to be string, but got {type(node.target)}"
+                        )
+                    attr = getattr_recursive(mod, node.target)
+                    if isinstance(attr, torch.nn.Module):
+                        # These module kinds are recognized by class name only.
+                        def _is_type(name, ty):
+                            return isinstance(getattr(attr, name, None), ty)
+                        if type(attr).__name__ == "LoweredBackendModule":
+                            # A LoweredBackendModule either matches one of the two
+                            # accepted shapes below or is rejected outright.
+                            if (
+                                _is_type("backend_id", str)
+                                and hasattr(attr, "original_module")
+                                and hasattr(attr, "module_name")
+                                and getattr(attr, "backend_id", None) == "aoti"
+                            ):
+                                continue
+                            if (
+                                _is_type("backend_id", str)
+                                and _is_type("processed_bytes", bytes)
+                                and _is_type("compile_specs", list)
+                                and hasattr(attr, "original_module")
+                            ):
+                                continue
+                            else:
+                                backend_id = getattr(attr, "backend_id", None)
+                                processed_bytes = getattr(attr, "processed_bytes", None)
+                                compile_specs = getattr(attr, "compile_specs", None)
+                                raise SpecViolationError(
+                                    f"Invalid get_attr type {type(attr)}. \n"
+                                    f"LoweredBackendModule fields: "
+                                    f"backend_id(str) : {type(backend_id)}, "
+                                    f"processed_bytes(bytes) : {type(processed_bytes)}, "
+                                    f"compile_specs(list) : {type(compile_specs)}"
+                                )
+                        elif type(attr).__name__ == "AOTInductorEPModule":
+                            continue
+                        elif type(attr).__name__ == "AOTInductorRunnerWrapper":
+                            continue
+                    # Everything else must be one of the allowed attribute types for
+                    # this nesting level.
+                    if not isinstance(attr, _allowed_getattr_types(is_toplevel_gm)):
+                        raise SpecViolationError(
+                            f"Invalid get_attr type {type(attr)} on target {node.target}. \n"
+                            f"Valid get_attr types: {_allowed_getattr_types(is_toplevel_gm)}"
+                        )
+                elif node.op == "placeholder":
+                    _check_val(node)
+                # TODO(zhxchen17)
+                # elif node.op == "output":
+                #     _check_flattened_outputs()
+        self.check_additional(gm)
+# Verifier registered under the "TRAINING" dialect. The base class skips its
+# "operator is not functional" check when the dialect is "TRAINING".
+class TrainingIRVerifier(Verifier):
+    dialect = "TRAINING"
+def _verify_exported_program_module_call_graph(exported_program) -> None:
+    module_call_graph = exported_program.module_call_graph
+    nodes = {node.name for node in exported_program.graph.nodes}
+    for entry in module_call_graph:
+        if entry.signature is not None:
+            for arg in entry.signature.inputs:
+                if arg.name and arg.name not in nodes:
+                    raise SpecViolationError(
+                        f"Input {arg.name} does not exist in the graph."
+                    )
+            for arg in entry.signature.outputs:
+                if arg.name and arg.name not in nodes:
+                    raise SpecViolationError(
+                        f"Output {arg.name} does not exist in the graph."
+                    )
+def _verify_exported_program_signature(exported_program) -> None:
+    """
+    Cross-check ``exported_program.graph_signature`` against the graph, the state dict
+    and the constants table.
+    Inputs: the placeholder nodes must match the signature's input specs in number and
+    (for tensor/symbolic arguments) in name, and each non-user input must satisfy the
+    requirements of its ``InputKind``.
+    Outputs: the output node's arguments must match the output specs in number; the
+    mutation outputs must map to known buffers/parameters/user inputs and the user
+    outputs must appear in the signature's order.
+    Raises ``SpecViolationError`` on the first mismatch.
+    """
+    # Check ExportedProgram signature matches
+    gs = exported_program.graph_signature
+    # Check every node in the signature exists in the graph
+    input_node_names = [
+        node.name for node in exported_program.graph.nodes if node.op == "placeholder"
+    ]
+    if len(input_node_names) != len(gs.input_specs):
+        raise SpecViolationError(
+            f"Number of graph inputs ({len(input_node_names)}) "
+            f"does not match number of inputs in the graph signature ({len(gs.input_specs)})"
+        )
+    # Specs and placeholders are paired positionally; ``node`` here is a node *name*.
+    # NOTE(review): several messages below read ``input_spec.name``, while every other
+    # access in this function uses ``arg``/``kind``/``target``/``persistent`` — confirm
+    # the spec type actually exposes ``name``, otherwise building the message would fail.
+    for input_spec, node in zip(gs.input_specs, input_node_names):
+        if isinstance(
+            input_spec.arg,
+            (TensorArgument, SymIntArgument, SymFloatArgument, SymBoolArgument),
+        ):
+            if input_spec.arg.name != node:
+                raise SpecViolationError(
+                    f"Input spec name {input_spec.arg.name} does not match node name {node}"
+                )
+        if input_spec.kind == InputKind.USER_INPUT:
+            continue
+        elif input_spec.kind == InputKind.PARAMETER:
+            # Parameters: tensor argument, has a target, target is an nn.Parameter
+            # stored in the state dict.
+            if not isinstance(input_spec.arg, TensorArgument):
+                raise SpecViolationError(
+                    f"Parameter {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
+                )
+            if input_spec.target is None:
+                raise SpecViolationError(
+                    f"InputSpec for {input_spec.name} has no target."
+                )
+            param = input_spec.target
+            if param not in exported_program.state_dict:
+                raise SpecViolationError(f"Parameter {param} is not in the state dict.")
+            if not isinstance(exported_program.state_dict[param], torch.nn.Parameter):
+                raise SpecViolationError(
+                    f"State dict entry for parameter {param} is not an instance of torch.nn.Parameter."
+                )
+        elif input_spec.kind == InputKind.BUFFER:
+            # Buffers: tensor argument, has a target and an explicit persistence flag;
+            # persistent buffers must be in the state dict, non-persistent ones must not.
+            if not isinstance(input_spec.arg, TensorArgument):
+                raise SpecViolationError(
+                    f"Buffer {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
+                )
+            if input_spec.target is None:
+                raise SpecViolationError(
+                    f"InputSpec for {input_spec.name} has no target."
+                )
+            buffer = input_spec.target
+            if input_spec.persistent is None:
+                raise SpecViolationError(
+                    f"Buffer {buffer} is missing a persistence flag"
+                )
+            if (
+                input_spec.persistent is True
+                and buffer not in exported_program.state_dict
+            ):
+                raise SpecViolationError(f"Buffer {buffer} is not in the state dict.")
+            if input_spec.persistent is False and buffer in exported_program.state_dict:
+                raise SpecViolationError(
+                    f"Non-persistent buffer {buffer} is in the state dict, it should not be."
+                )
+        elif input_spec.kind == InputKind.CONSTANT_TENSOR:
+            # Constant tensors: tensor argument whose target is a key of ``constants``.
+            if not isinstance(input_spec.arg, TensorArgument):
+                raise SpecViolationError(
+                    f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
+                )
+            if input_spec.target is None:
+                raise SpecViolationError(
+                    f"InputSpec for {input_spec.name} has no target."
+                )
+            tensor_const = input_spec.target
+            if tensor_const not in exported_program.constants:
+                raise SpecViolationError(
+                    f"Constant tensor {tensor_const} is not in the constants dictionary."
+                )
+        elif input_spec.kind == InputKind.CUSTOM_OBJ:
+            # Custom objects: custom-object argument whose target is a key of ``constants``.
+            if not isinstance(input_spec.arg, CustomObjArgument):
+                raise SpecViolationError(
+                    f"Custom object {input_spec.name} is not a custom object argument. Found {input_spec.arg} instead."
+                )
+            if input_spec.target is None:
+                raise SpecViolationError(
+                    f"InputSpec for {input_spec.name} has no target."
+                )
+            custom_obj = input_spec.target
+            if custom_obj not in exported_program.constants:
+                raise SpecViolationError(
+                    f"Custom object {custom_obj} is not in the constants dictionary."
+                )
+        elif input_spec.kind == InputKind.TOKEN:
+            # NOTE(review): the message talks about a "Constant tensor"/"tensor argument"
+            # although this branch checks for a TokenArgument.
+            if not isinstance(input_spec.arg, TokenArgument):
+                raise SpecViolationError(
+                    f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
+                )
+        else:
+            raise SpecViolationError(f"Unknown InputKind {input_spec.kind}.")
+    # Check outputs
+    # The output node is taken to be the last node of the graph.
+    output_node = list(exported_program.graph.nodes)[-1]
+    assert output_node.op == "output"
+    # Node arguments are reduced to their names; non-node arguments are kept as-is.
+    output_nodes = [
+        arg.name if isinstance(arg, torch.fx.Node) else arg
+        for arg in output_node.args[0]
+    ]
+    if len(output_nodes) != len(gs.output_specs):
+        raise SpecViolationError(
+            f"Number of output nodes {len(output_nodes)} is different "
+            "Than the number of outputs specified by the graph signature: \n"
+            f"Number of mutated buffers: {len(gs.buffers_to_mutate)}. \n"
+            f"Number of user outputs: {len(gs.user_outputs)}. \n"
+        )
+    # The slices below treat the outputs as laid out in this order:
+    # [tokens][buffer/parameter/user-input mutations][user outputs].
+    num_tokens = len(gs.output_tokens)
+    end = (
+        len(gs.buffers_to_mutate)
+        + len(gs.parameters_to_mutate)
+        + len(gs.user_inputs_to_mutate)
+        + num_tokens
+    )
+    mutate_nodes: list[str] = output_nodes[num_tokens:end]
+    user_output_nodes = output_nodes[end : end + len(gs.user_outputs)]
+    # Every mutation output must be registered in one of the three mutation maps and
+    # the mapped target must be a known buffer / parameter / user input.
+    for mutation_node in mutate_nodes:
+        if mutation_node in gs.buffers_to_mutate:
+            if gs.buffers_to_mutate[mutation_node] not in gs.buffers:
+                raise SpecViolationError(
+                    f"Buffer output {mutation_node} does not point to a buffer that exists. \n"
+                    f"Dict of buffers that are mutated, in order: {gs.buffers_to_mutate} \n"
+                    f"Buffer nodes available: {gs.buffers} \n"
+                )
+        elif mutation_node in gs.parameters_to_mutate:
+            if gs.parameters_to_mutate[mutation_node] not in gs.parameters:
+                raise SpecViolationError(
+                    f"Parameter output {mutation_node} does not point to a parameter that exists. \n"
+                    f"Dict of parameters that are mutated, in order: {gs.parameters_to_mutate} \n"
+                    f"Parameter nodes available: {gs.parameters} \n"
+                )
+        elif mutation_node in gs.user_inputs_to_mutate:
+            if gs.user_inputs_to_mutate[mutation_node] not in gs.user_inputs:
+                raise SpecViolationError(
+                    f"User input output {mutation_node} does not point to a user input that exists. \n"
+                    f"Dict of user inputs that are mutated, in order: {gs.user_inputs_to_mutate} \n"
+                    f"User input nodes available: {gs.user_inputs} \n"
+                )
+        else:
+            # NOTE(review): parameters are also checked above, but this message lists
+            # only buffers and user inputs.
+            raise SpecViolationError(
+                f"Mutation node {mutation_node} is neither a buffer nor a user input. "
+                f"Buffers to mutate: {gs.buffers_to_mutate}, User inputs to mutate: {gs.user_inputs_to_mutate}"
+            )
+    # User outputs must match the signature's list element by element, in order.
+    for user_output_node, user_output_name in zip(user_output_nodes, gs.user_outputs):
+        if user_output_node != user_output_name:
+            raise SpecViolationError(
+                f"User output {user_output_node} is not in the correct "
+                "order or is not found in the "
+                f"exported program's user_output list: {gs.user_outputs}. "
+            )
+def load_verifier(dialect: str) -> type[Verifier]:
+    if dialect == "ATEN" or dialect == "":
+        return _VerifierMeta._registry.get(dialect, Verifier)
+    return _VerifierMeta._registry[dialect]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__autotune_main__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import argparse
+import logging
+import os
+from torch._inductor.autotune_process import TuningProcess
+from torch._inductor.compile_worker.utils import _async_compile_initializer
+log = logging.getLogger(__name__)
+def main() -> None:
+    """
+    Entry point of the autotune subprocess.
+    Parses ``--parent``, ``--read-fd`` and ``--write-fd`` from the command line, wraps
+    the two file descriptors as binary pipes and hands them to
+    ``TuningProcess.process_main``. Any ``Exception`` raised while running is logged
+    instead of propagated, and both pipes are closed on the way out.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--parent", type=int)
+    parser.add_argument("--read-fd", type=int)
+    parser.add_argument("--write-fd", type=int)
+    args = parser.parse_args()
+    # NOTE(review): both pipes are opened before the ``try`` block, so if the second
+    # ``os.fdopen`` raises, the first pipe is not closed by the ``finally`` below.
+    read_pipe = os.fdopen(args.read_fd, "rb")
+    write_pipe = os.fdopen(args.write_fd, "wb")
+    try:
+        # Ensures the subprocess exits if the parent crashes:
+        _async_compile_initializer(args.parent)
+        TuningProcess.process_main(read_pipe, write_pipe)
+    except Exception:
+        # Top-level boundary of the subprocess: record the traceback and fall
+        # through to cleanup.
+        log.exception("Uncaught exception in autotune subprocess")
+    finally:
+        read_pipe.close()
+        write_pipe.close()
+if __name__ == "__main__":
+    main()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__init__.py ADDED Viewed

	@@ -0,0 +1,447 @@

+# mypy: allow-untyped-defs
+from __future__ import annotations
+import io
+import logging
+import os
+from typing import Any, IO, Literal, Optional, TYPE_CHECKING, Union
+import torch.fx
+from .standalone_compile import CompiledArtifact  # noqa: TC001
+if TYPE_CHECKING:
+    from torch._inductor.utils import InputType
+    from torch.export import ExportedProgram
+    from torch.export.pt2_archive._package import AOTICompiledModel
+    from torch.export.pt2_archive._package_weights import Weights
+    from torch.types import FileLike
+__all__ = [
+    "compile",
+    "list_mode_options",
+    "list_options",
+    "cudagraph_mark_step_begin",
+    "standalone_compile",
+]
+log = logging.getLogger(__name__)
+def compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: list[InputType],
+    options: Optional[dict[str, Any]] = None,
+):
+    """
+    Compile a given FX graph with TorchInductor.  This allows compiling
+    FX graphs captured without using TorchDynamo.
+    Args:
+        gm: The FX graph to compile.
+        example_inputs:  List of tensor inputs.
+        options:  Optional dict of config options.  See `torch._inductor.config`.
+    Returns:
+        Callable with same behavior as gm but faster.
+    """
+    from .compile_fx import compile_fx
+    return compile_fx(gm, example_inputs, config_patches=options)
+def aoti_compile_and_package(
+    exported_program: ExportedProgram,
+    _deprecated_unused_args=None,
+    _deprecated_unused_kwargs=None,
+    *,
+    package_path: Optional[FileLike] = None,
+    inductor_configs: Optional[dict[str, Any]] = None,
+) -> str:
+    """
+    Compiles the exported program with AOTInductor, and packages it into a .pt2
+    artifact specified by the input package_path. To load the package, you can
+    call ``torch._inductor.aoti_load_package(package_path)``.
+    An example usage is as follows:
+    .. code-block:: python
+        ep = torch.export.export(M(), ...)
+        aoti_file = torch._inductor.aoti_compile_and_package(
+            ep, package_path="my_package.pt2"
+        )
+        compiled_model = torch._inductor.aoti_load_package("my_package.pt2")
+    To compile and save multiple models into a single ``.pt2`` artifact, you can do
+    the following:
+    .. code-block:: python
+        ep1 = torch.export.export(M1(), ...)
+        aoti_file1 = torch._inductor.aot_compile(
+            ep1, ..., options={"aot_inductor.package": True}
+        )
+        ep2 = torch.export.export(M2(), ...)
+        aoti_file2 = torch._inductor.aot_compile(
+            ep2, ..., options={"aot_inductor.package": True}
+        )
+        from torch._inductor.package import package_aoti, load_package
+        package_aoti("my_package.pt2", {"model1": aoti_file1, "model2": aoti_file2})
+        compiled_model1 = load_package("my_package.pt2", "model1")
+        compiled_model2 = load_package("my_package.pt2", "model2")
+    Args:
+        exported_program: An exported program created through a call from torch.export
+        package_path: Optional specified path to the generated .pt2 artifact.
+        inductor_configs: Optional dictionary of configs to control inductor.
+    Returns:
+        Path to the generated artifact
+    """
+    from torch.export import ExportedProgram
+    from .debug import aot_inductor_minifier_wrapper
+    if not isinstance(exported_program, ExportedProgram):
+        raise ValueError("Only ExportedProgram is supported")
+    if exported_program.example_inputs is None:
+        raise RuntimeError(
+            "exported_program.example_inputs is required to be set in order "
+            "for AOTInductor compilation."
+        )
+    if _deprecated_unused_args is not None or _deprecated_unused_kwargs is not None:
+        log.warning(
+            "You no longer need to specify args/kwargs to aoti_compile_and_package "
+            "as we can get this information from exported_program.example_inputs."
+        )
+    assert (
+        package_path is None
+        or (
+            isinstance(package_path, (io.IOBase, IO))
+            and package_path.writable()
+            and package_path.seekable()
+        )
+        or (
+            isinstance(package_path, (str, os.PathLike))
+            and os.fspath(package_path).endswith(".pt2")
+        )
+    ), (
+        f"Expect package path to be a file ending in .pt2, is None, or is a buffer. Instead got {package_path}"
+    )
+    inductor_configs = inductor_configs or {}
+    inductor_configs["aot_inductor.package"] = True
+    if inductor_configs.get("aot_inductor.output_path"):
+        raise RuntimeError(
+            "Please pass in a package path to aot_inductor_compile() instead "
+            "of setting the aot_inductor.output_path config."
+        )
+    # a wrapper around aoti_compile_and_package_inner.
+    return aot_inductor_minifier_wrapper(
+        _aoti_compile_and_package_inner,
+        exported_program,
+        # pyrefly: ignore [bad-argument-type]
+        package_path=package_path,
+        inductor_configs=inductor_configs,
+    )
+def _aoti_compile_and_package_inner(
+    gm: torch.nn.Module,
+    # flat_example_inputs: List[Any],
+    args: tuple[Any, ...],
+    kwargs: Optional[dict[str, Any]] = None,
+    *,
+    load_and_run: bool = False,
+    check_accuracy: Optional[str] = None,
+    package_path: Optional[Union[str, io.BytesIO]] = None,
+    inductor_configs: Optional[dict[str, Any]] = None,
+):
+    """
+    See docstring for aoti_compile_and_package.
+    Compiles ``gm`` with ``aot_compile``, packages the resulting files with
+    ``package_aoti`` and returns the package path.
+    If `load_and_run` is True, this function will load the compiled model and run it.
+    This is for the minifier to check the correctness of the compiled model.
+    If `check_accuracy` is set, this function will check the accuracy of the compiled
+    model against gm. kwargs must be None (or empty) if check_accuracy is set.
+    "strict_accuracy" means "we will minify any time we see anything that
+     diverges", whereas "accuracy" is more conservative, and will only minify if there
+     is a meaningful fp64 divergence
+    Raises:
+        AccuracyError: if ``check_accuracy`` is set and the compiled model diverges
+            from ``gm``.
+    """
+    if check_accuracy:
+        assert kwargs is None or len(kwargs) == 0, (
+            "when checking for accuracy, the inputs must have been flattened and kwargs is None"
+        )
+    from .package import package_aoti
+    assert isinstance(gm, torch.fx.GraphModule)
+    kwargs = kwargs or {}
+    aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)
+    assert isinstance(aoti_files, list)
+    if package_path is None:
+        # No destination given: derive it from the first generated ".so" file, or
+        # from the first ".cpp" file when there is no ".so".
+        path = [
+            os.path.splitext(file)[0]
+            for file in aoti_files
+            if isinstance(file, str) and os.path.splitext(file)[1] == ".so"
+        ]
+        if len(path) == 0:
+            path = [
+                os.path.splitext(file)[0]
+                for file in aoti_files
+                if isinstance(file, str) and os.path.splitext(file)[1] == ".cpp"
+            ]
+        # NOTE(review): if neither a ".so" nor a ".cpp" file was produced, ``path`` is
+        # empty here and this line raises IndexError.
+        package_path = path[0] + ".pt2"
+    res = package_aoti(package_path, aoti_files)
+    assert res == package_path
+    if load_and_run or check_accuracy:
+        compiled_model = aoti_load_package(package_path)
+        if check_accuracy:
+            from torch._dynamo.debug_utils import AccuracyError, same_two_models
+            # This might look inverted but it's not.  strict_accuracy means "we will
+            # minify any time we see anything that diverges", whereas accuracy is more
+            # conservative, and will only minify if there is a meaningful fp64
+            # divergence
+            not_strict_accuracy = check_accuracy == "accuracy"
+            if not same_two_models(
+                gm,
+                compiled_model,  # type: ignore[arg-type]
+                args,
+                only_fwd=True,
+                require_fp64=not_strict_accuracy,
+                ignore_non_fp=not_strict_accuracy,
+            ):
+                raise AccuracyError("Bad accuracy detected")
+        else:
+            # load_and_run without an accuracy check: just execute the model once.
+            compiled_model(*args, **kwargs)
+    return package_path
+def aoti_load_package(
+    path: FileLike, run_single_threaded: bool = False, device_index: int = -1
+) -> AOTICompiledModel:
+    """
+    Loads the model from the PT2 package.
+    If multiple models were packaged into the PT2, this will load the default
+    model. To load a specific model, you can directly call the load API
+    .. code-block:: python
+        from torch._inductor.package import load_package
+        compiled_model1 = load_package("my_package.pt2", "model1")
+        compiled_model2 = load_package("my_package.pt2", "model2")
+    Args:
+        path: Path to the .pt2 package
+        run_single_threaded (bool): Whether the model should be run without
+            thread synchronization logic. This is useful to avoid conflicts with
+            CUDAGraphs.
+        device_index (int): The index of the device to which the PT2 package is
+            to be loaded. By default, `device_index=-1` is used, which corresponds
+            to the device `cuda` when using CUDA. Passing `device_index=1` would
+            load the package to `cuda:1`, for example.
+    """
+    from torch._inductor.package import load_package
+    return load_package(
+        path, run_single_threaded=run_single_threaded, device_index=device_index
+    )
+def aot_compile(
+    gm: torch.fx.GraphModule,
+    args: tuple[Any, ...],
+    kwargs: Optional[dict[str, Any]] = None,
+    *,
+    options: Optional[dict[str, Any]] = None,
+) -> Union[str, list[Union[str, Weights]], torch.fx.GraphModule]:
+    """
+    Ahead-of-time compile a given FX graph with TorchInductor into a shared library.
+    Args:
+        gm: The FX graph to compile.
+        args:  Example arguments
+        kwargs: Example keyword arguments
+        options:  Optional dict of config options.  See `torch._inductor.config`.
+    Returns:
+        Path to the generated shared library, or a list of files generated by
+        AOTI if aot_inductor.package=True.
+        TODO: make it return a list by default
+    """
+    from .compile_fx import _aoti_flatten_inputs, compile_fx_aot
+    if hasattr(gm, "_guards_fn"):
+        # Do not compile the guards function, since it may contain checks
+        # that are not currently supported by AOTI. In particular, non-Tensor
+        # arguments are converted to None and will fail specialization checks.
+        node = next(iter(gm.graph.find_nodes(op="call_module", target="_guards_fn")))
+        gm.graph.erase_node(node)
+        delattr(gm, "_guards_fn")
+        gm.recompile()
+    flat_example_inputs, options = _aoti_flatten_inputs(
+        gm, args, kwargs, options=options
+    )
+    from torch._export.utils import _compiling_state_context
+    with _compiling_state_context():
+        return compile_fx_aot(
+            gm,
+            flat_example_inputs,  # type: ignore[arg-type]
+            config_patches=options,
+        )
+# Option overrides that make up the "lite" compile mode; `list_mode_options`
+# exposes this dict under its "lite" key.
+lite_mode_options: dict[str, bool] = {
+    # Fallback by default unless users explicitly annotated with
+    # regional inductor compile.
+    "fallback_by_default": True,
+    "selective_decompose": True,
+    # Disable reorder optimizations
+    "reorder_for_peak_memory": False,
+    "reorder_for_compute_comm_overlap": False,
+    "triton.reorder_for_reducing_graph_partitions": False,
+    # Disable pre-, joint-, post-grad passes
+    "use_pre_grad_passes": False,
+    "use_joint_graph_passes": False,
+    "use_post_grad_passes": False,
+    # Disable dead code elimination (dce) and buffer reuse
+    "use_dce": False,
+    "allow_buffer_reuse": False,
+}
+def list_mode_options(
+    mode: Optional[str] = None, dynamic: Optional[bool] = None
+) -> dict[str, Any]:
+    r"""Returns a dictionary describing the optimizations that each of the available
+    modes passed to `torch.compile()` performs.
+    Args:
+        mode (str, optional): The mode to return the optimizations for.
+        If None, returns optimizations for all modes
+        dynamic (bool, optional): Whether dynamic shape is enabled.
+    Example::
+        >>> torch._inductor.list_mode_options()
+    """
+    mode_options: dict[str, dict[str, bool]] = {
+        "default": {},
+        # lite backend for opt-in optimizations
+        "lite": lite_mode_options,
+        # enable cudagraphs
+        "reduce-overhead": {
+            "triton.cudagraphs": True,
+        },
+        # enable max-autotune
+        "max-autotune-no-cudagraphs": {
+            "max_autotune": True,
+            "coordinate_descent_tuning": True,
+        },
+        # enable max-autotune
+        # enable cudagraphs
+        "max-autotune": {
+            "max_autotune": True,
+            "triton.cudagraphs": True,
+            "coordinate_descent_tuning": True,
+        },
+    }
+    try:
+        return mode_options[mode] if mode else mode_options
+    except KeyError as e:
+        raise RuntimeError(
+            f"Unrecognized mode={mode}, should be one of: {', '.join(mode_options.keys())}"
+        ) from e
+def list_options() -> list[str]:
+    r"""Returns a dictionary describing the optimizations and debug configurations
+    that are available to `torch.compile()`.
+    The options are documented in `torch._inductor.config`.
+    Example::
+        >>> torch._inductor.list_options()
+    """
+    from torch._inductor import config
+    current_config: dict[str, Any] = config.get_config_copy()
+    return list(current_config.keys())
+def cudagraph_mark_step_begin():
+    "Indicates that a new iteration of inference or training is about to begin."
+    from .cudagraph_trees import mark_step_begin
+    mark_step_begin()
+def standalone_compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: list[InputType],
+    *,
+    dynamic_shapes: Literal[
+        "from_example_inputs", "from_tracing_context", "from_graph"
+    ] = "from_graph",
+    options: Optional[dict[str, Any]] = None,
+    aot: bool = False,  # AOT mode, which uses BundledAOTAutogradCache
+) -> CompiledArtifact:
+    """
+    Precompilation API for inductor.
+    .. code-block:: python
+        compiled_artifact = torch._inductor.standalone_compile(gm, args)
+        compiled_artifact.save(path=path, format="binary")
+        # Later on a new process
+        loaded = torch._inductor.CompiledArtifact.load(path=path, format="binary")
+        compiled_out = loaded(*args)
+    Args:
+        gm: Graph Module
+        example_inputs: Inputs for the graph module
+        dynamic_shapes: If "from_graph" (default), we will use the dynamic
+            shapes in the passed-in graph module.
+            If "from_tracing_context", we use the dynamic shape info in the
+            ambient tracing context.
+            If "from_example_inputs", we will specialize the graph on the
+            example_inputs.
+        options: Inductor compilation options
+    Returns:
+        CompiledArtifact that can be saved to disk or invoked directly.
+    """
+    from .standalone_compile import standalone_compile
+    options = options if options else {}
+    return standalone_compile(
+        gm, example_inputs, dynamic_shapes=dynamic_shapes, options=options, aot=aot
+    )