BryanW commited on
Commit
175af23
·
verified ·
1 Parent(s): a1d17d3

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_VF.cpython-312.pyc +0 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__config__.cpython-312.pyc +0 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__future__.cpython-312.pyc +0 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_appdirs.cpython-312.pyc +0 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_classes.cpython-312.pyc +0 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_compile.cpython-312.pyc +0 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_custom_ops.cpython-312.pyc +0 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_environment.cpython-312.pyc +0 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_guards.cpython-312.pyc +0 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_jit_internal.cpython-312.pyc +0 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_linalg_utils.cpython-312.pyc +0 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lobpcg.cpython-312.pyc +0 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lowrank.cpython-312.pyc +0 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_namedtensor_internals.cpython-312.pyc +0 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_ops.cpython-312.pyc +0 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_python_dispatcher.cpython-312.pyc +0 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_size_docs.cpython-312.pyc +0 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_sources.cpython-312.pyc +0 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_storage_docs.cpython-312.pyc +0 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_streambase.cpython-312.pyc +0 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor.cpython-312.pyc +0 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor_str.cpython-312.pyc +0 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_thread_safe_fork.cpython-312.pyc +0 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils.cpython-312.pyc +0 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils_internal.cpython-312.pyc +0 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_vmap_internals.cpython-312.pyc +0 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_weights_only_unpickler.cpython-312.pyc +0 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/functional.cpython-312.pyc +0 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/hub.cpython-312.pyc +0 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/library.cpython-312.pyc +0 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/quasirandom.cpython-312.pyc +0 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/random.cpython-312.pyc +0 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/return_types.cpython-312.pyc +0 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/serialization.cpython-312.pyc +0 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/storage.cpython-312.pyc +0 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/torch_version.cpython-312.pyc +0 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/types.cpython-312.pyc +0 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/version.cpython-312.pyc +0 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_awaits/__init__.py +53 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/__init__.py +549 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions.py +0 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_jvp.py +336 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_rng.py +266 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/__init__.py +0 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/python.py +192 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/config.py +45 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/error.py +56 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/verifier.py +531 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__autotune_main__.py +33 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__init__.py +447 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_VF.cpython-312.pyc ADDED
Binary file (1.56 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__config__.cpython-312.pyc ADDED
Binary file (1.08 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/__future__.cpython-312.pyc ADDED
Binary file (3.58 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_appdirs.cpython-312.pyc ADDED
Binary file (29.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_classes.cpython-312.pyc ADDED
Binary file (3.34 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_compile.cpython-312.pyc ADDED
Binary file (2.54 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_custom_ops.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_environment.cpython-312.pyc ADDED
Binary file (344 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_guards.cpython-312.pyc ADDED
Binary file (57 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_jit_internal.cpython-312.pyc ADDED
Binary file (52.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_linalg_utils.cpython-312.pyc ADDED
Binary file (6.66 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lobpcg.cpython-312.pyc ADDED
Binary file (49.5 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_lowrank.cpython-312.pyc ADDED
Binary file (12.8 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_namedtensor_internals.cpython-312.pyc ADDED
Binary file (6.34 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_ops.cpython-312.pyc ADDED
Binary file (63.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_python_dispatcher.cpython-312.pyc ADDED
Binary file (5.34 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_size_docs.cpython-312.pyc ADDED
Binary file (1.35 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_sources.cpython-312.pyc ADDED
Binary file (5.69 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_storage_docs.cpython-312.pyc ADDED
Binary file (1.75 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_streambase.cpython-312.pyc ADDED
Binary file (968 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor.cpython-312.pyc ADDED
Binary file (78.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_tensor_str.cpython-312.pyc ADDED
Binary file (32.2 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_thread_safe_fork.cpython-312.pyc ADDED
Binary file (220 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils.cpython-312.pyc ADDED
Binary file (44.5 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_utils_internal.cpython-312.pyc ADDED
Binary file (14.3 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_vmap_internals.cpython-312.pyc ADDED
Binary file (10.2 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/_weights_only_unpickler.cpython-312.pyc ADDED
Binary file (26 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/functional.cpython-312.pyc ADDED
Binary file (85.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/hub.cpython-312.pyc ADDED
Binary file (37.9 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/library.cpython-312.pyc ADDED
Binary file (75 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/quasirandom.cpython-312.pyc ADDED
Binary file (10.3 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/random.cpython-312.pyc ADDED
Binary file (8.97 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/return_types.cpython-312.pyc ADDED
Binary file (2.04 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/serialization.cpython-312.pyc ADDED
Binary file (87.8 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/storage.cpython-312.pyc ADDED
Binary file (73.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/torch_version.cpython-312.pyc ADDED
Binary file (3.27 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/types.cpython-312.pyc ADDED
Binary file (4.21 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/__pycache__/version.cpython-312.pyc ADDED
Binary file (636 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_awaits/__init__.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Generic, TypeVar
4
+
5
+ import torch
6
+
7
+ __all__ = ['Await']
8
+
9
+ W = TypeVar("W")
10
+
11
+ class _PyAwaitMeta(type(torch._C._Await), type(Generic)): # type: ignore[misc, no-redef]
12
+ pass
13
+
14
+ class _Await(torch._C._Await, Generic[W], metaclass=_PyAwaitMeta):
15
+ r"""
16
+ Wrapper around a ``torch._C._Await`` which encapsulates delayed execution
17
+ of a callable. All manipulations happen with functions ``torch.jit._awaitable``,
18
+ ``torch.jit._awaitable_wait``, ``torch.jit._awaitable_nowait``.
19
+
20
+ Torch scriptable manipulations:
21
+ ``torch.jit._awaitable(func, *args)``
22
+ Creates ``Await[W]`` object, where W is return type of func.
23
+
24
+ Returns:
25
+ ``torch.jit._awaitable_wait(Await[W])``
26
+ Returns the result of the function, specified at ``_awaitable``, with specified arguments.
27
+
28
+ Returns:
29
+ The result of type ``W`` of the function call. The result is owned by ``Await[W]``
30
+ and returned on all following ``_awaitable_wait`` calls.
31
+
32
+
33
+ ``torch.jit._awaitable_nowait(W)``
34
+ Returns:
35
+ Trivial ``Await[W]`` with specified result.
36
+
37
+
38
+ Only in eager mode:
39
+ ``fn() -> Callable[Tuple[Any], W]``
40
+ Returns:
41
+ Specified at ``_awaitable`` python function ``func``.
42
+
43
+ ``args() -> Tuple[Any]``
44
+ Returns:
45
+ Specified at ``_awaitable`` python args.
46
+
47
+ ``is_nowait() -> _bool``
48
+ Returns:
49
+ ``True`` if this object was created via ``_awaitable_nowait`` call (trivial `Await[W]`).
50
+
51
+ In eager mode ``Await[W]`` can be used as ``W`` i.e. attributes of W can be called on ``Await[W]``,
52
+ ``_awaitable_wait()`` call will be transparently added.
53
+ """
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/__init__.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import inspect
3
+ from collections import defaultdict
4
+ from collections.abc import Callable, Sequence
5
+ from functools import lru_cache, partial, wraps
6
+ from itertools import chain
7
+ from typing import Optional, TYPE_CHECKING, TypeVar, Union
8
+ from typing_extensions import ParamSpec
9
+
10
+
11
+ if TYPE_CHECKING:
12
+ from torch.export.decomp_utils import CustomDecompTable
13
+
14
+ import torch
15
+ import torch.library
16
+ from torch._ops import HigherOrderOperator, OperatorBase, OpOverload, OpOverloadPacket
17
+ from torch._prims_common import CustomOutParamAnnotation
18
+ from torch._subclasses.functional_tensor import FunctionalTensor
19
+ from torch.utils import _pytree as pytree
20
+
21
+
22
+ __all__ = [
23
+ "decomposition_table",
24
+ "pre_autograd_decomposition_table",
25
+ "meta_table",
26
+ "register_decomposition",
27
+ "get_decompositions",
28
+ "core_aten_decompositions",
29
+ "_should_decompose_because_unsafe_op",
30
+ ]
31
+
32
+ _T = TypeVar("_T")
33
+ _P = ParamSpec("_P")
34
+
35
+ # TODO: relax key type here; torch registrations should be possible too; but
36
+ # right now this type is accurate
37
+ global_decomposition_table: dict[str, dict[torch._ops.OperatorBase, Callable]] = (
38
+ defaultdict(dict)
39
+ )
40
+
41
+ decomposition_table = global_decomposition_table["post_autograd"]
42
+ pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"]
43
+ meta_table = global_decomposition_table["meta"]
44
+
45
+
46
+ def _should_decompose_because_unsafe_op(op: torch._ops.OperatorBase) -> bool:
47
+ """
48
+ Returns True if the op must always decompose in the export/compile tracing system
49
+
50
+ In export, we always decompose certain CIA ops that are tagged with
51
+ maybe_aliasing_or_mutating because we statically need to know if the op is
52
+ mutating or not. But these CIA ops could behave differently at runtime.
53
+
54
+ native_batch_norm is a prim op which has a wrong schema and it needs to be replaced
55
+ with correct schema. But until then, we will force decompose it via this tag.
56
+ """
57
+ if not isinstance(op, torch._ops.OpOverload):
58
+ return False
59
+ if torch.Tag.maybe_aliasing_or_mutating in op.tags:
60
+ return True
61
+ return op is torch.ops.aten.native_batch_norm.default
62
+
63
+
64
+ def _add_op_to_registry(registry, op, fn):
65
+ """
66
+ This is an internal API for adding an op to the decomposition table.
67
+
68
+ If op is OpOverload, it will be added to the registry directly.
69
+ If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry.
70
+ """
71
+ overloads: list[Union[torch._ops.OperatorBase]] = []
72
+ if isinstance(op, HigherOrderOperator):
73
+ # There's no concept of overloads for HigherOrderOperator
74
+ registry[op] = fn
75
+ return
76
+ elif isinstance(op, OpOverload):
77
+ overloads.append(op)
78
+ else:
79
+ assert isinstance(op, OpOverloadPacket)
80
+ for ol in op.overloads():
81
+ overloads.append(getattr(op, ol))
82
+
83
+ for op_overload in overloads:
84
+ if op_overload in registry:
85
+ raise RuntimeError(f"duplicate registrations for {op_overload}")
86
+ # TorchScript dumps a bunch of extra nonsense overloads
87
+ # which don't have corresponding dispatcher entries, we need
88
+ # to filter those out, e.g. aten.add.float_int
89
+ if torch._C._dispatch_has_kernel(op_overload.name()):
90
+ registry[op_overload] = fn
91
+
92
+
93
+ def _convert_out_params(f):
94
+ out_annotation = f.__annotations__.get("out")
95
+
96
+ # If there are no out params, do not wrap the function.
97
+ if not out_annotation:
98
+ return f
99
+
100
+ # Hack to detect when out is a Tuple. There seems to be no pretty way of doing this
101
+ if getattr(out_annotation, "__origin__", None) is tuple:
102
+ sig = inspect.signature(f)
103
+ out_names = sig.return_annotation._fields
104
+ # If out is a tuple, we need to register a function that unpacks all the out
105
+ # elements as this is what native_functions.yaml expects
106
+
107
+ @wraps(f)
108
+ def _fn(*args, **kwargs):
109
+ out_kwargs = tuple(kwargs.pop(o, None) for o in out_names)
110
+ # Either all of the out kwargs are set or none of them
111
+ is_none = out_kwargs[0] is None
112
+ assert all((o is None) == is_none for o in out_kwargs)
113
+ return f(*args, **kwargs, out=None if is_none else out_kwargs)
114
+
115
+ out_params = [
116
+ inspect.Parameter(
117
+ o,
118
+ kind=inspect.Parameter.KEYWORD_ONLY,
119
+ default=None,
120
+ annotation=t,
121
+ )
122
+ for o, t in zip(out_names, out_annotation.__args__)
123
+ ]
124
+ # Drop the out parameter and concatenate the new kwargs in the signature
125
+ params = chain((v for k, v in sig.parameters.items() if k != "out"), out_params)
126
+ _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined]
127
+ parameters=params, # type: ignore[arg-type]
128
+ return_annotation=sig.return_annotation,
129
+ )
130
+ # Drop the out parameter and concatenate the new kwargs in the annotations
131
+ _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
132
+ for o in out_params:
133
+ _fn.__annotations__[o.name] = o.annotation
134
+
135
+ # Propagate that this function is wrapped by `out_wrapper`
136
+ _fn._torch_decompositions_out_wrapper = f._torch_decompositions_out_wrapper # type: ignore[attr-defined]
137
+
138
+ return _fn
139
+
140
+ # Alternatively, there may be a single tensor out parameter with a name
141
+ # other than "out". This will need special treatment and is indicated by an
142
+ # annotation, which we will remove here so it is not exposed after wrapping.
143
+ custom_out_param_name = f.__annotations__.pop(CustomOutParamAnnotation, None)
144
+ if custom_out_param_name:
145
+
146
+ @wraps(f)
147
+ def _fn(*args, **kwargs):
148
+ out_kwarg = kwargs.pop(custom_out_param_name, None)
149
+ return f(*args, **kwargs, out=out_kwarg)
150
+
151
+ out_param = inspect.Parameter(
152
+ custom_out_param_name,
153
+ kind=inspect.Parameter.KEYWORD_ONLY,
154
+ default=None,
155
+ annotation=out_annotation,
156
+ )
157
+
158
+ # Drop the out parameter and concatenate the new kwarg in the signature
159
+ sig = inspect.signature(f)
160
+ params = chain(
161
+ (v for k, v in sig.parameters.items() if k != "out"), (out_param,)
162
+ )
163
+ _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined]
164
+ parameters=params, # type: ignore[arg-type]
165
+ return_annotation=sig.return_annotation,
166
+ )
167
+
168
+ # Drop the out parameter and concatenate the new kwargs in the annotations
169
+ _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
170
+ _fn.__annotations__[out_param.name] = out_param.annotation
171
+
172
+ return _fn
173
+
174
+ return f
175
+
176
+
177
+ def register_decomposition(
178
+ aten_op, registry=None, *, type="post_autograd", unsafe=False
179
+ ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
180
+ """
181
+ A decorator to register a function as a decomposition to the Python
182
+ decomposition table. Use it like this::
183
+
184
+ @register_decomposition(torch.ops.aten.clamp_min)
185
+ def clamp_min(self, min):
186
+ return torch.clamp(self, min=min)
187
+
188
+ If you are writing a new decomposition, consider contributing it
189
+ directly to PyTorch in torch._decomp.decompositions.
190
+
191
+ This API is experimental; we are almost certainly going to extend
192
+ the API when we make decompositions eligible for use in transforms (e.g.,
193
+ autograd) and not just backend tracing, where we then need to know if a
194
+ decomposition can be used to simulate a transform.
195
+
196
+ By default, we also will register it to the Meta key of dispatcher,
197
+ and replace the c++ Meta implementation if there is already one.
198
+
199
+ unsafe kwarg is for reuse of this function for registering non-function
200
+ things
201
+ """
202
+
203
+ assert type in {"post_autograd", "pre_autograd", "meta"}
204
+
205
+ def decomposition_decorator(fn: Callable[_P, _T]) -> Callable[_P, _T]:
206
+ orig_fn = fn
207
+ if not unsafe:
208
+ fn = _convert_out_params(fn)
209
+
210
+ nonlocal registry
211
+ if registry is None:
212
+ registry = global_decomposition_table[type]
213
+
214
+ def register(op):
215
+ _add_op_to_registry(registry, op, fn)
216
+
217
+ # To handle allowing multiple aten_ops at once
218
+ pytree.tree_map_(register, aten_op)
219
+ return orig_fn
220
+
221
+ return decomposition_decorator
222
+
223
+
224
+ def get_decompositions(
225
+ aten_ops: Sequence[Union[torch._ops.OperatorBase, OpOverloadPacket]],
226
+ type: str = "post_autograd",
227
+ ) -> dict[torch._ops.OperatorBase, Callable]:
228
+ """
229
+ Retrieve a dictionary of decompositions corresponding to the list of
230
+ operator overloads and overload packets passed as input. Overload
231
+ packets will include all decomposed overloads in the packet. If there is
232
+ no decomposition for a requested operator, it is silently ignored.
233
+
234
+ This API is experimental; we are almost certainly going to give an alternate,
235
+ more recommended formulation, where a user provides the set of operators
236
+ they know how to implement, and we provide decompositions for everything
237
+ not in this set.
238
+ """
239
+ assert type in {"post_autograd", "pre_autograd", "meta"}
240
+
241
+ registry = global_decomposition_table[type]
242
+ packets_to_overloads = defaultdict(list)
243
+
244
+ for opo in registry:
245
+ if isinstance(opo, (OpOverload, OpOverloadPacket)):
246
+ packets_to_overloads[opo.overloadpacket].append(opo)
247
+ decompositions: dict[torch._ops.OperatorBase, Callable] = {}
248
+ for op in aten_ops:
249
+ if isinstance(op, OpOverloadPacket) and op in packets_to_overloads:
250
+ for op_overload in packets_to_overloads[op]:
251
+ decompositions[op_overload] = registry[op_overload]
252
+ elif isinstance(op, (torch._ops.OperatorBase)) and op in registry:
253
+ decompositions[op] = registry[op]
254
+ return decompositions
255
+
256
+
257
+ def remove_decompositions(
258
+ decompositions: dict[torch._ops.OperatorBase, Callable],
259
+ aten_ops: Sequence[Union[OpOverload, OpOverloadPacket]],
260
+ ) -> None:
261
+ """
262
+ Given a dictionary of decompositions obtained from get_decompositions(), removes
263
+ operators associated with a list of operator overloads and overload packets passed
264
+ as input. If the decomposition dictionary does not contain a decomposition that is
265
+ specified to be removed, it is silently ignored.
266
+ """
267
+ for op in aten_ops:
268
+ if isinstance(op, OpOverloadPacket):
269
+ for overload_name in op.overloads():
270
+ opo = getattr(op, overload_name)
271
+ decompositions.pop(opo, None)
272
+ elif isinstance(op, OpOverload):
273
+ decompositions.pop(op, None)
274
+
275
+
276
+ # populate the table
277
+ import torch._decomp.decompositions
278
+ import torch._refs
279
+
280
+
281
+ def core_aten_decompositions() -> "CustomDecompTable":
282
+ from torch.export.exported_program import default_decompositions
283
+
284
+ return default_decompositions()
285
+
286
+
287
+ # See NOTE [Core ATen Ops]
288
+ #
289
+ # list was copied from torch/_inductor/decomposition.py
290
+ # excluding decompositions that result in prim ops
291
+ # Resulting opset of decomposition is core aten ops
292
+ def _core_aten_decompositions_post_autograd() -> dict[
293
+ torch._ops.OperatorBase, Callable
294
+ ]:
295
+ aten = torch.ops.aten
296
+ return get_decompositions(
297
+ [
298
+ aten.addcdiv,
299
+ aten.addcdiv_,
300
+ aten.addcmul,
301
+ aten.addcmul_,
302
+ aten.addr,
303
+ aten.affine_grid_generator,
304
+ aten.alias_copy,
305
+ aten.all,
306
+ aten.aminmax,
307
+ aten.arange.default,
308
+ aten.arange.start,
309
+ aten.avg_pool2d_backward,
310
+ aten.baddbmm,
311
+ aten.binary_cross_entropy,
312
+ aten.binary_cross_entropy_backward,
313
+ aten.binary_cross_entropy_with_logits,
314
+ aten.block_diag,
315
+ aten.bernoulli.p,
316
+ aten.bernoulli.default,
317
+ aten.celu,
318
+ aten.celu_,
319
+ aten.channel_shuffle,
320
+ aten.clamp_max,
321
+ aten.clamp_min,
322
+ aten.col2im,
323
+ aten.count_nonzero,
324
+ aten.linalg_cross,
325
+ aten.cudnn_batch_norm,
326
+ aten.cudnn_batch_norm_backward,
327
+ aten.miopen_batch_norm_backward,
328
+ aten.deg2rad,
329
+ aten.deg2rad_,
330
+ aten.detach,
331
+ aten.diag_embed,
332
+ aten.diagonal_backward,
333
+ aten.diagonal_copy,
334
+ aten.dot,
335
+ aten.vdot,
336
+ aten.elu_,
337
+ aten.elu_backward,
338
+ aten._embedding_bag,
339
+ aten.embedding_dense_backward,
340
+ aten.empty_like,
341
+ aten._euclidean_dist.default,
342
+ aten.expand_as,
343
+ aten.expand_copy,
344
+ aten.eye,
345
+ aten.fill,
346
+ aten.fill_,
347
+ aten.floor_divide,
348
+ aten.frac,
349
+ aten.frac_,
350
+ aten._fused_moving_avg_obs_fq_helper,
351
+ aten.gelu_,
352
+ aten.gelu_backward,
353
+ aten.glu,
354
+ aten.glu_backward,
355
+ aten.hardshrink,
356
+ aten.hardsigmoid,
357
+ aten.hardsigmoid_,
358
+ aten.hardsigmoid_backward,
359
+ aten.hardswish,
360
+ aten.hardswish_,
361
+ aten.hardswish_backward,
362
+ aten.hardtanh_,
363
+ aten.hardtanh_backward,
364
+ aten.heaviside,
365
+ aten.heaviside_,
366
+ aten.huber_loss,
367
+ aten.huber_loss_backward,
368
+ aten.im2col,
369
+ aten.index_add.out,
370
+ aten.index_add.default,
371
+ aten.index_add_,
372
+ aten.index_copy.out,
373
+ aten.index_copy.default,
374
+ aten.index_copy_,
375
+ aten.index_fill.int_Scalar,
376
+ aten.index_fill.int_Tensor,
377
+ aten.index_fill.int_Scalar_out,
378
+ aten.index_fill.int_Tensor_out,
379
+ aten.index_fill_,
380
+ aten.isin,
381
+ aten.isneginf,
382
+ aten.isposinf,
383
+ aten.l1_loss,
384
+ aten._lazy_clone,
385
+ aten._test_parallel_materialize,
386
+ aten.leaky_relu_,
387
+ aten.leaky_relu_backward,
388
+ aten.lerp,
389
+ aten.lerp_,
390
+ aten.linspace,
391
+ aten.logaddexp,
392
+ aten.logaddexp2,
393
+ aten.logit,
394
+ aten.logit_,
395
+ aten.logit_backward,
396
+ aten.log_sigmoid_backward,
397
+ aten.log_sigmoid_forward,
398
+ aten._log_softmax_backward_data,
399
+ aten.logspace,
400
+ aten.logsumexp.default,
401
+ aten.masked_fill,
402
+ aten.masked_fill_,
403
+ aten.max_unpool2d,
404
+ aten.max_unpool3d,
405
+ aten.mish,
406
+ aten.mish_,
407
+ aten.mish_backward,
408
+ aten.mse_loss,
409
+ aten.mse_loss_backward,
410
+ aten.multi_margin_loss,
411
+ aten.multilabel_margin_loss_forward,
412
+ aten.mv,
413
+ aten.mvlgamma,
414
+ aten.mvlgamma_,
415
+ aten.nansum,
416
+ aten.nan_to_num,
417
+ aten.nan_to_num_,
418
+ aten.narrow,
419
+ aten.native_batch_norm_backward,
420
+ aten.native_dropout_backward,
421
+ aten.native_group_norm_backward,
422
+ aten.native_layer_norm_backward,
423
+ aten._fused_rms_norm,
424
+ aten._fused_rms_norm_backward,
425
+ aten.new_empty,
426
+ aten.new_full,
427
+ aten.new_ones,
428
+ aten.new_zeros,
429
+ aten.nll_loss2d_forward,
430
+ aten.nll_loss2d_backward,
431
+ aten.nll_loss_backward,
432
+ aten.nll_loss_forward,
433
+ aten.norm.ScalarOpt_dtype,
434
+ aten.norm.Scalar,
435
+ aten.norm.ScalarOpt_dim_dtype,
436
+ aten.norm.ScalarOpt_dim,
437
+ aten.norm.dtype_out,
438
+ aten.norm.out,
439
+ aten.norm.names_dtype_out,
440
+ aten.norm.names_out,
441
+ aten.norm.ScalarOpt_dtype_out,
442
+ aten.norm.Scalar_out,
443
+ aten.ones,
444
+ aten.ones_like,
445
+ aten.pixel_shuffle,
446
+ aten.pixel_unshuffle,
447
+ aten._prelu_kernel,
448
+ aten._prelu_kernel_backward,
449
+ aten._reshape_alias,
450
+ aten.rad2deg,
451
+ aten.rad2deg_,
452
+ aten.reflection_pad1d,
453
+ aten.reflection_pad1d_backward,
454
+ aten.reflection_pad2d,
455
+ aten.reflection_pad2d_backward,
456
+ aten.reflection_pad3d,
457
+ aten.reflection_pad3d_backward,
458
+ aten.replication_pad1d,
459
+ aten.replication_pad2d,
460
+ aten.replication_pad3d,
461
+ aten.renorm,
462
+ aten.renorm_,
463
+ aten.replication_pad2d,
464
+ aten.resize_as,
465
+ aten.roll,
466
+ aten.rot90,
467
+ aten.rrelu_with_noise,
468
+ aten.rrelu_with_noise_,
469
+ aten.rsub,
470
+ aten._safe_softmax,
471
+ aten._scaled_dot_product_flash_attention_for_cpu.default,
472
+ aten.select_backward,
473
+ aten.select_scatter,
474
+ aten.sgn,
475
+ aten.sgn_,
476
+ aten.sigmoid_backward,
477
+ aten.silu,
478
+ aten.silu_,
479
+ aten.silu_backward.grad_input,
480
+ aten.silu_backward,
481
+ aten.sinc,
482
+ aten.sinc_,
483
+ aten.slice_backward,
484
+ aten.smooth_l1_loss,
485
+ aten.smooth_l1_loss_backward,
486
+ aten.soft_margin_loss,
487
+ aten.soft_margin_loss_backward,
488
+ aten._softmax_backward_data,
489
+ aten.softplus,
490
+ aten.softplus_backward,
491
+ aten.softshrink,
492
+ aten.special_entr,
493
+ aten.special_log_ndtr,
494
+ aten.special_xlog1py,
495
+ aten.split.Tensor,
496
+ aten.split_with_sizes_copy,
497
+ aten.squeeze_copy,
498
+ aten.squeeze.default,
499
+ aten.squeeze.dim,
500
+ aten.std.correction,
501
+ aten.std.out,
502
+ aten.std.correction_out,
503
+ aten.std.names_out,
504
+ aten.std.correction_names_out,
505
+ aten.std_mean.correction,
506
+ aten.std_mean.correction_out,
507
+ aten.stack,
508
+ aten.sum.default,
509
+ aten.sum.out,
510
+ aten.t,
511
+ aten.t_copy,
512
+ aten.take,
513
+ aten.tanh_backward,
514
+ aten.threshold,
515
+ aten.threshold_,
516
+ aten.threshold_backward,
517
+ aten.trace,
518
+ aten.transpose.int,
519
+ aten.transpose_copy,
520
+ aten.tril,
521
+ aten.tril_,
522
+ aten.triu,
523
+ aten.triu_,
524
+ aten.unbind,
525
+ aten.unfold_backward,
526
+ aten.unfold_copy,
527
+ aten._unsafe_index,
528
+ aten._unsafe_index_put,
529
+ aten._unsafe_masked_index,
530
+ aten._unsafe_masked_index_put_accumulate,
531
+ aten.unsafe_split.Tensor,
532
+ aten.unsafe_split_with_sizes,
533
+ aten.unsqueeze_copy,
534
+ aten._unsafe_view,
535
+ aten.upsample_linear1d,
536
+ aten.upsample_bilinear2d.out,
537
+ aten.upsample_trilinear3d.out,
538
+ aten.upsample_nearest2d_backward,
539
+ aten.view_as_complex,
540
+ aten.xlogy,
541
+ aten.xlogy_,
542
+ aten.zero,
543
+ aten.zero_,
544
+ aten.zeros,
545
+ aten.zeros_like,
546
+ aten._chunk_cat,
547
+ aten._weight_norm_interface,
548
+ ]
549
+ )
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions.py ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_jvp.py ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-decorators
2
+ # mypy: allow-untyped-defs
3
+ import inspect
4
+ from collections.abc import Callable
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch._decomp
9
+ from torch import Tensor
10
+ from torch._prims_common.wrappers import _maybe_remove_out_wrapper
11
+
12
+
13
+ decomposition_table = torch._decomp.decomposition_table
14
+ decomposition_table_for_jvp: dict[torch._ops.OperatorBase, Callable] = {}
15
+ register_decomposition = torch._decomp.register_decomposition
16
+ aten = torch.ops.aten
17
+
18
+ # NOTE: [forward-mode AD decompositions mechanism]
19
+ #
20
+ # The mechanism is in VariableType,
21
+ # IF any inputs have forward grad
22
+ # AND there is no forward AD formula implemented
23
+ # AND the functions are actually differentiable
24
+ # run the decomposition
25
+ # See run_jit_decomposition_with_args_for_jvp
26
+ # We currently use python decompositions that we torchscript.
27
+ #
28
+ # Note that we would be building the backward graph at the decomposed level
29
+ # too, but that is OK, because we would've errored out otherwise anyway.
30
+ #
31
+ # TODO: The mechanism we are using to register decompositions doesn't
32
+ # seem to be exclusively used for jvp. So open question here is whether
33
+ # torch/csrc/jit/runtime/decomposition_registry.cpp is being used for other things.
34
+ # If that is the case, we may go down the decomposition path unexpectedly
35
+ # (and possibly produce an unintelligible error) vs erroring out earlier and
36
+ # printing that the forward AD formula is not implemented.
37
+ #
38
+ # The solution to this may be to have an explicit allowlist that controls when
39
+ # to enable the decomposition.
40
+
41
+
42
def maybe_register_decomposition(op):
    """Build a decorator that registers ``f`` as the decomposition for ``op``.

    Registration is best-effort: if ``register_decomposition`` raises for any
    reason, the undecorated function is handed back unchanged.
    """

    def decorator(f):
        try:
            registered = register_decomposition(op)(f)
        except Exception:
            # Deliberate fallback: leave ``f`` usable even when it cannot be registered.
            return f
        return registered

    return decorator
50
+
51
+
52
+ # Functions where we need a special decomposition for jvp but there's another version that
53
+ # should be used more generally (ex. for jvp we need to recompute the mean and variance for
54
+ # the backwards of a normalization function. Without jvp, it should use the saved value)
55
+ decomposition_table_for_jvp = {}
56
+
57
+
58
def register_decomposition_for_jvp(fn):
    """Return a registration decorator that targets the jvp-only decomposition table."""
    jvp_registry = decomposition_table_for_jvp
    return register_decomposition(fn, registry=jvp_registry)
60
+
61
+
62
def _register_jit_decomposition_for_jvp(decomp, use_python=False):
    """Compile the decomposition for ``decomp`` to a graph and register it with torch.jit.

    The jvp-specific table is consulted first, then the general decomposition
    table. With ``use_python=True`` the decomposition is marked with
    ``torch.jit.ignore`` and only a thin generated wrapper is compiled;
    otherwise the decomposition itself is scripted.

    Raises:
        RuntimeError: if neither table has an entry for ``decomp``.
    """
    for table in (decomposition_table_for_jvp, decomposition_table):
        if decomp in table:
            decomp_fn = table[decomp]
            break
    else:
        raise RuntimeError(f"could not find decomposition for {decomp}")

    # `out_wrapper` extends a decomposition's signature with an `out` parameter,
    # but jit uses the unwrapped function's signature, so unwrap to avoid a mismatch.
    decomp_fn = _maybe_remove_out_wrapper(decomp_fn)

    if not use_python:
        graph = torch.jit.script(decomp_fn).graph
    else:
        # NOTE: the generated source below refers to the local name `decomp_fn`,
        # so that name (and compiling from this frame) must be kept.
        decomp_fn = torch.jit.ignore(decomp_fn)
        sig = inspect.signature(decomp_fn)

        # Generated wrapper looks like:
        #   def wrapped_decomp(x: torch.Tensor, y: int, z: int):
        #    return decomp_fn(x, y, z)
        param_def = [str(param) for param in sig.parameters.values()]
        param_use = list(sig.parameters)
        f_str = f"def wrapped_decomp({', '.join(param_def)}):\n return decomp_fn({', '.join(param_use)})\n"
        graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph

    torch.jit._register_decomposition(decomp, graph)
96
+
97
+
98
+ # The only decompositions here are temporary or hacks for the purposes of jvp
99
+
100
+
101
+ # TODO: do these also belong here?
102
@maybe_register_decomposition(aten.trace.default)
def trace(self: Tensor) -> Tensor:
    # Extract the diagonal via torch.diag, then reduce it with a sum.
    diagonal = torch.diag(self)
    return torch.sum(diagonal)
105
+
106
+
107
@maybe_register_decomposition(aten.log_sigmoid_forward.default)
def log_sigmoid_forward(self: Tensor) -> tuple[Tensor, Tensor]:
    # Returns (min(0, x) - log1p(exp(-|x|)), buffer).
    clamped = torch.minimum(self.new_zeros(()), self)
    z = torch.exp(-torch.abs(self))
    # On CUDA/XPU the second output is an empty tensor; otherwise it is z itself.
    if not (self.is_cuda or self.is_xpu):
        buffer = z
    else:
        buffer = self.new_zeros((0,))
    return clamped - torch.log1p(z), buffer
116
+
117
+
118
def recompute_mean_var(
    input: Tensor, rstd: Tensor, inner_dim_indices: list[int], keepdim: bool
):
    # Most norm decompositions match the core version except for this step:
    # mean and variance are recomputed from `input` so that they track
    # gradients through it, instead of reusing saved statistics.
    var = torch.var(input, dim=inner_dim_indices, unbiased=False, keepdim=keepdim)
    mean = torch.mean(input, dim=inner_dim_indices, keepdim=keepdim)

    # Back out the epsilon baked into the incoming rstd (rstd = 1/sqrt(var + eps))
    # and detach it so it is treated as a constant.
    recovered_eps = (torch.pow(1 / rstd, 2) - var).detach()
    fresh_rstd = 1 / torch.sqrt(var + recovered_eps)
    return mean, fresh_rstd
130
+
131
+
132
# jvp-specific backward for layer norm: identical in shape to a regular
# layer-norm backward, except that mean/rstd are recomputed from `input`
# (see recompute_mean_var) rather than taken from the saved `mean`/`rstd`.
# Returns (d_input, d_weight, d_bias); entries that are masked off or whose
# parameter is absent are zero tensors rather than None.
@register_decomposition_for_jvp(aten.native_layer_norm_backward)
def native_layer_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    normalized_shape: list[int],
    mean: Tensor,
    rstd: Tensor,
    weight: Optional[Tensor],
    bias: Optional[Tensor],
    output_mask: list[bool],
) -> tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_ndim = input.dim()

    # The trailing len(normalized_shape) dims are normalized ("inner");
    # everything before `axis` is an "outer" dim.
    axis = input_ndim - len(normalized_shape)
    inner_dims = input_shape[axis:]
    outer_dims = input_shape[:axis]
    inner_dim_indices = list(range(axis, input_ndim))
    outer_dim_indices = list(range(axis))

    # N = element count of the inner dims, M = element count of the outer dims.
    N = 1
    for i in inner_dims:
        N *= i
    M = 1
    for i in outer_dims:
        M *= i
    # Degenerate (empty) input: return zeros shaped like the input and the inner dims.
    if M <= 0 or N <= 0:
        return (
            input.new_zeros(input_shape),
            input.new_zeros(input_shape[axis:]),
            input.new_zeros(input_shape[axis:]),
        )

    # Recomputed statistics; the saved `mean` argument is not used below.
    mean_, rstd_ = recompute_mean_var(input, rstd, inner_dim_indices, keepdim=True)

    x_hat = (input - mean_) * rstd_
    if weight is not None:
        grad_x_hat = grad_out * weight
    else:
        grad_x_hat = grad_out
    # inner = N * g - sum(g) - x_hat * sum(g * x_hat), sums taken over the inner dims.
    a = grad_x_hat * N
    b = torch.sum(grad_x_hat, inner_dim_indices, True)
    c1 = torch.mul(grad_x_hat, x_hat)
    c2 = torch.sum(c1, inner_dim_indices, True)
    c3 = torch.mul(x_hat, c2)
    inner = a - b - c3

    if output_mask[0]:
        d_input: Optional[Tensor] = (rstd_ / N) * inner
    else:
        d_input = torch.zeros_like(input)  # should be None but doesn't work with vjp

    # Weight gradient: reduce grad_out * x_hat over the outer dims (if any).
    if output_mask[1] and weight is not None:
        if len(outer_dim_indices) > 0:
            d_weight: Optional[Tensor] = torch.sum(
                grad_out * x_hat, outer_dim_indices, False
            )
        else:
            d_weight = grad_out * x_hat
    elif weight is not None:
        d_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
    else:
        d_weight = torch.zeros(())  # should be None but doesn't work with vjp

    # Bias gradient: reduce grad_out over the outer dims (if any).
    if output_mask[2] and bias is not None:
        if len(outer_dim_indices) > 0:
            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False)
        else:
            d_bias = grad_out.clone()
    elif bias is not None:
        d_bias = torch.zeros_like(bias)  # should be None but doesn't work with vjp
    else:
        d_bias = torch.zeros(())  # should be None but doesn't work with vjp

    return (d_input, d_weight, d_bias)
207
+
208
+
209
def prod(x: list[int]):
    # Product of all entries of x; an empty list yields 1.
    result = 1
    for factor in x:
        result = result * factor
    return result
214
+
215
+
216
# jvp-specific backward for batch norm. In training mode the mean and inverse
# std are recomputed from `input` (see recompute_mean_var) instead of using the
# saved values; in eval mode the running statistics are used.
# Returns (grad_input, grad_weight, grad_bias); masked-off entries are zero
# tensors rather than None.
@register_decomposition_for_jvp(aten.native_batch_norm_backward)
def native_batch_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    weight: Optional[Tensor],
    running_mean: Optional[Tensor],
    running_var: Optional[Tensor],
    save_mean: Optional[Tensor],
    save_invstd: Optional[Tensor],
    train: bool,
    eps: float,
    output_mask: list[bool],
) -> tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
    input_shape = input.shape
    input_rank = input.dim()
    assert input_rank >= 2, "rank of the input must be at least 2"

    # Dim 1 is the feature/channel axis; num_features is the number of elements
    # reduced per channel (total elements divided by the channel count).
    axis = 1
    num_features = prod(input_shape) / input_shape[axis]  # type: ignore[arg-type]
    mean = save_mean
    invstd = save_invstd
    if train:
        assert save_mean is not None and save_invstd is not None, (
            "when train=True, save_mean and save_invstd are required"
        )

        # Reduce over every dim except the channel axis.
        reduciton_dims = [0] + list(range(2, input.dim()))
        assert invstd is not None  # for typing
        mean, invstd = recompute_mean_var(input, invstd, reduciton_dims, keepdim=False)
    else:
        assert running_mean is not None and running_var is not None
        mean = running_mean
        invstd = torch.rsqrt(running_var + eps)

    assert invstd is not None and mean is not None

    # Shape [1, C, 1, ...] used to broadcast per-channel values against the input.
    broadcast_mask = [1] * input_rank
    broadcast_mask[axis] = input_shape[axis]

    # All axes except the channel axis.
    reduction_axes: list[int] = []
    for i in range(input_rank):
        if i != axis:
            reduction_axes.append(i)

    mean = torch.reshape(mean, broadcast_mask)
    norm = 1.0 / num_features
    grad_output_sum = torch.sum(grad_out, reduction_axes)
    dot_p = torch.sum(grad_out * (input - mean), reduction_axes)

    grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask)
    proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd), broadcast_mask)

    if weight is None:
        grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0
    else:
        grad_scale = torch.reshape(invstd * weight, broadcast_mask)

    # Training subtracts the projection and mean terms; eval only rescales grad_out.
    if train:
        proj = (input - mean) * proj_scale
        grad_input = ((grad_out - proj) - grad_mean) * grad_scale
    else:
        grad_input = grad_out * grad_scale

    if output_mask[1]:
        grad_weight = dot_p * invstd
    elif weight is not None:
        grad_weight = torch.zeros_like(
            weight
        )  # should be None but doesn't work with vjp
    else:
        grad_weight = torch.zeros(())  # should be None but doesn't work with vjp

    if output_mask[2]:
        grad_bias = grad_output_sum
    else:
        grad_bias = torch.zeros_like(
            grad_output_sum
        )  # should be None but doesn't work with vjp

    return (grad_input, grad_weight, grad_bias)
296
+
297
+
298
@register_decomposition_for_jvp(aten.batch_norm_backward)
def batch_norm_backward(
    grad_out: Tensor,
    input: Tensor,
    weight: Tensor,
    running_mean: Optional[Tensor],
    running_var: Optional[Tensor],
    save_mean: Optional[Tensor],
    save_var: Optional[Tensor],
    update: bool,
    eps: float,
    output_mask: list[bool],
    reserve: Tensor,
) -> tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
    # Thin adapter over native_batch_norm_backward: `save_var` is forwarded in
    # the save_invstd slot, `update` in the train slot, and `reserve` is unused.
    grads = native_batch_norm_backward(
        grad_out,
        input,
        weight,
        running_mean,
        running_var,
        save_mean,
        save_var,
        update,
        eps,
        output_mask,
    )
    return grads
324
+
325
+
326
# trace is the only op registered through the Python (use_python=True) path.
_register_jit_decomposition_for_jvp(torch.ops.aten.trace.default, use_python=True)

# The remaining ops are registered with the default (scripted) path, in this order.
for _jvp_op in (
    torch.ops.aten.nll_loss_backward.default,
    torch.ops.aten.nll_loss2d_backward.default,
    torch.ops.aten._log_softmax_backward_data.default,
    torch.ops.aten._softmax_backward_data.default,
    torch.ops.aten.log_sigmoid_forward.default,
    torch.ops.aten.native_layer_norm_backward.default,
    torch.ops.aten.native_batch_norm_backward.default,
    torch.ops.aten.cudnn_batch_norm_backward.default,
    torch.ops.aten.batch_norm_backward.default,
    torch.ops.aten.miopen_batch_norm_backward.default,
):
    _register_jit_decomposition_for_jvp(_jvp_op)
del _jvp_op  # keep the module namespace unchanged
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_decomp/decompositions_for_rng.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-decorators
2
+ # mypy: allow-untyped-defs
3
+ import functools
4
+ from collections import defaultdict
5
+ from collections.abc import Callable
6
+
7
+ import torch
8
+ import torch._decomp as decomp
9
+ from torch._decomp import get_decompositions
10
+ from torch._ops import OpOverload
11
+
12
+
13
+ aten = torch.ops.aten
14
+
15
+ rng_decompositions: dict[str, dict[OpOverload, Callable]] = defaultdict(dict)
16
+
17
+
18
def register_rng_decomposition(aten_op):
    """Return a decorator that registers a decomposition for ``aten_op`` in ``rng_decompositions``."""
    return decomp.register_decomposition(aten_op, registry=rng_decompositions)
20
+
21
+
22
def throw_on_non_cuda(device):
    """Always raise ``RuntimeError`` explaining that RNG functionalization is unsupported on ``device``.

    Args:
        device: object exposing a ``type`` attribute (used only to build the message).
    """
    message = (
        f"You are trying to functionalize a {device.type} RNG operator but {device.type} does not "
        f"use Philox/counter-based RNG. Therefore, functionalizing a {device.type} RNG operator is "
        "not supported. We are discussing the possibility of a Philox-based RNG implementation for CPU."
    )
    raise RuntimeError(message)
28
+
29
+
30
+ # TODO - We have to register many more distributions here, and also higher level
31
+ # ops like dropout which have fused implementation and can hide the rand inside.
32
@register_rng_decomposition(aten.rand)
def rand(shape, dtype=None, layout=torch.strided, device=None, pin_memory=False):
    """Functional ``rand``: draw via ``rngprims.philox_rand`` using the tracked Philox state.

    Raises (through ``throw_on_non_cuda``) when an explicit non-CUDA device is
    given. ``layout`` and ``pin_memory`` are accepted but not used.
    """
    if device and device.type != "cuda":
        throw_on_non_cuda(device)
    if not dtype:
        dtype = torch.float32

    seed, offset = PhiloxStateTracker.get_state_as_tuple()
    out, offset_jump = torch.ops.rngprims.philox_rand(
        shape, seed, offset, None, device, dtype
    )
    # Record how much of the offset this call consumed.
    PhiloxStateTracker.advance_offset(offset_jump)
    return out
43
+
44
+
45
@register_rng_decomposition(aten.rand_like)
def rand_like(
    x: torch.Tensor,
    dtype=None,
    layout=None,
    device=None,
    pin_memory=False,
    memory_format=torch.preserve_format,
):
    """Functional ``rand_like``: like ``rand`` but shape/device/dtype default to those of ``x``.

    Raises (through ``throw_on_non_cuda``) when the resolved device is not CUDA.
    ``layout``, ``pin_memory`` and ``memory_format`` are accepted but not used.
    """
    if not device:
        device = x.device
    if device.type != "cuda":
        throw_on_non_cuda(device)
    if not dtype:
        dtype = x.dtype

    seed, offset = PhiloxStateTracker.get_state_as_tuple()
    out, offset_jump = torch.ops.rngprims.philox_rand(
        x.shape, seed, offset, None, device, dtype
    )
    # Record how much of the offset this call consumed.
    PhiloxStateTracker.advance_offset(offset_jump)
    return out
64
+
65
+
66
class PhiloxState:
    """
    A Philox RNG state, i.e. the pair (seed, offset) with
    offset = base_offset + relative_offset. seed and base_offset describe the
    rng state just before tracing starts; relative_offset accumulates the total
    offset consumed at trace time.
    """

    def __init__(self) -> None:
        self.reset()

    def reset(self):
        # Empty tensors stand for "no state recorded"; validate_state rejects them.
        self.offset_advanced_alteast_once = False
        self.relative_offset = 0
        self.seed = torch.tensor(())
        self.base_offset = torch.tensor(())

    def validate_state(self):
        assert self.seed.numel() != 0 and self.base_offset.numel() != 0

    def _absolute_offset(self):
        # base + relative, built out-of-place so neither operand is mutated.
        return self.base_offset + self.relative_offset

    def advance_offset(self, consumed_offset):
        # Out-of-place add on purpose: never mutate a previously stored offset object.
        self.relative_offset = self.relative_offset + consumed_offset
        self.offset_advanced_alteast_once = True

    def set_state(self, seed, base_offset, relative_offset=0):
        self.seed, self.base_offset = seed, base_offset
        self.relative_offset = relative_offset

    def get_state_as_tuple(self):
        self.validate_state()
        return (self.seed, self._absolute_offset())

    def get_state_as_tensor(self):
        # Only needed because we override get_rng_state.
        self.validate_state()
        return torch.stack([self.seed, self._absolute_offset()])

    def set_state_from_tensor(self, state):
        # Only needed because we override set_rng_state.
        seed, base_offset = torch.unbind(state)
        self.seed = seed
        self.base_offset = base_offset
        self.relative_offset = 0
108
+
109
+
110
class PhiloxStateTracker:
    """
    Singleton-style tracker for the philox rng state during AOT Autograd
    tracing. AOT Autograd resets it for every tracing instance and uses it to
    follow both the forward and the backward offsets. At runtime only the total
    consumed forward/backward offsets matter. With dynamic shapes those offsets
    depend on the input shapes, so the AOT generated graphs carry extra outputs
    computing the total consumed forward and backward offsets.
    """

    running_state: PhiloxState
    fwd_state: PhiloxState
    bwd_state: PhiloxState

    def __enter__(self):
        PhiloxStateTracker.reset()
        return self

    def __exit__(self, exc_type, exc_cal, exc_tb):
        PhiloxStateTracker.reset()

    @classmethod
    def reset(cls):
        # Three independent, empty states.
        cls.running_state, cls.fwd_state, cls.bwd_state = (
            PhiloxState(),
            PhiloxState(),
            PhiloxState(),
        )

    @classmethod
    def mark_beginning_of_forward(cls):
        # From now on the running state aliases fwd_state.
        cls.running_state = cls.fwd_state

    @classmethod
    def mark_beginning_of_backward(cls):
        # From now on the running state aliases bwd_state.
        cls.running_state = cls.bwd_state

    @classmethod
    def record_state(cls, seed, offset, mode):
        # Stores the seed and offset tensors that are later fed to the
        # philox_rand functional primitives. Recording the forward state also
        # switches the running state to it; recording backward does not.
        if mode == "forward":
            cls.fwd_state.set_state(seed, offset)
            cls.mark_beginning_of_forward()
            return
        assert mode == "backward"
        cls.bwd_state.set_state(seed, offset)

    @classmethod
    def get_state_as_tensor(cls):
        # Exists only because get_rng_state and set_rng_state are overridden
        # during tracing. get_rng_state must produce a tensor; returning a
        # (seed, offset) tuple would upset other parts of the program such as
        # ctx.saved_tensors.
        #
        # Downside: when the user saves and restores the rng state, the
        # generated code is slightly ugly - (seed, offset) is first concatenated
        # into a tensor for get_rng_state and then split back into a tuple in
        # set_rng_state.
        #
        # TODO: Investigate whether the tuple could instead be wrapped in a
        # false Tensor object and desugared later on.
        return cls.running_state.get_state_as_tensor()

    @classmethod
    def get_state_as_tuple(cls):
        return cls.running_state.get_state_as_tuple()

    @classmethod
    def set_state_from_tensor(cls, x):
        # Needed only because set_rng_state is overridden; see the comment in
        # get_state_as_tensor.
        cls.running_state.set_state_from_tensor(x)

    @classmethod
    def advance_offset(cls, consumed_offset):
        cls.running_state.advance_offset(consumed_offset)

    @classmethod
    def get_current_relative_offset(cls):
        return cls.running_state.relative_offset

    @staticmethod
    def multiple_of_4(offset):
        # torch cuda rng state offset must be a multiple of 4. For inductor,
        # summing up all the numel may give a value that is not a multiple of 4,
        # so round up to the next one.
        return (offset + 3) // 4 * 4

    @classmethod
    def _updated_offset(cls, state):
        # Short circuit if no rand ops were observed for this state.
        if not state.offset_advanced_alteast_once:
            return state.base_offset
        return cls.multiple_of_4(state.base_offset + state.relative_offset)

    @classmethod
    def get_updated_fwd_offset(cls):
        return cls._updated_offset(cls.fwd_state)

    @classmethod
    def get_updated_bwd_offset(cls):
        return cls._updated_offset(cls.bwd_state)
217
+
218
+
219
# Extra decompositions that eventually call rand_like internally. Putting them
# into rng_decompositions ensures that the rand_like ops used inside these
# decomps get functionalized too. The list is copied from the inductor
# codebase, which uses it for a similar purpose.
#
# Caution - These decomps do not have the same accuracy as eager. However, we
# can't just disable them with a config flag like fallback_random, because
# functionalizing rng ops requires decomposing these ops.
extra_random_decomps = get_decompositions(
    [
        aten.cauchy,
        aten.cauchy_,
        aten.exponential,
        aten.exponential_,
        aten.geometric,
        aten.geometric_,
        aten.native_dropout,
        aten.normal,
        aten.normal_,
        aten.normal_functional,
        aten.log_normal,
        aten.log_normal_,
        aten.rrelu_with_noise,
        aten.rrelu_with_noise_,
        aten.uniform_,
    ]
)
# Decorator factory that registers further decompositions into
# extra_random_decomps instead of the global table.
register_extra_random_decomp = functools.partial(
    decomp.register_decomposition, registry=extra_random_decomps
)
249
+
250
+
251
@register_extra_random_decomp([aten.bernoulli_])
def bernoulli_(self, p=0.5):
    """In-place bernoulli: fill ``self`` with ``rand_like(self) < p``.

    Returns ``NotImplemented`` for tensors on the CPU device.
    """
    if self.device == torch.device("cpu"):
        return NotImplemented
    mask = torch.rand_like(self, dtype=torch.float32) < p
    return self.copy_(mask)
256
+
257
+
258
@register_extra_random_decomp([aten.bernoulli.p])
def bernoulli_p(self, p=0.5, *, generator=None):
    """Out-of-place bernoulli: return the boolean mask ``rand_like(self) < p``.

    Returns ``NotImplemented`` for tensors on the CPU device; a non-None
    ``generator`` is rejected with an assertion.
    """
    on_cpu = self.device == torch.device("cpu")
    if on_cpu:
        return NotImplemented
    assert generator is None
    uniform = torch.rand_like(self, dtype=torch.float32)
    return uniform < p
264
+
265
+
266
+ rng_decompositions.update(extra_random_decomps) # type: ignore[arg-type]
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_dispatch/python.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import itertools
3
+ import unittest.mock
4
+ from collections.abc import Callable, Iterator
5
+ from contextlib import contextmanager
6
+ from typing import TypeVar, Union
7
+ from typing_extensions import ParamSpec
8
+
9
+ import torch
10
+ import torch._C
11
+ import torch._ops
12
+ import torch.utils._python_dispatch
13
+ import torch.utils._pytree as pytree
14
+ from torch._C import DispatchKey
15
+
16
+
17
+ __all__ = ["enable_python_dispatcher", "no_python_dispatcher", "enable_pre_dispatch"]
18
+
19
+ no_python_dispatcher = torch._C._DisablePythonDispatcher
20
+ enable_python_dispatcher = torch._C._EnablePythonDispatcher
21
+ enable_pre_dispatch = torch._C._EnablePreDispatch
22
+
23
+ CROSSREF_FUNCTIONALIZE = False
24
+
25
+ _P = ParamSpec("_P")
26
+ _T = TypeVar("_T")
27
+
28
+
29
def all_py_loaded_overloads() -> Iterator[torch._ops.OpOverload]:
    """
    Lazily yield every overload reachable by iterating ``torch.ops``.

    Warning: the set reported here is subtle. It is exactly the set of
    torch.ops functions that have actually been accessed from Python (e.g.
    torch.ops.aten.blah was touched at some point). That is DIFFERENT from the
    set of registered operators, which is generally larger because it includes
    everything registered by C++ static initializers or Python operator
    registration. The listing on torch.ops.aten is lazy and is not eagerly
    populated.

    So this is suitable for walking everything that has an OpOverload object
    allocated in Python. It is used for cache invalidation, but do not rely on
    the list being complete.

    Even reporting all C++ registered overloads would not guarantee
    completeness, since a later lazy library load can trigger additional
    registrations.
    """
    for ns in torch.ops:
        namespace = getattr(torch.ops, ns)
        for op_name in namespace:
            packet = getattr(namespace, op_name)
            yield from (getattr(packet, overload) for overload in packet)
53
+
54
+
55
@contextmanager
def suspend_functionalization():
    """Context manager that turns functionalization off for the body, if it was on.

    When the Functionalize dispatch key is included on entry, functionalization
    is disabled and then re-enabled on exit with the ``reapply_views`` setting
    captured on entry. Otherwise the body simply runs unchanged.
    """
    was_enabled = torch._C._dispatch_tls_is_dispatch_key_included(
        torch._C.DispatchKey.Functionalize
    )
    reapply_views = torch._C._functionalization_reapply_views_tls()

    if not was_enabled:
        # Nothing to suspend and nothing to restore.
        yield
        return

    torch._disable_functionalization()
    try:
        yield
    finally:
        torch._enable_functionalization(reapply_views=reapply_views)
68
+
69
+
70
def check_tensor_metadata_matches(nv, rv, desc):
    """Assert that tensors ``nv`` and ``rv`` agree on size, dtype and significant strides.

    Args:
        nv, rv: the two tensors to compare.
        desc: zero-argument callable producing the prefix for failure messages;
            it is only invoked when a check fails.

    Raises:
        AssertionError: on the first mismatch found.
    """
    assert callable(desc)

    nv_size, rv_size = nv.size(), rv.size()
    assert nv_size == rv_size, f"{desc()}: sizes {nv_size} != {rv_size}"

    nv_dtype, rv_dtype = nv.dtype, rv.dtype
    assert nv_dtype == rv_dtype, f"{desc()}: dtype {nv_dtype} != {rv_dtype}"

    strides_ok, bad_idx = torch._prims_common.check_significant_strides(
        nv, rv, only_cuda=False
    )
    assert strides_ok, (
        f"{desc()}: strides {nv.stride()} != {rv.stride()} (mismatch at index {bad_idx})"
    )
80
+
81
+
82
def check_metadata_matches(n, r, desc):
    """Compare two pytrees leaf-by-leaf and assert their tensor metadata agrees.

    Both ``n`` and ``r`` are flattened; the leaf counts must match. For every
    position where the leaf of ``r`` is a tensor, the pair is passed to
    ``check_tensor_metadata_matches``. Non-tensor leaves of ``r`` are skipped.

    Args:
        n, r: pytrees to compare.
        desc: zero-argument callable describing the operation for error messages.
    """
    assert callable(desc)
    n_vals, _ = pytree.tree_flatten(n)
    r_vals, _ = pytree.tree_flatten(r)
    # TODO: test the specs match; empirically sometimes we have a tuple
    # on one side and a list on the other
    assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
    for i, (nv, rv) in enumerate(zip(n_vals, r_vals)):
        if isinstance(rv, torch.Tensor):
            check_tensor_metadata_matches(nv, rv, lambda: f"{desc()} output {i}")
93
+
94
+
95
class Lit:
    """Wrapper whose ``repr`` is the wrapped string itself, without quotes."""

    def __init__(self, s):
        # Text returned verbatim by __repr__.
        self.s = s

    def __repr__(self):
        text = self.s
        return text
101
+
102
+
103
+ def _fmt(a: object) -> object:
104
+ if isinstance(a, torch.Tensor):
105
+ return Lit(
106
+ f"torch.empty_strided({tuple(a.size())}, {a.stride()}, dtype={a.dtype})"
107
+ )
108
+ else:
109
+ return a
110
+
111
+
112
# Build a cross-referencing handler for `op`: the handler runs the op once on
# fake tensors and once for real (via `op._op_dk(final_key, ...)`), asserts that
# the output metadata of both runs matches, and returns the real result.
# For aten.lift_fresh.default no handler is built and `final_key` is returned.
def make_crossref_functionalize(
    op: torch._ops.OpOverload[_P, _T], final_key: DispatchKey
) -> Union[Callable[_P, _T], DispatchKey]:
    from torch._subclasses.fake_tensor import FakeTensorMode

    # This case is pretty weird, suppress it for now
    if op is torch.ops.aten.lift_fresh.default:
        return final_key

    def handler(*args: _P.args, **kwargs: _P.kwargs) -> _T:
        # A fresh fake mode per call.
        fake_mode = FakeTensorMode()

        # Map a (possibly functional) tensor to a fake tensor; non-tensors pass through.
        def fakeify_defun(t):
            if isinstance(t, torch.Tensor):
                if torch._is_functional_tensor(t):
                    r = torch._from_functional_tensor(t)
                    # NB: This assumes that the inner tensor sizes/strides match
                    # the outer tensor sizes/strides.  This doesn't necessarily have to
                    # be the case, see discussion at
                    # https://github.com/pytorch/pytorch/pull/87610/files/401ddeda1d769bedc88a12de332c7357b60e51a4#r1007264456
                    assert t.size() == r.size()
                    assert t.stride() == r.stride()
                else:
                    r = t
                # TODO: suppress guards
                return fake_mode.from_tensor(r)
            return t

        # Detach tensors; non-tensors pass through.
        def maybe_detach(t):
            if isinstance(t, torch.Tensor):
                return t.detach()
            else:
                return t

        # TODO: This probably does the wrong thing if you're running other
        # substantive modes with the normal op outside here
        with (
            torch.utils._python_dispatch._disable_current_modes(),
            suspend_functionalization(),
        ):
            f_args, f_kwargs = pytree.tree_map(fakeify_defun, (args, kwargs))
            # Detached copies of the fake inputs, used only by desc() below.
            orig_f_args, orig_f_kwargs = pytree.tree_map(
                maybe_detach, (f_args, f_kwargs)
            )
            with fake_mode:
                f_r = op(*f_args, **f_kwargs)  # pyrefly: ignore [invalid-param-spec]
        # The real call, made with the original arguments.
        r = op._op_dk(final_key, *args, **kwargs)

        # Lazily formats the call as "op(args..., k=v...)" for assertion messages.
        def desc():
            fmt_args = ", ".join(
                itertools.chain(
                    (repr(pytree.tree_map(_fmt, a)) for a in orig_f_args),
                    (
                        f"{k}={pytree.tree_map(_fmt, v)}"
                        for k, v in orig_f_kwargs.items()
                    ),
                )
            )
            return f"{op}({fmt_args})"

        check_metadata_matches(f_r, r, desc)
        return r

    return handler
176
+
177
+
178
+ # NB: enabling this is slow, don't do it in a hot loop. This is purely
179
+ # for debugging purposes.
180
@contextmanager
def enable_crossref_functionalize():
    """Context manager enabling the python dispatcher with ``CROSSREF_FUNCTIONALIZE`` patched to True.

    The Functionalize dispatch entry of every Python-loaded overload is
    uncached both before entering and after leaving the context.
    """
    functionalize_key = torch._C.DispatchKey.Functionalize

    def uncache_all():
        for op in all_py_loaded_overloads():
            op._uncache_dispatch(functionalize_key)

    uncache_all()
    try:
        with (
            enable_python_dispatcher(),
            unittest.mock.patch("torch._dispatch.python.CROSSREF_FUNCTIONALIZE", True),
        ):
            yield
    finally:
        uncache_all()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration module for torch.export.export.
3
+
4
+ This module contains various configuration flags and settings that control torch.export's
5
+ behavior, including:
6
+ - Runtime behavior flags
7
+ - Debugging and development options
8
+ """
9
+
10
+ import sys
11
+ from typing import Any, TYPE_CHECKING
12
+
13
+ from torch._environment import is_fbcode
14
+ from torch.utils._config_module import install_config_module
15
+
16
+
17
# Controls whether the new functional tracer is used. It should be True in the
# long term.
use_new_tracer_experimental = True

# Controls whether fake tensor creation is instrumented to track potential
# leaks. Off by default; users can turn it on to debug leaks.
detect_non_strict_fake_tensor_leaks = False

# Error on a potential pre-dispatch/non-strict tracing limitation. This type of
# error usually happens when we encounter an op that we don't know how to
# proxy, resulting in untracked fake tensors.
error_on_lifted_constant_tensors = True

# Enable auto_functionalized_v2 in export. This is turned off in fbcode because
# downstream users there are not ready to handle auto_functionalized_v2.
enable_auto_functionalized_v2_for_export = not is_fbcode()

use_legacy_dynamo_graph_capture = True


# Typing-only declarations; nothing in this branch runs at import time.
if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403

    def _make_closure_patcher(**changes: Any) -> Any: ...


# Hand this module object to install_config_module.
install_config_module(sys.modules[__name__])
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/error.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
class ExportErrorType(Enum):
    """Categories of export failures, identified by a stable integer value."""

    # User providing invalid inputs to either tracer, or other public facing APIs
    INVALID_INPUT_TYPE = 1

    # User returning values from their models that we don't support.
    INVALID_OUTPUT_TYPE = 2

    # Generated IR does not conform to Export IR Specification.
    VIOLATION_OF_SPEC = 3

    # User's code contains types and functionalities we don't support.
    NOT_SUPPORTED = 4

    # User's code didn't provide necessary details for us to successfully trace and export.
    # For example, we use a lot of decorators and ask users to annotate their model.
    MISSING_PROPERTY = 5

    # User is using an API without proper initialization step.
    UNINITIALIZED = 6
23
+
24
+
25
def internal_assert(pred: bool, assert_msg: str) -> None:
    """
    Assert-like helper that raises ``InternalError`` instead of ``AssertionError``.

    The sole purpose is to throw our own error type while keeping a call
    syntax similar to the Python ``assert`` statement.

    Args:
        pred: Condition that is expected to hold.
        assert_msg: Message carried by the raised error when ``pred`` is falsy.

    Raises:
        InternalError: If ``pred`` is falsy.
    """

    if not pred:
        raise InternalError(assert_msg)
34
+
35
+
36
class InternalError(Exception):
    """
    Raised when an internal invariant is violated in the EXIR stack.

    Should hint users to report a bug to the developers and expose the
    original error message.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
45
+
46
+
47
class ExportError(Exception):
    """
    Raised for errors that are directly caused by the user code.

    In general, user errors happen during model authoring, tracing, using our
    public facing APIs, and writing graph passes.

    Args:
        error_code: Category of the failure; it is rendered in square
            brackets as a prefix of the exception message.
        message: Human-readable description appended after the prefix.
    """

    def __init__(self, error_code: ExportErrorType, message: str) -> None:
        prefix = f"[{error_code}]: "
        super().__init__(prefix + message)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_export/verifier.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import inspect
3
+ import math
4
+ import operator
5
+ from collections.abc import Iterable
6
+ from typing import Any, final, TYPE_CHECKING
7
+
8
+ import torch
9
+ from torch._library.opaque_object import is_opaque_type
10
+ from torch._ops import HigherOrderOperator, OpOverload
11
+ from torch._subclasses.fake_tensor import FakeTensor
12
+ from torch.export.graph_signature import (
13
+ CustomObjArgument,
14
+ InputKind,
15
+ SymBoolArgument,
16
+ SymFloatArgument,
17
+ SymIntArgument,
18
+ TensorArgument,
19
+ TokenArgument,
20
+ )
21
+ from torch.fx import GraphModule
22
+
23
+
24
+ if TYPE_CHECKING:
25
+ from torch.export.exported_program import ExportedProgram
26
+
27
+
28
class SpecViolationError(Exception):
    """Raised by the verifier checks in this module when a check fails."""
30
+
31
+
32
def is_functional(op: OpOverload) -> bool:
    """Return True when the schema of ``op`` is not flagged as mutable."""
    return not op._schema.is_mutable
34
+
35
+
36
def _check_has_fake_tensor(node: torch.fx.Node) -> None:
    """Thin alias that delegates to ``_check_val`` for ``node``."""
    # TODO(angelayi): remove this in favor of _check_val
    return _check_val(node)
39
+
40
+
41
+ def _check_val(node: torch.fx.Node) -> None:
42
+ from torch.fx.experimental.symbolic_shapes import SymBool, SymFloat, SymInt
43
+
44
+ def _check_correct_val(val):
45
+ if val is None:
46
+ return True
47
+ elif isinstance(val, (int, bool, str, float)):
48
+ return True
49
+ elif isinstance(
50
+ val, (torch.memory_format, torch.dtype, torch.device, torch.layout)
51
+ ):
52
+ return True
53
+ elif isinstance(
54
+ val, (FakeTensor, torch.Tensor)
55
+ ): # TODO(zhxchen17) Remove Tensor.
56
+ return True
57
+ elif isinstance(val, (SymInt, SymFloat, SymBool)):
58
+ return True
59
+ elif isinstance(val, CustomObjArgument):
60
+ return True
61
+ elif isinstance(val, Iterable):
62
+ return all(_check_correct_val(x) for x in val)
63
+ elif is_opaque_type(type(val)):
64
+ return True
65
+ return False
66
+
67
+ def _no_returns(op):
68
+ if not isinstance(op, OpOverload):
69
+ return False
70
+ return len(op._schema.returns) == 0
71
+
72
+ if "val" not in node.meta:
73
+ if node.op == "call_function" and _no_returns(node.target):
74
+ return
75
+ raise SpecViolationError(f"Node.meta {node.name} is missing val field.")
76
+
77
+ val = node.meta["val"]
78
+ if not _check_correct_val(val):
79
+ raise SpecViolationError(f"Node.meta {node.name} has invalid val field {val}")
80
+
81
+
82
+ def _check_torch_fn(node: torch.fx.Node) -> None:
83
+ torch_fn = node.meta.get("torch_fn")
84
+ if torch_fn is None:
85
+ raise SpecViolationError(
86
+ f"Unable to find torch_fn metadata for node {node.name}"
87
+ )
88
+ if (
89
+ not isinstance(torch_fn, tuple)
90
+ and isinstance(torch_fn[0], str)
91
+ and isinstance(torch_fn[1], str)
92
+ ):
93
+ raise SpecViolationError(
94
+ f"Node.meta {node.name} has invalid torch_fn field {torch_fn}"
95
+ )
96
+
97
+
98
+ class _VerifierMeta(type):
99
+ _registry: dict[str, type["Verifier"]] = {}
100
+
101
+ def __new__(metacls, name, bases, attrs):
102
+ if bases:
103
+ if "check" in attrs or "_check_graph_module" in attrs:
104
+ raise SyntaxError("Overriding method check is not allowed.")
105
+ assert "dialect" in attrs and attrs["dialect"] != "ATEN"
106
+ else:
107
+ assert "check" in attrs
108
+ assert "_check_graph_module" in attrs
109
+ assert attrs["dialect"] == "ATEN"
110
+
111
+ assert isinstance(attrs["dialect"], str)
112
+ ret = type.__new__(metacls, name, bases, attrs)
113
+ metacls._registry[attrs["dialect"]] = ret # type: ignore[assignment]
114
+ return ret
115
+
116
+
117
def getattr_recursive(obj: Any, target: str) -> Any:
    """
    Resolve a dotted attribute path such as ``"a.b.c"`` starting from ``obj``.

    Args:
        obj: Object the lookup starts from.
        target: Dot-separated attribute path.

    Returns:
        The attribute found at the end of the path.

    Raises:
        RuntimeError: If any attribute along the path does not exist. The
            message names the path up to and including the missing attribute.
    """
    target_atoms = target.split(".")
    attr_itr = obj
    for i, atom in enumerate(target_atoms):
        if not hasattr(attr_itr, atom):
            # Include the failing atom itself (``i + 1``), otherwise the
            # message would report only the prefix that resolved fine.
            raise RuntimeError(
                f"Node referenced nonexistent target {'.'.join(target_atoms[: i + 1])}"
            )
        attr_itr = getattr(attr_itr, atom)
    return attr_itr
127
+
128
+
129
class Verifier(metaclass=_VerifierMeta):
    """
    Verifier for the "ATEN" dialect.

    Subclasses customize behavior through the ``allowed_*`` hooks,
    ``check_valid_op`` and ``check_additional``; the ``check`` and
    ``_check_graph_module`` entry points are final.
    """

    dialect = "ATEN"

    def allowed_builtin_ops(self) -> list:
        """Return the builtin callables accepted as ``call_function`` targets."""
        return [
            operator.getitem,
            operator.add,
            operator.mul,
            operator.sub,
            operator.truediv,
            operator.ge,
            operator.le,
            operator.gt,
            operator.lt,
            operator.eq,
            operator.ne,
            operator.floordiv,
            operator.mod,
            operator.and_,
            operator.or_,
            operator.not_,
            operator.pow,
            operator.neg,
            operator.abs,
            operator.lshift,
            operator.rshift,
            math.ceil,
            math.floor,
            math.trunc,
            round,
        ]

    def allowed_op_types(self) -> tuple[type[Any], ...]:
        """Return the operator classes accepted as ``call_function`` targets."""
        return (OpOverload, HigherOrderOperator)

    def allowed_getattr_types(self) -> tuple[type[Any], ...]:
        """Return the types a ``get_attr`` node may resolve to in the top-level module."""
        return (torch.fx.GraphModule, torch.utils._pytree.TreeSpec)

    def allowed_getattr_types_for_subgm(self) -> tuple[type[Any], ...]:
        """Return the types a ``get_attr`` node may resolve to in a nested graph module."""
        # A sub graph module passed as a HOP argument can have getattr(weight)
        # nodes, and is thus stateful.
        return (
            torch.fx.GraphModule,
            torch.nn.parameter.Parameter,
            torch.Tensor,  # for buffer and constant tensor
            torch.utils._pytree.TreeSpec,
        )

    def check_valid_op(self, op):
        """Hook for dialect-specific operator checks; no-op by default."""

    def check_additional(self, gm: GraphModule) -> None:
        """
        Additional checks that are specific to some dialects.
        """

    @final
    def check(self, ep: "ExportedProgram") -> None:
        """Verify the graph module, module call graph and signature of ``ep``."""
        self._check_graph_module(ep.graph_module)
        _verify_exported_program_module_call_graph(ep)
        _verify_exported_program_signature(ep)

    @final
    def _check_graph_module(self, gm: torch.fx.GraphModule) -> None:
        """
        Check every ``GraphModule`` reachable from ``gm`` node by node.

        Raises:
            SpecViolationError: On a ``call_module``/``call_method`` node, a
                disallowed operator, an invalid ``val`` or an invalid
                ``get_attr`` target.
        """

        def _allowed_getattr_types(is_toplevel_gm) -> tuple[type[Any], ...]:
            if is_toplevel_gm:
                ret = self.allowed_getattr_types()
            else:
                ret = self.allowed_getattr_types_for_subgm()
            # ``object`` would make the isinstance check accept anything.
            assert not any(t is object for t in ret)
            return ret

        def _check_valid_op(op) -> None:
            def _allowed_builtin_ops() -> list:
                ret = self.allowed_builtin_ops()
                assert all(inspect.isbuiltin(op) for op in ret)
                return ret

            def _allowed_op_types() -> tuple[type[Any], ...]:
                ret = self.allowed_op_types()
                assert not any(t is object for t in ret)
                return ret

            # TODO Remove this allowlist.
            _allowed_torch_functions = (
                torch.autograd.grad_mode.set_grad_enabled,
                torch.sym_int,
                torch.sym_float,
                torch.sym_ite,
                torch.sym_max,
                torch.sym_min,
                torch.sym_not,
                torch.sym_sqrt,
                torch.sym_sum,
                torch.export.custom_ops._call_custom_autograd_function_in_pre_dispatch,
                # TODO (tmanlaibaatar)
                # Predispatch export is able to contain autograd ops.
                # These will be modeled as HOO later
                torch._C._set_grad_enabled,
                torch.amp.autocast_mode._enter_autocast,
                torch.amp.autocast_mode._exit_autocast,
                torch.fx.experimental.symbolic_shapes.cast_symbool_to_symint_guardless,
                torch._functorch.predispatch._add_batch_dim,
                torch._functorch.predispatch._remove_batch_dim,
                torch._functorch.predispatch._vmap_increment_nesting,
                torch._functorch.predispatch._vmap_decrement_nesting,
                torch._functorch.predispatch.lazy_load_decompositions,
            )

            op_types = _allowed_op_types()
            if not isinstance(op, op_types):
                # Only consult the builtin allowlist for non-operator targets,
                # and evaluate it once for both the check and the message.
                builtin_ops = _allowed_builtin_ops()
                if op not in builtin_ops and op not in _allowed_torch_functions:
                    raise SpecViolationError(
                        f"Operator '{op}' is not an allowed operator type: {op_types}\n"
                        f"Valid builtin ops: {builtin_ops}\n"
                        f"Valid torch functions: {_allowed_torch_functions}"
                    )

            if isinstance(op, OpOverload):
                # All ops functional
                # TODO (tmanlaibaatar) more proper way is needed here
                if self.dialect != "TRAINING" and not is_functional(op):
                    raise SpecViolationError(f"operator '{op}' is not functional")
            self.check_valid_op(op)

        for mod in gm.modules():
            is_toplevel_gm = mod is gm

            if not isinstance(mod, torch.fx.GraphModule):
                continue

            mod.graph.lint()
            for node in mod.graph.nodes:
                # TODO(T140410192): should have fake tensor for all dialects
                if node.op in {"call_module", "call_method"}:
                    # Report the actual node kind rather than always
                    # claiming "call_module".
                    raise SpecViolationError(
                        f"{node.op} is not valid: got a class '{node.target}' ",
                    )

                elif node.op == "call_function":
                    _check_val(node)

                    _check_valid_op(node.target)

                elif node.op == "get_attr":
                    if not isinstance(node.target, str):
                        raise SpecViolationError(
                            f"Expected get_attr target to be string, but got {type(node.target)}"
                        )

                    attr = getattr_recursive(mod, node.target)
                    if isinstance(attr, torch.nn.Module):

                        def _is_type(name, ty):
                            return isinstance(getattr(attr, name, None), ty)

                        # These module kinds are matched by class name only.
                        if type(attr).__name__ == "LoweredBackendModule":
                            if (
                                _is_type("backend_id", str)
                                and hasattr(attr, "original_module")
                                and hasattr(attr, "module_name")
                                and getattr(attr, "backend_id", None) == "aoti"
                            ):
                                continue
                            if (
                                _is_type("backend_id", str)
                                and _is_type("processed_bytes", bytes)
                                and _is_type("compile_specs", list)
                                and hasattr(attr, "original_module")
                            ):
                                continue
                            else:
                                backend_id = getattr(attr, "backend_id", None)
                                processed_bytes = getattr(attr, "processed_bytes", None)
                                compile_specs = getattr(attr, "compile_specs", None)
                                raise SpecViolationError(
                                    f"Invalid get_attr type {type(attr)}. \n"
                                    f"LoweredBackendModule fields: "
                                    f"backend_id(str) : {type(backend_id)}, "
                                    f"processed_bytes(bytes) : {type(processed_bytes)}, "
                                    f"compile_specs(list) : {type(compile_specs)}"
                                )
                        elif type(attr).__name__ == "AOTInductorEPModule":
                            continue

                        elif type(attr).__name__ == "AOTInductorRunnerWrapper":
                            continue

                    if not isinstance(attr, _allowed_getattr_types(is_toplevel_gm)):
                        raise SpecViolationError(
                            f"Invalid get_attr type {type(attr)} on target {node.target}. \n"
                            f"Valid get_attr types: {_allowed_getattr_types(is_toplevel_gm)}"
                        )

                elif node.op == "placeholder":
                    _check_val(node)
                # TODO(zhxchen17)
                # elif node.op == "output":
                #     _check_flattened_outputs()

        self.check_additional(gm)
331
+
332
+
333
class TrainingIRVerifier(Verifier):
    """Verifier registered under the "TRAINING" dialect."""

    dialect = "TRAINING"
335
+
336
+
337
+ def _verify_exported_program_module_call_graph(exported_program) -> None:
338
+ module_call_graph = exported_program.module_call_graph
339
+ nodes = {node.name for node in exported_program.graph.nodes}
340
+ for entry in module_call_graph:
341
+ if entry.signature is not None:
342
+ for arg in entry.signature.inputs:
343
+ if arg.name and arg.name not in nodes:
344
+ raise SpecViolationError(
345
+ f"Input {arg.name} does not exist in the graph."
346
+ )
347
+ for arg in entry.signature.outputs:
348
+ if arg.name and arg.name not in nodes:
349
+ raise SpecViolationError(
350
+ f"Output {arg.name} does not exist in the graph."
351
+ )
352
+
353
+
354
+ def _verify_exported_program_signature(exported_program) -> None:
355
+ # Check ExportedProgram signature matches
356
+ gs = exported_program.graph_signature
357
+
358
+ # Check every node in the signature exists in the graph
359
+ input_node_names = [
360
+ node.name for node in exported_program.graph.nodes if node.op == "placeholder"
361
+ ]
362
+
363
+ if len(input_node_names) != len(gs.input_specs):
364
+ raise SpecViolationError(
365
+ f"Number of graph inputs ({len(input_node_names)}) "
366
+ f"does not match number of inputs in the graph signature ({len(gs.input_specs)})"
367
+ )
368
+
369
+ for input_spec, node in zip(gs.input_specs, input_node_names):
370
+ if isinstance(
371
+ input_spec.arg,
372
+ (TensorArgument, SymIntArgument, SymFloatArgument, SymBoolArgument),
373
+ ):
374
+ if input_spec.arg.name != node:
375
+ raise SpecViolationError(
376
+ f"Input spec name {input_spec.arg.name} does not match node name {node}"
377
+ )
378
+
379
+ if input_spec.kind == InputKind.USER_INPUT:
380
+ continue
381
+
382
+ elif input_spec.kind == InputKind.PARAMETER:
383
+ if not isinstance(input_spec.arg, TensorArgument):
384
+ raise SpecViolationError(
385
+ f"Parameter {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
386
+ )
387
+ if input_spec.target is None:
388
+ raise SpecViolationError(
389
+ f"InputSpec for {input_spec.name} has no target."
390
+ )
391
+
392
+ param = input_spec.target
393
+ if param not in exported_program.state_dict:
394
+ raise SpecViolationError(f"Parameter {param} is not in the state dict.")
395
+
396
+ if not isinstance(exported_program.state_dict[param], torch.nn.Parameter):
397
+ raise SpecViolationError(
398
+ f"State dict entry for parameter {param} is not an instance of torch.nn.Parameter."
399
+ )
400
+
401
+ elif input_spec.kind == InputKind.BUFFER:
402
+ if not isinstance(input_spec.arg, TensorArgument):
403
+ raise SpecViolationError(
404
+ f"Buffer {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
405
+ )
406
+ if input_spec.target is None:
407
+ raise SpecViolationError(
408
+ f"InputSpec for {input_spec.name} has no target."
409
+ )
410
+
411
+ buffer = input_spec.target
412
+ if input_spec.persistent is None:
413
+ raise SpecViolationError(
414
+ f"Buffer {buffer} is missing a persistence flag"
415
+ )
416
+
417
+ if (
418
+ input_spec.persistent is True
419
+ and buffer not in exported_program.state_dict
420
+ ):
421
+ raise SpecViolationError(f"Buffer {buffer} is not in the state dict.")
422
+
423
+ if input_spec.persistent is False and buffer in exported_program.state_dict:
424
+ raise SpecViolationError(
425
+ f"Non-persistent buffer {buffer} is in the state dict, it should not be."
426
+ )
427
+ elif input_spec.kind == InputKind.CONSTANT_TENSOR:
428
+ if not isinstance(input_spec.arg, TensorArgument):
429
+ raise SpecViolationError(
430
+ f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
431
+ )
432
+ if input_spec.target is None:
433
+ raise SpecViolationError(
434
+ f"InputSpec for {input_spec.name} has no target."
435
+ )
436
+
437
+ tensor_const = input_spec.target
438
+ if tensor_const not in exported_program.constants:
439
+ raise SpecViolationError(
440
+ f"Constant tensor {tensor_const} is not in the constants dictionary."
441
+ )
442
+ elif input_spec.kind == InputKind.CUSTOM_OBJ:
443
+ if not isinstance(input_spec.arg, CustomObjArgument):
444
+ raise SpecViolationError(
445
+ f"Custom object {input_spec.name} is not a custom object argument. Found {input_spec.arg} instead."
446
+ )
447
+ if input_spec.target is None:
448
+ raise SpecViolationError(
449
+ f"InputSpec for {input_spec.name} has no target."
450
+ )
451
+
452
+ custom_obj = input_spec.target
453
+ if custom_obj not in exported_program.constants:
454
+ raise SpecViolationError(
455
+ f"Custom object {custom_obj} is not in the constants dictionary."
456
+ )
457
+ elif input_spec.kind == InputKind.TOKEN:
458
+ if not isinstance(input_spec.arg, TokenArgument):
459
+ raise SpecViolationError(
460
+ f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead."
461
+ )
462
+ else:
463
+ raise SpecViolationError(f"Unknown InputKind {input_spec.kind}.")
464
+
465
+ # Check outputs
466
+ output_node = list(exported_program.graph.nodes)[-1]
467
+ assert output_node.op == "output"
468
+ output_nodes = [
469
+ arg.name if isinstance(arg, torch.fx.Node) else arg
470
+ for arg in output_node.args[0]
471
+ ]
472
+
473
+ if len(output_nodes) != len(gs.output_specs):
474
+ raise SpecViolationError(
475
+ f"Number of output nodes {len(output_nodes)} is different "
476
+ "Than the number of outputs specified by the graph signature: \n"
477
+ f"Number of mutated buffers: {len(gs.buffers_to_mutate)}. \n"
478
+ f"Number of user outputs: {len(gs.user_outputs)}. \n"
479
+ )
480
+
481
+ num_tokens = len(gs.output_tokens)
482
+ end = (
483
+ len(gs.buffers_to_mutate)
484
+ + len(gs.parameters_to_mutate)
485
+ + len(gs.user_inputs_to_mutate)
486
+ + num_tokens
487
+ )
488
+ mutate_nodes: list[str] = output_nodes[num_tokens:end]
489
+ user_output_nodes = output_nodes[end : end + len(gs.user_outputs)]
490
+
491
+ for mutation_node in mutate_nodes:
492
+ if mutation_node in gs.buffers_to_mutate:
493
+ if gs.buffers_to_mutate[mutation_node] not in gs.buffers:
494
+ raise SpecViolationError(
495
+ f"Buffer output {mutation_node} does not point to a buffer that exists. \n"
496
+ f"Dict of buffers that are mutated, in order: {gs.buffers_to_mutate} \n"
497
+ f"Buffer nodes available: {gs.buffers} \n"
498
+ )
499
+ elif mutation_node in gs.parameters_to_mutate:
500
+ if gs.parameters_to_mutate[mutation_node] not in gs.parameters:
501
+ raise SpecViolationError(
502
+ f"Parameter output {mutation_node} does not point to a parameter that exists. \n"
503
+ f"Dict of parameters that are mutated, in order: {gs.parameters_to_mutate} \n"
504
+ f"Parameter nodes available: {gs.parameters} \n"
505
+ )
506
+ elif mutation_node in gs.user_inputs_to_mutate:
507
+ if gs.user_inputs_to_mutate[mutation_node] not in gs.user_inputs:
508
+ raise SpecViolationError(
509
+ f"User input output {mutation_node} does not point to a user input that exists. \n"
510
+ f"Dict of user inputs that are mutated, in order: {gs.user_inputs_to_mutate} \n"
511
+ f"User input nodes available: {gs.user_inputs} \n"
512
+ )
513
+ else:
514
+ raise SpecViolationError(
515
+ f"Mutation node {mutation_node} is neither a buffer nor a user input. "
516
+ f"Buffers to mutate: {gs.buffers_to_mutate}, User inputs to mutate: {gs.user_inputs_to_mutate}"
517
+ )
518
+
519
+ for user_output_node, user_output_name in zip(user_output_nodes, gs.user_outputs):
520
+ if user_output_node != user_output_name:
521
+ raise SpecViolationError(
522
+ f"User output {user_output_node} is not in the correct "
523
+ "order or is not found in the "
524
+ f"exported program's user_output list: {gs.user_outputs}. "
525
+ )
526
+
527
+
528
def load_verifier(dialect: str) -> type[Verifier]:
    """
    Return the verifier class registered for ``dialect``.

    "ATEN" and the empty string fall back to ``Verifier`` when nothing is
    registered under that key.

    Raises:
        KeyError: If any other dialect has no registered verifier.
    """
    if dialect in ("ATEN", ""):
        return _VerifierMeta._registry.get(dialect, Verifier)
    return _VerifierMeta._registry[dialect]
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__autotune_main__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import os
4
+
5
+ from torch._inductor.autotune_process import TuningProcess
6
+ from torch._inductor.compile_worker.utils import _async_compile_initializer
7
+
8
+
9
+ log = logging.getLogger(__name__)
10
+
11
+
12
def main() -> None:
    """
    Entry point of the autotune subprocess.

    Parses ``--parent``, ``--read-fd`` and ``--write-fd``, wraps the two
    descriptors as binary pipes and hands them to
    ``TuningProcess.process_main``. Any uncaught exception from that work is
    logged rather than propagated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--parent", type=int)
    parser.add_argument("--read-fd", type=int)
    parser.add_argument("--write-fd", type=int)
    args = parser.parse_args()

    # ``with`` guarantees that a pipe opened first is closed even if opening
    # the second one fails. Listing write_pipe first keeps the close order
    # (read_pipe, then write_pipe).
    with (
        os.fdopen(args.write_fd, "wb") as write_pipe,
        os.fdopen(args.read_fd, "rb") as read_pipe,
    ):
        try:
            # Ensures the subprocess exits if the parent crashes:
            _async_compile_initializer(args.parent)
            TuningProcess.process_main(read_pipe, write_pipe)
        except Exception:
            log.exception("Uncaught exception in autotune subprocess")


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/_inductor/__init__.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import logging
6
+ import os
7
+ from typing import Any, IO, Literal, Optional, TYPE_CHECKING, Union
8
+
9
+ import torch.fx
10
+
11
+ from .standalone_compile import CompiledArtifact # noqa: TC001
12
+
13
+
14
+ if TYPE_CHECKING:
15
+ from torch._inductor.utils import InputType
16
+ from torch.export import ExportedProgram
17
+ from torch.export.pt2_archive._package import AOTICompiledModel
18
+ from torch.export.pt2_archive._package_weights import Weights
19
+ from torch.types import FileLike
20
+
21
+ __all__ = [
22
+ "compile",
23
+ "list_mode_options",
24
+ "list_options",
25
+ "cudagraph_mark_step_begin",
26
+ "standalone_compile",
27
+ ]
28
+
29
+
30
+ log = logging.getLogger(__name__)
31
+
32
+
33
def compile(
    gm: torch.fx.GraphModule,
    example_inputs: list[InputType],
    options: Optional[dict[str, Any]] = None,
):
    """
    Compile a given FX graph with TorchInductor.  This allows compiling
    FX graphs captured without using TorchDynamo.

    Args:
        gm: The FX graph to compile.
        example_inputs: List of tensor inputs.
        options: Optional dict of config options.  See `torch._inductor.config`.

    Returns:
        Callable with same behavior as gm but faster.
    """
    # Imported inside the function rather than at module import time.
    from .compile_fx import compile_fx

    return compile_fx(gm, example_inputs, config_patches=options)
53
+
54
+
55
def aoti_compile_and_package(
    exported_program: ExportedProgram,
    _deprecated_unused_args=None,
    _deprecated_unused_kwargs=None,
    *,
    package_path: Optional[FileLike] = None,
    inductor_configs: Optional[dict[str, Any]] = None,
) -> str:
    """
    Compiles the exported program with AOTInductor, and packages it into a .pt2
    artifact specified by the input package_path. To load the package, you can
    call ``torch._inductor.aoti_load_package(package_path)``.

    An example usage is as follows:

    .. code-block:: python

        ep = torch.export.export(M(), ...)
        aoti_file = torch._inductor.aoti_compile_and_package(
            ep, package_path="my_package.pt2"
        )
        compiled_model = torch._inductor.aoti_load_package("my_package.pt2")

    To compile and save multiple models into a single ``.pt2`` artifact, you can do
    the following:

    .. code-block:: python

        ep1 = torch.export.export(M1(), ...)
        aoti_file1 = torch._inductor.aot_compile(
            ep1, ..., options={"aot_inductor.package": True}
        )
        ep2 = torch.export.export(M2(), ...)
        aoti_file2 = torch._inductor.aot_compile(
            ep2, ..., options={"aot_inductor.package": True}
        )

        from torch._inductor.package import package_aoti, load_package

        package_aoti("my_package.pt2", {"model1": aoti_file1, "model2": aoti_file2})

        compiled_model1 = load_package("my_package.pt2", "model1")
        compiled_model2 = load_package("my_package.pt2", "model2")

    Args:
        exported_program: An exported program created through a call from torch.export
        package_path: Optional specified path to the generated .pt2 artifact.
        inductor_configs: Optional dictionary of configs to control inductor.
            The dictionary passed in is not modified.

    Returns:
        Path to the generated artifact

    Raises:
        ValueError: If ``exported_program`` is not an ``ExportedProgram``.
        RuntimeError: If ``exported_program.example_inputs`` is None, or if
            ``aot_inductor.output_path`` is set in ``inductor_configs``.
    """
    from torch.export import ExportedProgram

    from .debug import aot_inductor_minifier_wrapper

    if not isinstance(exported_program, ExportedProgram):
        raise ValueError("Only ExportedProgram is supported")

    if exported_program.example_inputs is None:
        raise RuntimeError(
            "exported_program.example_inputs is required to be set in order "
            "for AOTInductor compilation."
        )

    if _deprecated_unused_args is not None or _deprecated_unused_kwargs is not None:
        log.warning(
            "You no longer need to specify args/kwargs to aoti_compile_and_package "
            "as we can get this information from exported_program.example_inputs."
        )

    assert (
        package_path is None
        or (
            isinstance(package_path, (io.IOBase, IO))
            and package_path.writable()
            and package_path.seekable()
        )
        or (
            isinstance(package_path, (str, os.PathLike))
            and os.fspath(package_path).endswith(".pt2")
        )
    ), (
        f"Expect package path to be a file ending in .pt2, is None, or is a buffer. Instead got {package_path}"
    )

    # Work on a copy so the caller's dictionary is never mutated.
    inductor_configs = {**(inductor_configs or {}), "aot_inductor.package": True}

    if inductor_configs.get("aot_inductor.output_path"):
        raise RuntimeError(
            "Please pass in a package path to aot_inductor_compile() instead "
            "of setting the aot_inductor.output_path config."
        )

    # a wrapper around aoti_compile_and_package_inner.
    return aot_inductor_minifier_wrapper(
        _aoti_compile_and_package_inner,
        exported_program,
        # pyrefly: ignore [bad-argument-type]
        package_path=package_path,
        inductor_configs=inductor_configs,
    )
158
+
159
+
160
def _aoti_compile_and_package_inner(
    gm: torch.nn.Module,
    # flat_example_inputs: List[Any],
    args: tuple[Any],
    kwargs: Optional[dict[str, Any]] = None,
    *,
    load_and_run: bool = False,
    check_accuracy: Optional[str] = None,
    package_path: Optional[Union[str, io.BytesIO]] = None,
    inductor_configs: Optional[dict[str, Any]] = None,
):
    """
    See docstring for aoti_compile_and_package.

    If `load_and_run` is True, this function will load the compiled model and run it.
    This is for the minifier to check the correctness of the compiled model.

    If `check_accuracy` is set, this function will check the accuracy of the compiled
    model against gm. kwargs must be None if check_accuracy is set.
    "strict_accuracy" means "we will minify any time we see anything that
     diverges", whereas "accuracy" is more conservative, and will only minify if there
     is a meaningful fp64 divergence

    Returns:
        The package path that was written: ``package_path`` when given,
        otherwise a path derived from the first generated ``.so`` (or, if
        there is none, ``.cpp``) file with the extension replaced by ``.pt2``.

    Raises:
        RuntimeError: If ``package_path`` is None and the compilation produced
            neither a ``.so`` nor a ``.cpp`` file to derive a path from.
        AccuracyError: If ``check_accuracy`` is set and the models diverge.
    """

    if check_accuracy:
        assert kwargs is None or len(kwargs) == 0, (
            "when checking for accuracy, the inputs must have been flattened and kwargs is None"
        )

    from .package import package_aoti

    assert isinstance(gm, torch.fx.GraphModule)

    kwargs = kwargs or {}

    aoti_files = aot_compile(gm, args, kwargs, options=inductor_configs)
    assert isinstance(aoti_files, list)

    if package_path is None:

        def _stems_with_suffix(suffix):
            # Paths (minus extension) of generated files with this extension.
            return [
                os.path.splitext(file)[0]
                for file in aoti_files
                if isinstance(file, str) and os.path.splitext(file)[1] == suffix
            ]

        # Prefer the shared library; fall back to the C++ source.
        path = _stems_with_suffix(".so") or _stems_with_suffix(".cpp")
        if not path:
            raise RuntimeError(
                "Cannot derive a package path: AOTInductor produced no .so or .cpp file. "
                "Please pass package_path explicitly."
            )
        package_path = path[0] + ".pt2"

    res = package_aoti(package_path, aoti_files)
    assert res == package_path

    if load_and_run or check_accuracy:
        compiled_model = aoti_load_package(package_path)
        if check_accuracy:
            from torch._dynamo.debug_utils import AccuracyError, same_two_models

            # This might look inverted but it's not.  strict_accuracy means "we will
            # minify any time we see anything that diverges", whereas accuracy is more
            # conservative, and will only minify if there is a meaningful fp64
            # divergence
            not_strict_accuracy = check_accuracy == "accuracy"
            if not same_two_models(
                gm,
                compiled_model,  # type: ignore[arg-type]
                args,
                only_fwd=True,
                require_fp64=not_strict_accuracy,
                ignore_non_fp=not_strict_accuracy,
            ):
                raise AccuracyError("Bad accuracy detected")
        else:
            compiled_model(*args, **kwargs)

    return package_path
238
+
239
+
240
def aoti_load_package(
    path: FileLike, run_single_threaded: bool = False, device_index: int = -1
) -> AOTICompiledModel:
    """
    Loads the model from the PT2 package.

    If multiple models were packaged into the PT2, this will load the default
    model. To load a specific model, you can directly call the load API

    .. code-block:: python

        from torch._inductor.package import load_package

        compiled_model1 = load_package("my_package.pt2", "model1")
        compiled_model2 = load_package("my_package.pt2", "model2")

    Args:
        path: Path to the .pt2 package
        run_single_threaded (bool): Whether the model should be run without
            thread synchronization logic. This is useful to avoid conflicts with
            CUDAGraphs.
        device_index (int): The index of the device to which the PT2 package is
            to be loaded. By default, `device_index=-1` is used, which corresponds
            to the device `cuda` when using CUDA. Passing `device_index=1` would
            load the package to `cuda:1`, for example.

    Returns:
        The object returned by ``torch._inductor.package.load_package``.
    """
    from torch._inductor.package import load_package

    return load_package(
        path, run_single_threaded=run_single_threaded, device_index=device_index
    )
271
+
272
+
273
def aot_compile(
    gm: torch.fx.GraphModule,
    args: tuple[Any, ...],
    kwargs: Optional[dict[str, Any]] = None,
    *,
    options: Optional[dict[str, Any]] = None,
) -> Union[str, list[Union[str, Weights]], torch.fx.GraphModule]:
    """
    Ahead-of-time compile a given FX graph with TorchInductor into a shared library.

    Note that ``gm`` is modified in place when it carries a ``_guards_fn``
    submodule: every call to it is erased from the graph, the attribute is
    deleted, and the module is recompiled before compilation starts.

    Args:
        gm: The FX graph to compile.
        args: Example arguments
        kwargs: Example keyword arguments
        options: Optional dict of config options.  See `torch._inductor.config`.

    Returns:
        Path to the generated shared library, or a list of files generated by
        AOTI if aot_inductor.package=True.
        TODO: make it return a list by default
    """
    from .compile_fx import _aoti_flatten_inputs, compile_fx_aot

    if hasattr(gm, "_guards_fn"):
        # Do not compile the guards function, since it may contain checks
        # that are not currently supported by AOTI. In particular, non-Tensor
        # arguments are converted to None and will fail specialization checks.
        #
        # Snapshot the matches first (the graph is mutated while we walk them)
        # and erase *all* of them: the submodule is deleted right below, so a
        # surviving call would dangle, and a graph with no call at all must
        # not blow up with a bare StopIteration.
        guard_calls = list(
            gm.graph.find_nodes(op="call_module", target="_guards_fn")
        )
        for node in guard_calls:
            gm.graph.erase_node(node)
        delattr(gm, "_guards_fn")
        gm.recompile()

    flat_example_inputs, options = _aoti_flatten_inputs(
        gm, args, kwargs, options=options
    )
    from torch._export.utils import _compiling_state_context

    with _compiling_state_context():
        return compile_fx_aot(
            gm,
            flat_example_inputs,  # type: ignore[arg-type]
            config_patches=options,
        )
316
+
317
+
318
# Inductor config overrides applied by the "lite" compile mode.
lite_mode_options = {
    # Fallback by default unless users explicitly annotated with
    # regional inductor compile.
    "fallback_by_default": True,
    "selective_decompose": True,
    # Disable reorder optimizations
    "reorder_for_peak_memory": False,
    "reorder_for_compute_comm_overlap": False,
    "triton.reorder_for_reducing_graph_partitions": False,
    # Disable pre-, joint-, post-grad passes
    "use_pre_grad_passes": False,
    "use_joint_graph_passes": False,
    "use_post_grad_passes": False,
    # Disable dead code elimination (dce) and buffer reuse
    "use_dce": False,
    "allow_buffer_reuse": False,
}


def list_mode_options(
    mode: Optional[str] = None, dynamic: Optional[bool] = None
) -> dict[str, Any]:
    r"""Returns a dictionary describing the optimizations that each of the available
    modes passed to `torch.compile()` performs.

    The returned dictionaries are built fresh on every call, so callers may
    mutate them without affecting later calls or the module-level
    ``lite_mode_options`` preset.

    Args:
        mode (str, optional): The mode to return the optimizations for.
            If None (or an empty string), returns optimizations for all modes
        dynamic (bool, optional): Whether dynamic shape is enabled.
            Accepted for API compatibility; not consulted by this function.

    Returns:
        The option dict for ``mode``, or a mapping from every mode name to
        its option dict when no mode is given.

    Raises:
        RuntimeError: If ``mode`` is not one of the known mode names.

    Example::
        >>> torch._inductor.list_mode_options()
    """

    mode_options: dict[str, dict[str, bool]] = {
        "default": {},
        # lite backend for opt-in optimizations
        # (copied so the shared module-level preset is never handed out)
        "lite": dict(lite_mode_options),
        # enable cudagraphs
        "reduce-overhead": {
            "triton.cudagraphs": True,
        },
        # enable max-autotune
        "max-autotune-no-cudagraphs": {
            "max_autotune": True,
            "coordinate_descent_tuning": True,
        },
        # enable max-autotune
        # enable cudagraphs
        "max-autotune": {
            "max_autotune": True,
            "triton.cudagraphs": True,
            "coordinate_descent_tuning": True,
        },
    }
    try:
        return mode_options[mode] if mode else mode_options
    except KeyError as e:
        raise RuntimeError(
            f"Unrecognized mode={mode}, should be one of: {', '.join(mode_options.keys())}"
        ) from e
379
+
380
+
381
+ def list_options() -> list[str]:
382
+ r"""Returns a dictionary describing the optimizations and debug configurations
383
+ that are available to `torch.compile()`.
384
+
385
+ The options are documented in `torch._inductor.config`.
386
+
387
+ Example::
388
+
389
+ >>> torch._inductor.list_options()
390
+ """
391
+
392
+ from torch._inductor import config
393
+
394
+ current_config: dict[str, Any] = config.get_config_copy()
395
+
396
+ return list(current_config.keys())
397
+
398
+
399
def cudagraph_mark_step_begin():
    """Indicates that a new iteration of inference or training is about to begin.

    Delegates to ``mark_step_begin`` from the sibling ``cudagraph_trees``
    module, which is imported lazily on each call.
    """
    from .cudagraph_trees import mark_step_begin as _mark_step_begin

    _mark_step_begin()
404
+
405
+
406
def standalone_compile(
    gm: torch.fx.GraphModule,
    example_inputs: list[InputType],
    *,
    dynamic_shapes: Literal[
        "from_example_inputs", "from_tracing_context", "from_graph"
    ] = "from_graph",
    options: Optional[dict[str, Any]] = None,
    aot: bool = False,  # AOT mode, which uses BundledAOTAutogradCache
) -> CompiledArtifact:
    """
    Precompilation API for inductor.

    .. code-block:: python

        compiled_artifact = torch._inductor.standalone_compile(gm, args)
        compiled_artifact.save(path=path, format="binary")

        # Later on a new process
        loaded = torch._inductor.CompiledArtifact.load(path=path, format="binary")
        compiled_out = loaded(*args)

    Args:
        gm: Graph Module
        example_inputs: Inputs for the graph module
        dynamic_shapes: Where the dynamic shape information comes from.
            "from_graph" (default) uses the dynamic shapes in the passed-in
            graph module.
            "from_tracing_context" uses the dynamic shape info in the
            ambient tracing context.
            "from_example_inputs" specializes the graph on the
            example_inputs.
        options: Inductor compilation options. A falsy value (None or an
            empty dict) is replaced by a new empty dict.
        aot: AOT mode, which uses BundledAOTAutogradCache.

    Returns:
        CompiledArtifact that can be saved to disk or invoked directly.
    """
    # The implementation shares this wrapper's name; alias it so the local
    # import does not shadow the function being defined here.
    from .standalone_compile import standalone_compile as _standalone_compile_impl

    effective_options = options or {}
    return _standalone_compile_impl(
        gm,
        example_inputs,
        dynamic_shapes=dynamic_shapes,
        options=effective_options,
        aot=aot,
    )