koichi12 commited on
Commit
de8bd69
·
verified ·
1 Parent(s): 71147a9

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__init__.py +1 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/__init__.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/auto_functionalize.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/cond.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/effects.cpython-311.pyc +0 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/map.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/strict_mode.cpython-311.pyc +0 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/triton_kernel_wrap.cpython-311.pyc +0 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/wrap.cpython-311.pyc +0 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/auto_functionalize.py +261 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/cond.py +349 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/effects.py +204 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/map.py +358 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/strict_mode.py +100 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/torchbind.py +94 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py +842 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/while_loop.py +232 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/_nnapi/__pycache__/__init__.cpython-311.pyc +0 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/mkl/__init__.py +56 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/mps/__pycache__/__init__.cpython-311.pyc +0 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/nnpack/__pycache__/__init__.cpython-311.pyc +0 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/openmp/__init__.py +6 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/_sharded_tensor/__pycache__/__init__.cpython-311.pyc +0 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/_tools/__pycache__/__init__.cpython-311.pyc +0 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/autograd/__init__.py +52 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_tensors.cpython-311.pyc +0 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/events/api.py +112 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/events/handlers.py +22 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py +201 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/redirects.cpython-311.pyc +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py +375 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/handlers.py +16 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/handlers.cpython-311.pyc +0 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/subprocess_handler.cpython-311.pyc +0 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py +32 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/__init__.py +44 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/__pycache__/local_timer.cpython-311.pyc +0 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/local_timer.py +125 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/__init__.py +4 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/api/__init__.py +0 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/jit/__pycache__/__init__.cpython-311.pyc +0 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/jit/templates/__init__.py +0 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/optim/__pycache__/post_localSGD_optimizer.cpython-311.pyc +0 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/pipeline/sync/__pycache__/copy.cpython-311.pyc +0 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/pipeline/sync/__pycache__/worker.cpython-311.pyc +0 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/__pycache__/__init__.cpython-311.pyc +0 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/api.cpython-311.pyc +0 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/loss.cpython-311.pyc +0 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/style.cpython-311.pyc +0 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/api.py +108 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .cond import cond
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (263 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/auto_functionalize.cpython-311.pyc ADDED
Binary file (11.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/cond.cpython-311.pyc ADDED
Binary file (18.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/effects.cpython-311.pyc ADDED
Binary file (10.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/map.cpython-311.pyc ADDED
Binary file (21 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/strict_mode.cpython-311.pyc ADDED
Binary file (5.79 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/triton_kernel_wrap.cpython-311.pyc ADDED
Binary file (42 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/__pycache__/wrap.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/auto_functionalize.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.utils._pytree as pytree
5
+ from torch import Tensor
6
+ from torch._C import DispatchKey
7
+ from torch._ops import HigherOrderOperator
8
+ from torch._prims_common import clone_preserve_strides
9
+ from torch._subclasses.fake_tensor import FakeTensorMode
10
+ from torch.fx.experimental.proxy_tensor import (
11
+ disable_proxy_modes_tracing,
12
+ ProxyTorchDispatchMode,
13
+ track_tensor_tree,
14
+ )
15
+
16
+
17
+ # NOTE: [auto-functionalizing custom ops]
18
+ # Users may wish to torch.compile custom ops that mutate their inputs.
19
+ # torch.compile will automatically support this op without anyone needing
20
+ # to provide a functionalization kernel for it. Here's how.
21
+ #
22
+ # Let's say we have a hypothetical mylib::sin_(Tensor(a!) x) -> ()
23
+ # op. First, when FakeTensor sees this op:
24
+ # - If the schema says it returns nothing, we can generate a trivial
25
+ # FakeTensor rule for it (that returns nothing).
26
+ # - Otherwise, the user needs to provide a FakeTensor rule (abstract impl)
27
+ #
28
+ # Next, when Python FunctionalTensor sees the op, it will functionalize
29
+ # it by emitting a call to an auto_functionalize(op, ["x"], {"x": ...})
30
+ # HOP and replacing the mutated inputs with corresponding outputs of this HOP.
31
+ # This HOP effectively runs the functional version of the op when
32
+ # called: it clones inputs that will be mutated, runs the op, and
33
+ # then returns (output, Tensors with the new values)
34
+
35
+
36
class AutoFunctionalized(HigherOrderOperator):
    """auto_functionalized(_mutable_op, **kwargs)

    A higher-order operator that evaluates a "functional" variant of
    ``_mutable_op``: every argument that the operator's schema marks as
    mutable is cloned first, the op runs against the clones, and the call
    returns the op's own outputs followed by the cloned (now updated)
    tensors.

    Only operators accepted by ``can_auto_functionalize`` may be used here;
    see that function for the exact restrictions (many could be lifted on
    request).

    The leading underscore on ``_mutable_op`` keeps the parameter name from
    colliding with operator kwarg names forwarded through ``**kwargs``.
    """

    def __init__(self):
        super().__init__("auto_functionalized")

    def __call__(
        self,
        _mutable_op: torch._ops.OpOverload,
        **kwargs: Dict[str, Any],
    ) -> Tuple[Any, Tuple[Tensor, ...]]:
        # Guard: only ops that satisfy the auto-functionalization
        # restrictions are allowed to reach this HOP.
        assert can_auto_functionalize(_mutable_op)
        assert isinstance(kwargs, dict)
        return super().__call__(_mutable_op, **kwargs)
65
+
66
+
67
+ auto_functionalized = AutoFunctionalized()
68
+
69
+
70
+ def can_auto_functionalize(op: torch._ops.OperatorBase) -> bool:
71
+ if not isinstance(op, torch._ops.OpOverload):
72
+ return False
73
+
74
+ if torch._library.utils.is_builtin(op):
75
+ # We control the built-ins. These may (in rare cases)
76
+ # do input metadata mutation (which we have banned on custom ops)
77
+ return False
78
+ schema = op._schema
79
+ if not schema.is_mutable:
80
+ return False
81
+ schema = op._schema
82
+
83
+ for arg in schema.arguments:
84
+ if arg.alias_info is None:
85
+ continue
86
+ if not arg.alias_info.is_write:
87
+ continue
88
+ if type(arg.type) is torch.TensorType:
89
+ continue
90
+ if (
91
+ type(arg.type) is torch.OptionalType
92
+ and type(arg.type.getElementType()) is torch.TensorType
93
+ ):
94
+ continue
95
+ # Not yet supported: other Tensor types. This includes things like
96
+ # Tensor[], Tensor?[], Tensor[]?.
97
+ return False
98
+
99
+ # The returns must not alias anything
100
+ for ret in schema.returns:
101
+ if ret.alias_info is None and type(ret.type) is torch.TensorType:
102
+ continue
103
+ # Not yet supported: List[Tensor] return.
104
+ return False
105
+ return True
106
+
107
+
108
@auto_functionalized.py_impl(DispatchKey.CompositeExplicitAutograd)
def auto_functionalized_dense(
    _mutable_op: torch._ops.OpOverload,
    _only_clone_these_tensors: Optional[Tuple[str, ...]] = None,
    **kwargs: Dict[str, Any],
) -> Tuple[Any, Tuple[Tensor, ...]]:
    """Eager implementation of the auto_functionalized HOP.

    Clones each mutable argument (unless excluded via
    ``_only_clone_these_tensors``), runs ``_mutable_op`` on the copies, and
    returns the op's outputs followed by the (mutated) copies.
    """
    updated_kwargs = dict(**kwargs)
    mutated_values = []

    for arg_name in get_mutable_arg_names(_mutable_op):
        value = kwargs[arg_name]
        # When an explicit clone-list is given, arguments outside it are
        # passed through unchanged (and may be mutated in place).
        keep_original = (
            _only_clone_these_tensors is not None
            and arg_name not in _only_clone_these_tensors
        )
        if not keep_original and value is not None:
            value = clone_preserve_strides(value)
        updated_kwargs[arg_name] = value
        mutated_values.append(value)

    op_out = _mutable_op(**updated_kwargs)

    # Flatten into one tuple: op outputs first, then the mutated copies.
    if isinstance(op_out, tuple):
        return (*op_out, *mutated_values)  # type: ignore[return-value]
    return (op_out, *mutated_values)  # type: ignore[return-value]
137
+
138
+
139
@auto_functionalized.py_impl(FakeTensorMode)
def auto_functionalized_fake(
    mode,
    _mutable_op: torch._ops.OpOverload,
    **kwargs: Dict[str, Any],
) -> Tuple[Any, Tuple[Tensor, ...]]:
    """FakeTensorMode rule: just run the dense implementation under the mode."""
    with mode:
        return auto_functionalized_dense(_mutable_op, **kwargs)
148
+
149
+
150
@auto_functionalized.py_impl(ProxyTorchDispatchMode)
def auto_functionalized_proxy(
    mode,
    _mutable_op: torch._ops.OpOverload,
    **kwargs: Dict[str, Any],
) -> Tuple[Any, Tuple[Tensor, ...]]:
    """Proxy-tracing rule: record an auto_functionalized node in the graph."""
    # Without tracing there is nothing to record; fall through to the HOP.
    if not mode.enable_tracing:
        return auto_functionalized(_mutable_op, **kwargs)

    # Compute concrete outputs with proxy tracing suspended, so the inner op
    # call is not itself recorded.
    with disable_proxy_modes_tracing():
        real_out = auto_functionalized(_mutable_op, **kwargs)

    traced_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs)
    node_proxy = mode.tracer.create_proxy(
        "call_function",
        auto_functionalized,
        (_mutable_op,),
        traced_kwargs,
    )
    # Associate the concrete outputs with the freshly created proxy node.
    return track_tensor_tree(real_out, node_proxy, constant=None, tracer=mode.tracer)
171
+
172
+
173
+ auto_functionalized.fallthrough(DispatchKey.AutogradCPU)
174
+ auto_functionalized.fallthrough(DispatchKey.AutogradCUDA)
175
+
176
+
177
def get_mutable_arg_names(op: torch._ops.OpOverload) -> List[str]:
    """Return the names of ``op``'s arguments whose schema marks them as
    written to (i.e. mutated by the op)."""
    names: List[str] = []
    for arg in op._schema.arguments:
        alias = arg.alias_info
        if alias is not None and alias.is_write:
            names.append(arg.name)
    return names
188
+
189
+
190
def do_auto_functionalize(
    op: torch._ops.OpOverload, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> Any:
    """Functionalizes a call to op(*args, **kwargs) by emitting a call to
    `outs = auto_functionalized(op, normalized_kwargs)`
    and replacing the mutated (args, kwargs) with the corresponding outputs.

    The normalized_kwargs are just the (args, kwargs), but all in kwarg form.
    This makes handling easier for the auto_functionalized HOP.
    """
    from torch._subclasses.functional_tensor import PythonFunctionalizeAPI

    ctx = PythonFunctionalizeAPI()

    # All of the (args, kwargs), but all as kwargs. The names for the
    # args come from the schema. This makes it easier for us to work with them.
    normalized_kwargs = {}
    schema = op._schema
    for idx, arg in enumerate(schema.arguments):
        # NB: torch_dispatch kwargs are the args defined as kwarg-only in the schema
        if arg.name in kwargs:
            normalized_kwargs[arg.name] = kwargs[arg.name]
        elif idx < len(args):
            # if it's out of bounds we don't need to do anything
            # as it means the optional arg was passed with its default
            # value
            normalized_kwargs[arg.name] = args[idx]
        else:
            normalized_kwargs[arg.name] = arg.default_value

    # Strip FunctionalTensor wrappers and run the HOP one dispatch key below,
    # so we do not re-enter functionalization.
    unwrapped_kwargs = ctx.unwrap_tensors(normalized_kwargs)  # type: ignore[arg-type]
    with ctx.redispatch_to_next():
        unwrapped_outs = auto_functionalized(
            op, **unwrapped_kwargs  # type: ignore[arg-type]
        )

    # List of the name of args that get mutated (according to the schema)
    # NOTE(review): assumed non-empty on this path (the op is mutable), since
    # the negative slices below would not behave as intended for length 0.
    mutable_args_names = get_mutable_arg_names(op)

    # The HOP's output layout is (op outputs..., updated mutated inputs...);
    # split it back into the two halves.
    unwrapped_actual_out: Union[Any, Tuple[Any]] = unwrapped_outs[
        : -len(mutable_args_names)
    ]
    unwrapped_mutable_out = unwrapped_outs[-len(mutable_args_names) :]

    if len(op._schema.returns) == 0:
        # A no-return op still contributes a single leading None slot.
        assert unwrapped_actual_out[0] is None
        unwrapped_actual_out = None
    elif len(op._schema.returns) == 1:
        # Single-return ops are unpacked out of the 1-tuple.
        assert len(unwrapped_actual_out) == 1
        unwrapped_actual_out = unwrapped_actual_out[0]
    else:
        assert len(unwrapped_actual_out) == len(op._schema.returns)

    # Propagate the new values back onto the original wrapped inputs so the
    # caller observes the mutation through the functional layer.
    for name, unwrapped_out in zip(mutable_args_names, unwrapped_mutable_out):
        # Can be None if input was `Tensor(a!)?`
        if unwrapped_out is None:
            continue
        assert isinstance(unwrapped_out, torch.Tensor)
        orig_arg = normalized_kwargs[name]
        ctx.replace(orig_arg, unwrapped_out)
        ctx.commit_update(orig_arg)
        ctx.sync(orig_arg)

    return ctx.wrap_tensors(unwrapped_actual_out)  # type: ignore[arg-type]
254
+
255
+
256
@auto_functionalized.py_functionalize_impl
def auto_functionalized_func(ctx, _mutable_op, **kwargs):
    # Functionalization rule: strip the functional wrappers, re-dispatch the
    # HOP at the next key, then re-wrap the results for the caller.
    plain_kwargs = ctx.unwrap_tensors(kwargs)
    with ctx.redispatch_to_next():
        outs = auto_functionalized(_mutable_op, **plain_kwargs)
    return ctx.wrap_tensors(outs)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/cond.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch._subclasses.functional_tensor
3
+
4
+ import torch.utils._pytree as pytree
5
+
6
+ from torch._C import DispatchKey
7
+ from torch._C._functorch import (
8
+ _add_batch_dim,
9
+ get_unwrapped,
10
+ is_batchedtensor,
11
+ maybe_get_bdim,
12
+ )
13
+ from torch._functorch.utils import exposed_in
14
+
15
+ from torch._higher_order_ops.utils import (
16
+ _has_potential_branch_input_alias,
17
+ _has_potential_branch_input_mutation,
18
+ _set_compilation_env,
19
+ autograd_not_implemented,
20
+ reenter_make_fx,
21
+ UnsupportedAliasMutationException,
22
+ )
23
+
24
+ from torch._ops import HigherOrderOperator
25
+ from torch._subclasses.fake_tensor import FakeTensorMode
26
+ from torch.fx.experimental.proxy_tensor import (
27
+ disable_proxy_modes_tracing,
28
+ ProxyTorchDispatchMode,
29
+ track_tensor_tree,
30
+ )
31
+ from torch.fx.passes.shape_prop import _extract_tensor_metadata
32
+ from torch.utils._python_dispatch import _get_current_dispatch_mode
33
+
34
+
35
+ @exposed_in("torch")
36
+ def cond(pred, true_fn, false_fn, operands):
37
+ r"""
38
+ Conditionally applies `true_fn` or `false_fn`.
39
+
40
+ .. warning::
41
+ `torch.cond` is a prototype feature in PyTorch. It has limited support for input and output types and
42
+ doesn't support training currently. Please look forward to a more stable implementation in a future version of PyTorch.
43
+ Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype
44
+
45
+ `cond` is structured control flow operator. That is, it is like a Python if-statement,
46
+ but has restrictions on `true_fn`, `false_fn`, and `operands` that enable it to be
47
+ capturable using torch.compile and torch.export.
48
+
49
+ Assuming the constraints on `cond`'s arguments are met, `cond` is equivalent to the following::
50
+
51
+ def cond(pred, true_branch, false_branch, operands):
52
+ if pred:
53
+ return true_branch(*operands)
54
+ else:
55
+ return false_branch(*operands)
56
+
57
+ Args:
58
+ pred (Union[bool, torch.Tensor]): A boolean expression or a tensor with one element,
59
+ indicating which branch function to apply.
60
+
61
+ true_fn (Callable): A callable function (a -> b) that is within the
62
+ scope that is being traced.
63
+
64
+ false_fn (Callable): A callable function (a -> b) that is within the
65
+ scope that is being traced. The true branch and false branch must
66
+ have consistent input and outputs, meaning the inputs have to be
67
+ the same, and the outputs have to be the same type and shape.
68
+
69
+ operands (Tuple of possibly nested dict/list/tuple of torch.Tensor): A tuple of inputs to the true/false functions.
70
+
71
+ Example::
72
+
73
+ def true_fn(x: torch.Tensor):
74
+ return x.cos()
75
+ def false_fn(x: torch.Tensor):
76
+ return x.sin()
77
+ return cond(x.shape[0] > 4, true_fn, false_fn, (x,))
78
+
79
+ Restrictions:
80
+ - The conditional statement (aka `pred`) must meet one of the following constraints:
81
+
82
+ - It's a `torch.Tensor` with only one element, and torch.bool dtype
83
+
84
+ - It's a boolean expression, e.g. `x.shape[0] > 10` or `x.dim() > 1 and x.shape[1] > 10`
85
+
86
+ - The branch function (aka `true_fn`/`false_fn`) must meet all of the following constraints:
87
+
88
+ - The function signature must match with operands.
89
+
90
+ - The function must return a tensor with the same metadata, e.g. shape,
91
+ dtype, etc.
92
+
93
+ - The function cannot have in-place mutations on inputs or global variables.
94
+ (Note: in-place tensor operations such as `add_` for intermediate results
95
+ are allowed in a branch)
96
+
97
+ .. warning::
98
+ Temporal Limitations:
99
+
100
+ - `cond` only supports **inference** right now. Autograd will be supported in the future.
101
+
102
+ - The **output** of branches must be a **single Tensor**. Pytree of tensors will be supported in the future.
103
+
104
+ """
105
+
106
+ if torch.compiler.is_dynamo_compiling():
107
+ return cond_op(pred, true_fn, false_fn, operands)
108
+
109
+ def _validate_input(pred, true_fn, false_fn, operands):
110
+ if not isinstance(pred, (bool, torch.Tensor, torch.SymBool)):
111
+ raise RuntimeError(f"Expected pred to be bool or tensor, but got {pred}.")
112
+
113
+ if isinstance(pred, torch.Tensor) and pred.numel() != 1:
114
+ raise RuntimeError(
115
+ f"Expected pred to be bool or single-element tensor, but got {pred}."
116
+ )
117
+
118
+ if not callable(true_fn) or not callable(false_fn):
119
+ raise RuntimeError("Expect both branches to be callbale.")
120
+
121
+ if not isinstance(operands, (tuple, list)) or pytree.tree_any(
122
+ lambda t: not isinstance(t, torch.Tensor), operands
123
+ ):
124
+ raise RuntimeError(
125
+ "Expect operands to be a tuple of possibly nested dict/list/tuple that only"
126
+ f"consists of tensor leaves, but got {operands}."
127
+ )
128
+
129
+ _validate_input(pred, true_fn, false_fn, operands)
130
+
131
+ if not torch._dynamo.is_dynamo_supported():
132
+ raise RuntimeError("torch.cond requires dynamo support.")
133
+
134
+ with _set_compilation_env():
135
+ with torch._dynamo.utils.disable_cache_limit():
136
+ return torch.compile(cond_op, backend="eager", fullgraph=True)(
137
+ pred, true_fn, false_fn, operands
138
+ )
139
+
140
+
141
+ """
142
+ We're going to define a `cond_op` operation.
143
+ In order to do this, we need implementations for each of the dispatch keys.
144
+ """
145
+ cond_op = HigherOrderOperator("cond")
146
+
147
+
148
def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands):
    # Trace both branches into FX sub-graphs, attach them as modules on the
    # tracer root, and record one `conditional` call_function node whose
    # concrete value comes from actually running a branch.
    assert isinstance(
        operands, (list, tuple)
    ), "Cond operands must be a list or tuple of tensors"
    assert all(
        isinstance(o, torch.Tensor) for o in operands
    ), "Cond operands must be a list of tensors"

    pre_dispatch = getattr(proxy_mode, "pre_dispatch", False)

    # Trace each branch with proxy modes disabled so branch internals are not
    # recorded into the outer graph.
    with disable_proxy_modes_tracing():
        true_graph = reenter_make_fx(true_fn, pre_dispatch)(*operands)
        false_graph = reenter_make_fx(false_fn, pre_dispatch)(*operands)

    true_outs = []
    false_outs = []
    for node in true_graph.graph.nodes:
        if node.op == "output":
            true_outs.extend(node.args)

    for node in false_graph.graph.nodes:
        if node.op == "output":
            false_outs.extend(node.args)

    # Both branches must produce the same number of outputs...
    flat_true_outs = pytree.arg_tree_leaves(*true_outs)
    flat_false_outs = pytree.arg_tree_leaves(*false_outs)
    if len(flat_true_outs) != len(flat_false_outs):
        raise torch._dynamo.exc.CondOpArgsMismatchError(
            f"Expected to return same number of outputs but got:"
            f"\n {true_fn.__name__} returns {len(flat_true_outs)} item(s)"
            f"\n {false_fn.__name__} returns {len(flat_false_outs)} item(s)"
        )

    # ...and matching tensor metadata, position by position.
    for i in range(0, len(flat_true_outs)):
        true_out = flat_true_outs[i]
        false_out = flat_false_outs[i]
        if true_out.meta["tensor_meta"] != false_out.meta["tensor_meta"]:
            raise torch._dynamo.exc.CondOpArgsMismatchError(
                f"Expected each tensor to have same metadata but got:"
                f"\n {true_fn.__name__} returns {true_out.meta['tensor_meta']}"
                f"\n {false_fn.__name__} returns {false_out.meta['tensor_meta']}"
            )

    # There are probably better ways - I know that create_arg has some self incrementing name
    # magic to it, but since we explicitly have to get the name for register_module,
    # I was not sure how to do that. This kinda simulates it.
    next_name = None
    i = 0
    while not next_name:
        candidate = f"true_graph_{i}"
        if hasattr(proxy_mode.tracer.root, candidate):
            i += 1
        else:
            next_name = candidate

    true_name = next_name
    false_name = f"false_graph_{i}"
    assert not hasattr(proxy_mode.tracer.root, false_name)

    proxy_mode.tracer.root.register_module(true_name, true_graph)
    proxy_mode.tracer.root.register_module(false_name, false_graph)

    args = (pred, true_graph, false_graph, operands)

    proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, args)

    out_proxy = proxy_mode.tracer.create_proxy(
        "call_function", func_overload, proxy_args, {}, name="conditional"
    )

    # At this point, we're *guaranteed* that whether an output came from the
    # true or false branch is indistinguishable. So, as this is just for tracing
    # purposes, choose the true branch.

    # TODO: Uhh.... it shouldn't matter, but changing this to true_fn results in
    # a FakeTensorMode error :
    # `Current active mode <class 'torch._subclasses.fake_tensor.FakeTensorMode'> not registered`
    # TODO Sometimes the operands are not completely FakeTensor, something seems went wrong in
    # dynamo? Because of that it runs real computation sometimes and re-triggering downstream dispatch keys.
    out = false_fn(*operands)

    return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
230
+
231
+
232
@cond_op.py_impl(DispatchKey.CompositeExplicitAutograd)
def cond_op_dense(pred, true_fn, false_fn, operands):
    """Eager (dense) implementation of cond: an ordinary Python branch."""
    mode = _get_current_dispatch_mode()
    assert mode is None, "Mode should never be enabled for CPU/CUDA key"
    chosen_branch = true_fn if pred else false_fn
    return chosen_branch(*operands)
240
+
241
+
242
# Autograd is not implemented for cond yet; deferred_error makes the failure
# lazy, raised only if a backward pass actually reaches this op.
cond_op.py_impl(DispatchKey.Autograd)(
    autograd_not_implemented(cond_op, deferred_error=True)
)
245
+
246
+
247
@cond_op.py_impl(ProxyTorchDispatchMode)
def inner(mode, pred, true_fn, false_fn, operands):
    # With tracing disabled there is nothing to record; fall through to the
    # HOP. Otherwise capture a `conditional` node via trace_cond.
    if not mode.enable_tracing:
        return cond_op(pred, true_fn, false_fn, operands)
    return trace_cond(mode, cond_op, pred, true_fn, false_fn, operands)
253
+
254
+
255
@cond_op.py_impl(FakeTensorMode)
def cond_fake_tensor_mode(mode, pred, true_fn, false_fn, operands):
    # FakeTensorMode rule: run BOTH branches under the fake mode, check that
    # they agree in output count and tensor metadata, and return the true
    # branch's outputs (either branch would do; shapes must match).
    with mode:
        true_outs = true_fn(*operands)
        flat_true_outs = pytree.tree_leaves(true_outs)
        flat_false_outs = pytree.tree_leaves(false_fn(*operands))
    if len(flat_true_outs) != len(flat_false_outs):
        raise RuntimeError("Unmatched number of outputs from cond() branches.")

    for true_out, false_out in zip(flat_true_outs, flat_false_outs):
        true_meta = _extract_tensor_metadata(true_out)
        false_meta = _extract_tensor_metadata(false_out)
        if true_meta != false_meta:
            raise torch._dynamo.exc.CondOpArgsMismatchError(
                f"Expected each tensor to have same metadata but got:"
                f"\n {true_fn.__name__} returns {true_meta}"
                f"\n {false_fn.__name__} returns {false_meta}"
            )
    return true_outs
274
+
275
+
276
@cond_op.py_functionalize_impl
def cond_func(ctx, pred, true_fn, false_fn, inputs):
    # Functionalization rule for cond: unwrap functional tensors, verify that
    # neither branch mutates or aliases its inputs (unsupported), then
    # re-dispatch the HOP below functionalization and re-wrap the results.
    unwrapped_inputs = ctx.unwrap_tensors(inputs)
    unwrapped_pred = ctx.unwrap_tensors(pred)
    with ctx.redispatch_to_next() as m:
        functional_true = ctx.functionalize(true_fn)
        functional_false = ctx.functionalize(false_fn)
        pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
        for branch in [functional_true, functional_false]:
            if _has_potential_branch_input_mutation(
                branch, unwrapped_inputs, pre_dispatch=pre_dispatch
            ):
                raise UnsupportedAliasMutationException(
                    "One of torch.cond branch might be modifying the input!"
                )
        # NOTE(review): the alias check below runs on the *original* branches
        # while the mutation check above used the functionalized wrappers --
        # confirm this asymmetry is intentional.
        for branch in [true_fn, false_fn]:
            if _has_potential_branch_input_alias(
                branch, unwrapped_inputs, pre_dispatch=pre_dispatch
            ):
                raise UnsupportedAliasMutationException(
                    "One of torch.cond branch might be aliasing the input!"
                )

        cond_return = cond_op(
            unwrapped_pred, functional_true, functional_false, unwrapped_inputs
        )
        return ctx.wrap_tensors(cond_return)
303
+
304
+
305
@cond_op.py_impl(torch._C._functorch.TransformType.Vmap)
def cond_batch_rule(interpreter, pred, true_fn, false_fn, inputs):
    # vmap rule for cond. If the predicate is batched, a single branch can no
    # longer be chosen per call, so both branches are evaluated and merged
    # element-wise with torch.where. Otherwise the branches themselves are
    # vmapped and normal cond dispatch proceeds on the scalar predicate.
    assert isinstance(
        inputs, (list, tuple)
    ), "Cond inputs must be a list or tuple of tensors"
    assert all(
        isinstance(i, torch.Tensor) for i in inputs
    ), "Cond inputs must be a list of tensors"

    pred_ = get_unwrapped(pred) if is_batchedtensor(pred) else pred

    # unbatched tensors are not vmapped
    tensors, in_dims = zip(
        *[
            (get_unwrapped(t), maybe_get_bdim(t)) if is_batchedtensor(t) else (t, None)
            for t in inputs
        ]
    )

    if is_batchedtensor(pred):
        # prepend "pred" and vmap everything
        tensors = (pred_,) + tensors
        in_dims = (0,) + in_dims

        def fn(p, *args):
            t = true_fn(*args)
            f = false_fn(*args)
            # NOTE(review): only the first output of each branch is merged
            # here, which appears to assume single-output branches -- confirm.
            return torch.where(p, t[0], f[0])

        with interpreter.lower():
            result = torch.vmap(fn, in_dims=in_dims)(*tensors)

    else:
        # predicate is known at this stage and it is a boolean expression or a
        # tensor with one element.
        true_fn = torch.vmap(true_fn, in_dims=in_dims)
        false_fn = torch.vmap(false_fn, in_dims=in_dims)

        with interpreter.lower():
            result = cond_op(pred, true_fn, false_fn, tensors)

    if not isinstance(result, tuple):
        result = (result,)
    lvl = interpreter.level()
    # Re-attach the batch dimension (dim 0) at the current vmap level.
    return tuple([_add_batch_dim(r, 0, lvl) for r in result])
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/effects.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import Any, Dict, Optional, Tuple
3
+
4
+ import torch
5
+ import torch.utils._pytree as pytree
6
+ from torch._C import DispatchKey
7
+ from torch._ops import HigherOrderOperator
8
+ from torch._subclasses.fake_tensor import FakeTensorMode
9
+ from torch.fx.experimental.proxy_tensor import (
10
+ disable_proxy_modes_tracing,
11
+ ProxyTorchDispatchMode,
12
+ track_tensor_tree,
13
+ )
14
+
15
+
16
+ class _EffectType(Enum):
17
+ ORDERED = "Ordered"
18
+
19
+
20
+ SIDE_EFFECTS: Dict[torch._ops.OpOverload, _EffectType] = {
21
+ torch.ops.aten._print.default: _EffectType.ORDERED,
22
+ }
23
+
24
+
25
+ class WithEffects(HigherOrderOperator):
26
+ """
27
+ with_effects(token, op, args, kwargs) -> (new_token, op_results)
28
+
29
+ This HOP helps ensure ordering between side effectful ops like prints or ops
30
+ using torchbind objects. This is needed to ensure a traced graph from
31
+ AOTAutograd is functional so that future optimization passes do not reorder
32
+ these operators. This is done through threading "effect tokens" through the
33
+ graph to enforce data dependence between side effectful ops.
34
+
35
+ The tokens are basically dummy values (torch.tensor([])). We create a token
36
+ per "effect type", which are enumerated in the _EffectType enum.
37
+ """
38
+
39
+ def __init__(self):
40
+ super().__init__("with_effects")
41
+
42
+ def __call__(
43
+ self,
44
+ token,
45
+ op: torch._ops.OpOverload,
46
+ *args: Tuple[Any, ...],
47
+ **kwargs: Dict[str, Any],
48
+ ) -> Tuple[Any, ...]:
49
+ assert isinstance(op, torch._ops.OpOverload)
50
+ assert not has_aliasing(op), "Ops with aliasing is not supported"
51
+ assert has_effects(op, args, kwargs)
52
+ assert isinstance(kwargs, dict)
53
+ return super().__call__(token, op, *args, **kwargs)
54
+
55
+
56
+ with_effects = WithEffects()
57
+
58
+
59
+ def has_aliasing(op: torch._ops.OpOverload):
60
+ for arg in op._schema.arguments:
61
+ if arg.alias_info is not None:
62
+ return True
63
+ for arg in op._schema.returns:
64
+ if arg.alias_info is not None:
65
+ return True
66
+ return False
67
+
68
+
69
+ def has_effects(op, args, kwargs) -> bool:
70
+ return (
71
+ isinstance(op, torch._ops.OpOverload)
72
+ and not has_aliasing(op)
73
+ and get_effect_key(op, args, kwargs) is not None
74
+ )
75
+
76
+
77
+ def get_effect_key(op, args, kwargs) -> Optional[_EffectType]:
78
+ if op in SIDE_EFFECTS:
79
+ return SIDE_EFFECTS[op]
80
+
81
+ for arg in args:
82
+ if isinstance(arg, torch.ScriptObject):
83
+ return _EffectType.ORDERED
84
+
85
+ return None
86
+
87
+
88
+ @with_effects.py_impl(DispatchKey.CompositeExplicitAutograd)
89
+ def with_effects_dense(
90
+ token: torch.Tensor,
91
+ op: torch._ops.OpOverload,
92
+ *args: Tuple[Any, ...],
93
+ **kwargs: Dict[str, Any],
94
+ ) -> Tuple[torch.Tensor, ...]:
95
+ out = op(*args, **kwargs)
96
+ new_token = torch.tensor([])
97
+ if isinstance(out, tuple):
98
+ return (new_token, *out)
99
+ return (new_token, out)
100
+
101
+
102
+ @with_effects.py_impl(FakeTensorMode)
103
+ def with_effects_fake(
104
+ mode,
105
+ token: torch.Tensor,
106
+ op: torch._ops.OpOverload,
107
+ *args: Tuple[Any, ...],
108
+ **kwargs: Dict[str, Any],
109
+ ) -> Tuple[torch.Tensor, ...]:
110
+ with mode:
111
+ result = with_effects_dense(token, op, *args, **kwargs)
112
+ return result
113
+
114
+
115
+ @with_effects.py_impl(ProxyTorchDispatchMode)
116
+ def with_effects_proxy(
117
+ mode,
118
+ token: torch.Tensor,
119
+ op: torch._ops.OpOverload,
120
+ *args: Tuple[Any, ...],
121
+ **kwargs: Dict[str, Any],
122
+ ) -> Tuple[torch.Tensor, ...]:
123
+ if not mode.enable_tracing:
124
+ return with_effects(token, op, *args, **kwargs)
125
+
126
+ with disable_proxy_modes_tracing():
127
+ out = with_effects(token, op, *args, **kwargs)
128
+
129
+ proxy_token = mode.tracer.unwrap_proxy(token)
130
+ proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args)
131
+ proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs)
132
+
133
+ out_proxy = mode.tracer.create_proxy(
134
+ "call_function",
135
+ with_effects,
136
+ (proxy_token, op, *proxy_args),
137
+ proxy_kwargs,
138
+ )
139
+ result = track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer)
140
+ return result
141
+
142
+
143
+ with_effects.fallthrough(DispatchKey.AutogradCPU)
144
+ with_effects.fallthrough(DispatchKey.AutogradCUDA)
145
+
146
+
147
+ def handle_effects(
148
+ allow_token_discovery: bool,
149
+ tokens: Dict[_EffectType, torch.Tensor],
150
+ op: torch._ops.OpOverload,
151
+ args: Tuple[Any, ...],
152
+ kwargs: Dict[str, Any],
153
+ ) -> Any:
154
+ """
155
+ Args:
156
+ allow_token_discovery: Whether or not we are discovering tokens. If this
157
+ is true, we will create a token for every side effect type seen that
158
+ does not have a token assigned yet. If this is false, the tokens
159
+ should've all been created ahead of time, so we will error if there is
160
+ no token mapping to every effect type.
161
+
162
+ tokens: Map of effect type to tokens. This is to chain operators of the
163
+ same effects together so that they do not get reordered in later
164
+ optimization passes.
165
+ """
166
+
167
+ # Get a token. We can't do `tokens.get(op, torch.tensor([]))` because
168
+ # this will create an empty tensor during proxy mode tracing if the token
169
+ # doesn't exist. But the tokens should always exist during proxy mode tracing.
170
+ key = get_effect_key(op, args, kwargs)
171
+ assert key is not None
172
+ if key not in tokens:
173
+ assert allow_token_discovery, f"Could not find a token for effect {key}"
174
+ tokens[key] = torch.tensor([])
175
+ token = tokens[key]
176
+
177
+ from torch._subclasses.functional_tensor import PythonFunctionalizeAPI
178
+
179
+ ctx = PythonFunctionalizeAPI()
180
+
181
+ unwrapped_token = ctx.unwrap_tensors([token])[0] # type: ignore[arg-type]
182
+ unwrapped_args = ctx.unwrap_tensors(args) # type: ignore[arg-type]
183
+ unwrapped_kwargs = ctx.unwrap_tensors(kwargs) # type: ignore[arg-type]
184
+ with ctx.redispatch_to_next():
185
+ (new_token, *unwrapped_outs) = with_effects(
186
+ unwrapped_token, op, *unwrapped_args, **unwrapped_kwargs # type: ignore[arg-type]
187
+ )
188
+
189
+ if len(op._schema.returns) == 0:
190
+ assert unwrapped_outs[0] is None
191
+ unwrapped_outs = None # type: ignore[assignment]
192
+ elif len(op._schema.returns) == 1:
193
+ assert len(unwrapped_outs) == 1
194
+ unwrapped_outs = unwrapped_outs[0]
195
+ else:
196
+ assert len(unwrapped_outs) == len(op._schema.returns)
197
+
198
+ # Add the newly created token into the tokens map for a following call to
199
+ # use this token.
200
+ wrapped_token = ctx.wrap_tensors(new_token)
201
+ assert isinstance(wrapped_token, torch.Tensor)
202
+ tokens[key] = wrapped_token
203
+
204
+ return ctx.wrap_tensors(unwrapped_outs) # type: ignore[arg-type]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/map.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils._pytree as pytree
3
+ from torch._C import DispatchKey
4
+ from torch._dispatch.python import suspend_functionalization
5
+ from torch._functorch.aot_autograd import AOTConfig, create_joint, from_fun
6
+
7
+ from torch._higher_order_ops.utils import (
8
+ _has_potential_branch_input_alias,
9
+ _has_potential_branch_input_mutation,
10
+ reenter_make_fx,
11
+ UnsupportedAliasMutationException,
12
+ )
13
+ from torch._ops import HigherOrderOperator
14
+ from torch._subclasses.fake_tensor import FakeTensorMode
15
+ from torch._subclasses.functional_tensor import (
16
+ disable_functional_mode,
17
+ FunctionalTensor,
18
+ )
19
+ from torch.fx.experimental.proxy_tensor import (
20
+ disable_proxy_modes_tracing,
21
+ make_fx,
22
+ ProxyTorchDispatchMode,
23
+ track_tensor_tree,
24
+ )
25
+ from torch.multiprocessing.reductions import StorageWeakRef
26
+
27
+
28
+ # TODO: We add this to prevent dymamo from tracing into map_wrapper,
29
+ # remove the wrapper call when it's ready.
30
+ class MapWrapper(HigherOrderOperator):
31
+ def __call__(self, xs, *args):
32
+ return map_wrapper(xs, *args)
33
+
34
+
35
+ map = MapWrapper("map")
36
+ map_impl = HigherOrderOperator("map_impl")
37
+
38
+ dummy_aot_config = AOTConfig(
39
+ fw_compiler=None, # type: ignore[arg-type]
40
+ bw_compiler=None, # type: ignore[arg-type]
41
+ partition_fn=None, # type: ignore[arg-type]
42
+ decompositions={},
43
+ num_params_buffers=0,
44
+ aot_id=0,
45
+ keep_inference_input_mutations=False,
46
+ )
47
+
48
+
49
+ def create_fw_bw_graph(f, num_mapped_args, *args):
50
+ mapped_xs = args[:num_mapped_args]
51
+ pos_args = args[num_mapped_args:]
52
+
53
+ # Note: We create "clean" environments for make_fx by suspending all dispatch keys
54
+ # between Autograd and Python key. Currently, we only suspend functionalization but more can be
55
+ # added when required. Will encounter two problems if we don't suspend functionalization:
56
+ #
57
+ # 1. make_fx fails to capture operations on input: the inputs are wrapped as _to_functional_tensor_wrapper,
58
+ # but they will be unwrapped before entering ProxyTorchDispatchMode as part of the dispatching.
59
+ # However, it's the outside wrapper that tracer creates proxies for. This casuses tracer fail to
60
+ # fetch the proxy for the inputs and fail to capture any operations on them.
61
+ #
62
+ # 2. make_fx fails to capture output: the outputs after ProxyTorchDispatchMode are further
63
+ # wrapped as FunctionalTensorWrapper in Functionalize key after return. However, the tracer
64
+ # only associates the inner tensor with proxy in ProxyTorchDispatchMode. Therefore,
65
+ # when creating the output node, it fails to associate the wrapped tensor with its proxy.
66
+ # Instead, it will create _tensor_constant as output.
67
+
68
+ with suspend_functionalization(), disable_functional_mode():
69
+ with disable_proxy_modes_tracing():
70
+
71
+ def _from_fun(t):
72
+ if isinstance(t, torch.Tensor):
73
+ if t.dtype != torch.bool:
74
+ return torch.empty_strided(
75
+ t.size(),
76
+ t.stride(),
77
+ dtype=t.dtype,
78
+ requires_grad=t.requires_grad,
79
+ )
80
+ else:
81
+ # clone of a functional tensor produces a functional tensor
82
+ # but we want to avoid it so we clone a non-functional version
83
+ maybe_unfunc_t = t
84
+ if isinstance(t, FunctionalTensor):
85
+ torch._sync(t)
86
+ maybe_unfunc_t = from_fun(t)
87
+ elif torch._is_functional_tensor(t):
88
+ # need to handle both types of functionalization here:
89
+ # these are the tensors that came from the user,
90
+ # which could be either FunctionalTensorWrapper or FunctionalTensor
91
+ torch._sync(t)
92
+ maybe_unfunc_t = torch._from_functional_tensor(t)
93
+ return maybe_unfunc_t.clone()
94
+ return t
95
+
96
+ unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
97
+ example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]
98
+
99
+ example_pos_args = [
100
+ _from_fun(arg) if isinstance(arg, torch.Tensor) else arg
101
+ for arg in pos_args
102
+ ]
103
+ example_flat_out = pytree.tree_map(
104
+ _from_fun, f(*example_xs, *example_pos_args)
105
+ )
106
+ if any(
107
+ not isinstance(out, torch.Tensor)
108
+ for out in example_flat_out
109
+ if out is not None
110
+ ):
111
+ raise RuntimeError(
112
+ "Expect outputs of map only contains tensors or None. "
113
+ f"Got types {[type(out) for out in example_flat_out]}."
114
+ )
115
+ example_grad = [_from_fun(out) for out in example_flat_out]
116
+
117
+ fw_graph = make_fx(f)(*example_xs, *example_pos_args)
118
+
119
+ def joint_f(*example_args):
120
+ joint_mapped_args = example_args[:joint_num_mapped]
121
+ args = example_args[joint_num_mapped:]
122
+
123
+ mapped_input = joint_mapped_args[:num_mapped_args]
124
+ mapped_grads = joint_mapped_args[num_mapped_args:]
125
+
126
+ def fw_with_masks(*args):
127
+ fw_out = f(*args)
128
+ return fw_out, [
129
+ True
130
+ if isinstance(ret, torch.Tensor) and ret.requires_grad
131
+ else False
132
+ for ret in fw_out
133
+ ]
134
+
135
+ joint = create_joint(fw_with_masks, aot_config=dummy_aot_config)
136
+ _, grads = joint(
137
+ list(mapped_input) + list(args),
138
+ [
139
+ grad
140
+ for grad in mapped_grads
141
+ if grad is not None and grad.requires_grad
142
+ ],
143
+ )
144
+
145
+ # In order to keep map functional for backward graph,
146
+ # we clone outputs that are aliasing inputs
147
+ input_storage = {
148
+ StorageWeakRef(arg._typed_storage())
149
+ for arg in example_args
150
+ if isinstance(arg, torch.Tensor)
151
+ }
152
+
153
+ def maybe_clone(t):
154
+ if (
155
+ isinstance(t, torch.Tensor)
156
+ and StorageWeakRef(t._typed_storage()) in input_storage
157
+ ):
158
+ return t.clone()
159
+ return t
160
+
161
+ return pytree.tree_map(maybe_clone, grads)
162
+
163
+ joint_num_mapped = len(example_grad) + len(example_xs)
164
+ joint_graph = make_fx(joint_f)(*example_xs, *example_grad, *example_pos_args)
165
+ return fw_graph, joint_graph
166
+
167
+
168
+ def map_wrapper(f, xs, *args):
169
+ flat_xs, xs_spec = pytree.tree_flatten(xs)
170
+ if not all(isinstance(t, torch.Tensor) for t in flat_xs):
171
+ raise RuntimeError(f"Mapped xs can only consist of tensors. Got xs {flat_xs}.")
172
+
173
+ num_mapped_args = len(flat_xs)
174
+ shapes = [xs.shape for xs in flat_xs]
175
+ leading_dim_size = shapes[0][0]
176
+ if leading_dim_size == 0:
177
+ raise RuntimeError("Leading dimensions of mapped xs cannot be 0.")
178
+
179
+ if any(cur_shape[0] != leading_dim_size for cur_shape in shapes):
180
+ raise RuntimeError(
181
+ f"Leading dimensions of mapped xs must be consistent. Got shapes {shapes}."
182
+ )
183
+
184
+ out_spec = None
185
+
186
+ def flat_fn(*flat_args):
187
+ xs = pytree.tree_unflatten(list(flat_args[:num_mapped_args]), xs_spec)
188
+ unflattened_out = f(xs, *flat_args[num_mapped_args:])
189
+ flat_out, tmp_out_spec = pytree.tree_flatten(unflattened_out)
190
+
191
+ nonlocal out_spec
192
+ out_spec = tmp_out_spec
193
+ return flat_out
194
+
195
+ return pytree.tree_unflatten(
196
+ map_impl(flat_fn, flat_xs, args), out_spec # type: ignore[arg-type]
197
+ )
198
+
199
+
200
+ class MapAutogradOp(torch.autograd.Function):
201
+ @staticmethod
202
+ def forward(ctx, fw_graph, joint_graph, num_mapped_args, *flat_args):
203
+ ctx.save_for_backward(*flat_args)
204
+ ctx._joint_graph = joint_graph
205
+ ctx._num_mapped_args = num_mapped_args
206
+ with torch._C._AutoDispatchBelowAutograd():
207
+ return (
208
+ *map_impl(
209
+ fw_graph, flat_args[:num_mapped_args], flat_args[num_mapped_args:]
210
+ ),
211
+ )
212
+
213
+ @staticmethod
214
+ def backward(ctx, *flat_grads):
215
+ fw_args = ctx.saved_tensors
216
+ fw_mapped_args = fw_args[: ctx._num_mapped_args]
217
+ pos_args = fw_args[ctx._num_mapped_args :]
218
+
219
+ grads = map_impl(
220
+ ctx._joint_graph,
221
+ fw_mapped_args + flat_grads,
222
+ pos_args,
223
+ )
224
+ return None, None, None, *grads
225
+
226
+
227
+ def trace_map(proxy_mode, func_overload, f, xs, pos_args):
228
+ leading_dim_size = xs[0].shape[0]
229
+
230
+ example_input = _unstack_pytree(xs)[0]
231
+ body_graph = f
232
+
233
+ pre_dispatch = getattr(proxy_mode, "pre_dispatch", False)
234
+ body_graph = reenter_make_fx(body_graph, pre_dispatch)(*example_input, *pos_args)
235
+
236
+ next_name = None
237
+ i = 0
238
+ while not next_name:
239
+ candidate = f"body_graph_{i}"
240
+ if hasattr(proxy_mode.tracer.root, candidate):
241
+ i += 1
242
+ else:
243
+ next_name = candidate
244
+
245
+ proxy_mode.tracer.root.register_module(next_name, body_graph)
246
+
247
+ with disable_proxy_modes_tracing():
248
+ example_outs = body_graph(*example_input, *pos_args)
249
+
250
+ def expand_tensor(t):
251
+ if isinstance(t, torch.Tensor):
252
+ return t.expand(leading_dim_size, *t.shape)
253
+ return t
254
+
255
+ expanded_outs = pytree.tree_map(expand_tensor, example_outs)
256
+
257
+ node_args = (body_graph, list(xs), list(pos_args))
258
+ proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
259
+ out_proxy = proxy_mode.tracer.create_proxy(
260
+ "call_function", func_overload, proxy_args, {}, name="map_impl"
261
+ )
262
+ return track_tensor_tree(
263
+ expanded_outs, out_proxy, constant=None, tracer=proxy_mode.tracer
264
+ )
265
+
266
+
267
+ def _unstack_pytree(xs):
268
+ flat_xs, inspec = pytree.tree_flatten(xs)
269
+ if not all(isinstance(xs, torch.Tensor) for xs in flat_xs):
270
+ raise RuntimeError(f"Leaves of xs must be Tensor {flat_xs}")
271
+
272
+ if not all(xs.shape[0] == flat_xs[0].shape[0] for xs in flat_xs):
273
+ raise RuntimeError(
274
+ f"Leaves of xs must have same leading dimension size {[xs.shape for xs in flat_xs]}"
275
+ )
276
+
277
+ a = zip(*flat_xs)
278
+
279
+ pytrees = []
280
+ for tuple in a:
281
+ pytrees.append(pytree.tree_unflatten(tuple, inspec))
282
+ return pytrees
283
+
284
+
285
+ def _stack_pytree(pytrees):
286
+ flat_out = []
287
+ out_spec = None
288
+ for pt in pytrees:
289
+ flat_pt, out_spec = pytree.tree_flatten(pt)
290
+ flat_out.append(flat_pt)
291
+ assert out_spec is not None
292
+ b = zip(*flat_out)
293
+ stacked_out = []
294
+ for leaves in b:
295
+ if all(isinstance(leaf, torch.Tensor) for leaf in leaves):
296
+ stacked_out.append(torch.stack(leaves))
297
+ elif all(leaf is None for leaf in leaves):
298
+ # Backward graph can return None output when forward inputs doesn't require grad.
299
+ # When we eagerly execute backward graph, we need to call _stack_pytree on its output,
300
+ # therefore we need to deal with None output.
301
+ stacked_out.append(None) # type: ignore[arg-type]
302
+ else:
303
+ raise RuntimeError(f"Cannot stack {leaves}.")
304
+ return pytree.tree_unflatten(stacked_out, out_spec)
305
+
306
+
307
+ @map_impl.py_impl(DispatchKey.CompositeExplicitAutograd)
308
+ def map_dense(f, xs, pos_args):
309
+ pytrees = []
310
+ for inp in _unstack_pytree(xs):
311
+ pytrees.append(f(*inp, *pos_args))
312
+ return _stack_pytree(pytrees)
313
+
314
+
315
+ @map_impl.py_impl(DispatchKey.Autograd)
316
+ def map_autograd(f, xs, pos_args):
317
+ num_mapped_args = len(xs)
318
+ fw_graph, bw_graph = create_fw_bw_graph(f, num_mapped_args, *xs, *pos_args)
319
+ flat_out = MapAutogradOp.apply(fw_graph, bw_graph, num_mapped_args, *xs, *pos_args)
320
+ return flat_out
321
+
322
+
323
+ @map_impl.py_impl(ProxyTorchDispatchMode)
324
+ def map_proxy_torch_dispatch_mode(mode, f, xs, args):
325
+ if mode.enable_tracing:
326
+ return trace_map(mode, map_impl, f, xs, args)
327
+ else:
328
+ return map_impl(f, xs, args)
329
+
330
+
331
+ @map_impl.py_impl(FakeTensorMode)
332
+ def map_fake_tensor_mode(mode, f, xs, args):
333
+ with mode:
334
+ return map_dense(f, xs, args)
335
+
336
+
337
+ @map_impl.py_functionalize_impl
338
+ def map_functionalize(ctx, f, xs, pos_args):
339
+ unwrapped_xs = ctx.unwrap_tensors(xs)
340
+ unwrapped_args = ctx.unwrap_tensors(pos_args)
341
+ wrapped_fn = ctx.functionalize(f)
342
+
343
+ with ctx.redispatch_to_next():
344
+ with disable_proxy_modes_tracing():
345
+ example_inputs = (*_unstack_pytree(unwrapped_xs)[0], *unwrapped_args)
346
+ pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
347
+ if _has_potential_branch_input_mutation(
348
+ f, example_inputs, pre_dispatch=pre_dispatch
349
+ ):
350
+ raise UnsupportedAliasMutationException("torch.map is mutating the input!")
351
+
352
+ if _has_potential_branch_input_alias(
353
+ f, example_inputs, pre_dispatch=pre_dispatch
354
+ ):
355
+ raise UnsupportedAliasMutationException("torch.map is aliasing the input!")
356
+
357
+ map_return = map_impl(wrapped_fn, unwrapped_xs, unwrapped_args)
358
+ return ctx.wrap_tensors(map_return)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/strict_mode.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch._subclasses.functional_tensor
3
+
4
+ import torch.utils._pytree as pytree
5
+
6
+ from torch._C import DispatchKey
7
+ from torch._functorch.utils import exposed_in
8
+
9
+ from torch._higher_order_ops.utils import _set_compilation_env, autograd_not_implemented
10
+ from torch._ops import HigherOrderOperator
11
+ from torch._subclasses.fake_tensor import FakeTensorMode
12
+ from torch.fx.experimental.proxy_tensor import (
13
+ disable_proxy_modes_tracing,
14
+ make_fx,
15
+ ProxyTorchDispatchMode,
16
+ track_tensor_tree,
17
+ )
18
+ from torch.utils._python_dispatch import _get_current_dispatch_mode
19
+
20
+
21
+ @exposed_in("torch")
22
+ def strict_mode(callable, operands):
23
+ if torch.compiler.is_dynamo_compiling():
24
+ return strict_mode_op(callable, operands)
25
+
26
+ with _set_compilation_env():
27
+ with torch._dynamo.utils.disable_cache_limit():
28
+ return torch.compile(strict_mode_op, backend="eager", fullgraph=True)(
29
+ callable, operands
30
+ )
31
+
32
+
33
+ strict_mode_op = HigherOrderOperator("strict_mode")
34
+
35
+
36
+ @strict_mode_op.py_impl(DispatchKey.CompositeExplicitAutograd)
37
+ def strict_mode_op_dense(callable, operands):
38
+ mode = _get_current_dispatch_mode()
39
+ assert mode is None, "Mode should never be enabled for CPU/CUDA key"
40
+ return callable(*operands)
41
+
42
+
43
+ strict_mode_op.py_impl(DispatchKey.Autograd)(
44
+ autograd_not_implemented(strict_mode_op, deferred_error=True)
45
+ )
46
+
47
+
48
+ @strict_mode_op.py_impl(ProxyTorchDispatchMode)
49
+ def inner(mode, callable, operands):
50
+ if mode.enable_tracing:
51
+ return trace_strict_mode(mode, strict_mode_op, callable, operands)
52
+ else:
53
+ return strict_mode_op(callable, operands)
54
+
55
+
56
+ def trace_strict_mode(mode, strict_mode_op, callable, operands):
57
+ pre_dispatch = getattr(mode, "pre_dispatch", False)
58
+
59
+ with disable_proxy_modes_tracing():
60
+ graph = make_fx(callable, pre_dispatch=pre_dispatch)(*operands)
61
+
62
+ next_name = None
63
+ i = 0
64
+ while not next_name:
65
+ candidate = f"strict_graph_{i}"
66
+ if hasattr(mode.tracer.root, candidate):
67
+ i += 1
68
+ else:
69
+ next_name = candidate
70
+
71
+ graph_name = next_name
72
+ mode.tracer.root.register_module(graph_name, graph)
73
+
74
+ args = (graph, operands)
75
+
76
+ proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args)
77
+
78
+ out_proxy = mode.tracer.create_proxy(
79
+ "call_function", strict_mode_op, proxy_args, {}, name="strict_mode"
80
+ )
81
+
82
+ out = graph(*operands)
83
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer)
84
+
85
+
86
+ @strict_mode_op.py_impl(FakeTensorMode)
87
+ def strict_mode_fake_tensor_mode(mode, callable, operands):
88
+ with mode:
89
+ true_outs = callable(*operands)
90
+ return true_outs
91
+
92
+
93
+ @strict_mode_op.py_functionalize_impl
94
+ def strict_mode_func(ctx, callable, inputs):
95
+ unwrapped_inputs = ctx.unwrap_tensors(inputs)
96
+ with ctx.redispatch_to_next():
97
+ functional_callable = ctx.functionalize(callable)
98
+
99
+ cond_return = strict_mode_op(functional_callable, unwrapped_inputs)
100
+ return ctx.wrap_tensors(cond_return)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/torchbind.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+
3
+ import torch
4
+ from torch._C import DispatchKey # @manual
5
+ from torch._functorch._aot_autograd.utils import KNOWN_TYPES
6
+ from torch._higher_order_ops.utils import autograd_not_implemented
7
+ from torch._ops import HigherOrderOperator
8
+ from torch._subclasses.fake_tensor import FakeTensorMode
9
+ from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree
10
+ from torch.fx.node import has_side_effect
11
+ from torch.utils import _pytree as pytree
12
+
13
+ # The call_torchbind operator represents a method invocation on a torchbind
14
+ # object. The calling convention is:
15
+ # call_torchbind(self: ScriptObject, method_name: str, *method_args, **method_kwargs)
16
+ # We do not expect users to write this operator directly. Instead it will be
17
+ # emitted by Dynamo when tracing encounters a torchbind object.
18
+ call_torchbind = HigherOrderOperator("call_torchbind")
19
+
20
+ # Register this operator as side-effectful with FX.
21
+ # TODO: this is not really sufficient. While passes (hopefully) check
22
+ # Node.is_impure() and make good decisions, we also assume we can execute the
23
+ # graph as many times as we want without changing behavior, which is NOT true of
24
+ # ops that mutate torchbind object state.
25
+ has_side_effect(call_torchbind)
26
+
27
+ _orig_scriptmethod_call = torch.ScriptMethod.__call__
28
+
29
+
30
+ def torchbind_method_redispatch(self, *args, **kwargs):
31
+ if isinstance(self.raw_owner, torch.ScriptObject):
32
+ return call_torchbind(self.raw_owner, self.name, *args, **kwargs)
33
+ return _orig_scriptmethod_call(self, *args, **kwargs)
34
+
35
+
36
+ @contextmanager
37
+ def enable_torchbind_tracing():
38
+ """Context manager that acts as a feature flag to enable torchbind tracing
39
+ behavior. Once torchbind tracing has been stabilized, we can remove this and
40
+ turn it always on.
41
+ """
42
+ try:
43
+ KNOWN_TYPES.append(torch.ScriptObject)
44
+ torch.ScriptMethod.__call__ = torchbind_method_redispatch # type: ignore[method-assign]
45
+ yield
46
+ finally:
47
+ assert (
48
+ KNOWN_TYPES.pop() is torch.ScriptObject
49
+ ), "Someone else messed with KNOWN_TYPES during tracing, exploding."
50
+ torch.ScriptMethod.__call__ = _orig_scriptmethod_call # type: ignore[method-assign]
51
+
52
+
53
+ @call_torchbind.py_impl(DispatchKey.CompositeExplicitAutograd)
54
+ def call_torchbind_impl(obj, method, *args, **kwargs):
55
+ return _orig_scriptmethod_call(getattr(obj, method), *args, **kwargs)
56
+
57
+
58
+ @call_torchbind.py_impl(ProxyTorchDispatchMode)
59
+ def inner(mode, *args, **kwargs):
60
+ if mode.enable_tracing:
61
+ proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args)
62
+ proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs)
63
+
64
+ out_proxy = mode.tracer.create_proxy(
65
+ "call_function",
66
+ call_torchbind,
67
+ proxy_args,
68
+ proxy_kwargs,
69
+ )
70
+ out = call_torchbind_impl(*args, **kwargs)
71
+
72
+ return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer)
73
+ else:
74
+ return call_torchbind(*args, **kwargs)
75
+
76
+
77
+ # TODO: currently we just run the C++ implementation with fake tensors.
78
+ # But we should make it possible to register a fake torchbind implementation.
79
+ @call_torchbind.py_impl(FakeTensorMode)
80
+ def call_torchbind_fake(mode, *args, **kwargs):
81
+ with mode:
82
+ return call_torchbind_impl(*args, **kwargs)
83
+
84
+
85
+ call_torchbind.py_impl(DispatchKey.Autograd)(
86
+ autograd_not_implemented(call_torchbind, deferred_error=True)
87
+ )
88
+
89
+
90
+ @call_torchbind.py_functionalize_impl
91
+ def call_torchbind_func(ctx, *args, **kwargs):
92
+ args = ctx.unwrap_tensors(args)
93
+ with ctx.redispatch_to_next():
94
+ return ctx.wrap_tensors(call_torchbind(*args, **kwargs))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py ADDED
@@ -0,0 +1,842 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import threading
4
+ import warnings
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+ import torch.utils._pytree as pytree
9
+ from torch import Tensor
10
+ from torch._C import DispatchKey
11
+ from torch._ops import HigherOrderOperator
12
+ from torch._prims_common import clone_preserve_strides
13
+ from torch._subclasses.fake_tensor import FakeTensorMode
14
+ from torch.fx.experimental.proxy_tensor import (
15
+ disable_proxy_modes_tracing,
16
+ ProxyTorchDispatchMode,
17
+ track_tensor_tree,
18
+ )
19
+
20
+ log = logging.getLogger("torch._dynamo")
21
+
22
+
23
+ ###############################################################################
24
+ # Kernel Side Table
25
+
26
+
27
+ # We cannot put Triton Kernels into the FX graph as the graph nodes
28
+ # do not support arbitrary functions.
29
+ # Use a side table.
30
+ # We use two dicts so that fetching both the kernel and id are O(1)
31
class KernelSideTable:
    """Bidirectional map between Triton kernels and small integer ids.

    Triton kernels cannot be stored directly in FX graph nodes (nodes do
    not support arbitrary Python callables), so graphs record an integer
    id and this side table resolves it back to the kernel. Two dicts are
    kept so that both kernel->id and id->kernel lookups are O(1).
    """

    def __init__(self) -> None:
        # State is per-instance. (Previously these were class-level mutable
        # dicts, so every instance shared one table until reset_table()
        # shadowed them with instance attributes.)
        self.id_to_kernel: Dict[int, Any] = {}
        self.kernel_to_id: Dict[Any, int] = {}
        self.lock = threading.Lock()

    def add_kernel(self, kernel) -> int:
        """Register ``kernel`` (idempotently) and return its table index."""
        with self.lock:
            if kernel in self.kernel_to_id:
                return self.kernel_to_id[kernel]

            idx = len(self.id_to_kernel)
            self.id_to_kernel[idx] = kernel
            self.kernel_to_id[kernel] = idx
            return idx

    def get_kernel(self, idx: int):
        """Return the triton kernel stored at ``idx``."""
        # No need to lock here as fetching from dict is atomic
        assert idx in self.id_to_kernel
        return self.id_to_kernel[idx]

    def reset_table(self) -> None:
        """Clear the table (unit tests only; assumes single-threaded use)."""
        self.id_to_kernel = {}
        self.kernel_to_id = {}


kernel_side_table = KernelSideTable()
61
+
62
+
63
+ ###############################################################################
64
+ # Mutation Tracker
65
+
66
+
67
@dataclasses.dataclass(frozen=True)
class Param:
    """Reference to a kernel function parameter by positional index."""

    idx: int
70
+
71
+
72
@dataclasses.dataclass(frozen=True)
class Intermediate:
    """Reference to an op result (SSA value) in the mined TTIR graph.

    Negative indices denote "fake" intermediates synthesized during
    parsing (e.g. for ops that produce no result) rather than values
    actually present in the TTIR.
    """

    idx: int

    def fake(self):
        # Fake intermediates are allocated counting down from -1.
        return self.idx < 0
78
+
79
+
80
@dataclasses.dataclass(frozen=True)
class Op:
    """A single TTIR operation in the mined per-function op graph."""

    name: str
    # Symbol name of the callee; set iff this is a "tt.call" op.
    fn_call_name: Optional[str]
    args: List[Union[Param, Intermediate]]
    # Result value; excluded from repr to keep debug output readable.
    ret: Intermediate = dataclasses.field(repr=False)

    def __post_init__(self):
        # Invariant: a callee name is present exactly when this is a call.
        if self.name == "tt.call":
            assert self.fn_call_name is not None
        else:
            assert self.fn_call_name is None
92
+
93
+
94
def generate_ttir(kernel, kwargs):
    """
    Uses Triton's internal code generation to create TTIR.

    Returns a tuple ``(ttir_module, ordered_tensor_names)`` where
    ``ordered_tensor_names`` lists the kernel argument names (in signature
    order) whose values are tensors.

    Raises if ``kwargs`` does not match the kernel's signature or if the
    generated TTIR module fails verification.
    """
    from triton.compiler.compiler import ASTSource
    from triton.runtime.autotuner import Autotuner
    from triton.runtime.jit import JITFunction

    import torch
    from torch._subclasses.fake_tensor import FakeTensor

    if isinstance(kernel, Autotuner):
        if len(kernel.configs) > 0:
            # If we are autotuning, then it doesn't matter which version gets
            # picked for tracing purposes, so lets pick the first one
            kwargs = {**kwargs, **kernel.configs[0].kwargs}
        kernel = kernel.fn

    assert isinstance(kernel, JITFunction)

    if len(kwargs) != len(kernel.arg_names):
        raise Exception("Incorrect number of arguments passed to kernel")

    # Replace all SymExprs with a regular value for TTIR generation
    # Replace all FakeTensor with real tensors
    # These replacements are needed for triton's type, key and config functions
    ordered_args: Dict[str, Any] = {}
    for name in kernel.arg_names:
        a = kwargs[name]
        if isinstance(a, (torch.SymInt, torch.SymFloat, torch.SymBool)):
            # Any concrete value works; only the type matters for codegen.
            ordered_args[name] = 2
        elif isinstance(a, FakeTensor):
            # A tiny real tensor with the same dtype stands in for the fake.
            ordered_args[name] = torch.empty(2, dtype=a.dtype)
        else:
            ordered_args[name] = a

    ordered_tensor_names = [
        name for name, arg in ordered_args.items() if isinstance(arg, Tensor)
    ]
    specialization = kernel._get_config(*ordered_args.values())
    # Non-tensor arguments are treated as compile-time constants by Triton.
    constants = {
        i: arg
        for i, arg in enumerate(ordered_args.values())
        if not isinstance(arg, Tensor)
    }

    # Build kernel signature -- doesn't include constexpr arguments.
    signature = {
        i: kernel._type_of(kernel._key_of(arg))
        for i, arg in enumerate(ordered_args.values())
        if i not in kernel.constexprs
    }

    def get_backend():
        # NOTE(review): hard-codes the CUDA backend; this mirrors the only
        # target Triton supported via this internal API at the time.
        from triton.compiler.backends.cuda import CUDABackend
        from triton.runtime.driver import driver

        target = driver.get_current_target()
        return CUDABackend(target)

    backend = get_backend()

    options = backend.parse_options(dict())
    # triton._C.libtriton.triton.ir.load_dialects(context)
    # backend.load_dialects(context)

    src = ASTSource(kernel, signature, constants, specialization)
    ttir_module = src.make_ir(options)
    if not ttir_module.verify():
        raise Exception("Verification for TTIR module has failed")

    return ttir_module, ordered_tensor_names
166
+
167
+
168
def ttir_to_functions(ttir_module) -> Dict[str, Dict[Intermediate, List[Op]]]:
    """
    Walk the `ttir_module` bottom up to mine the `functions` from
    the structured MLIR entities representing the Triton kernel
    (mlir::Operation, mlir::Block, mlir::Region).

    Returns a mapping from function symbol name to that function's op
    graph: result value -> list of ops producing it.
    """
    functions: Dict[str, Dict[Intermediate, List[Op]]] = {}

    # block id --> op result (Intermediate) --> one or more ops
    op_stack: Dict[int, Dict[Intermediate, List[Op]]] = defaultdict(
        lambda: defaultdict(list)
    )
    region_id_to_block_ids: Dict[int, List[int]] = defaultdict(list)
    block_id_to_block_arg_ids: Dict[int, List[int]] = {}
    replacements: Dict[int, Union[Intermediate, Param]] = {}
    # MLIR value ids are arbitrary; renumber them densely from 0.
    reindex_map: Dict[int, int] = {}
    # Counter for synthesized results; counts down so fake ids are negative.
    next_fake_intermediate = 0

    def reindex(idx):
        if idx not in reindex_map:
            reindex_map[idx] = len(reindex_map)
        return reindex_map[idx]

    def mlir_to_functions(op) -> None:
        # Visitor invoked for every MLIR operation, bottom-up.
        name: str = op.get_name()
        if name == "builtin.module":
            # this wraps all tt.func ops
            return

        operand_ids: List[int] = [
            reindex(op.get_operand(i).id()) for i in range(op.get_num_operands())
        ]
        result_ids: List[int] = [
            reindex(op.get_result(i).id()) for i in range(op.get_num_results())
        ]

        child_block_ids: List[int] = []
        for i in [op.get_region(i).id() for i in range(op.get_num_regions())]:
            # as the walk is bottom-up, the region_id_to_block_ids[i]
            # must be populated by the time we process the enclosing op
            child_block_ids.extend(region_id_to_block_ids[i])

        parent_block_id = -1
        parent_block = op.get_block()
        if parent_block is not None:
            parent_block_id = parent_block.id()
            if parent_block_id not in block_id_to_block_arg_ids:
                block_id_to_block_arg_ids[parent_block_id] = []
                for i in range(parent_block.get_num_arguments()):
                    block_id_to_block_arg_ids[parent_block_id].append(
                        reindex(parent_block.get_argument(i).id()),
                    )
                # the region info is collected via ops' parent blocks to be
                # used later when the region's encloding op is traversed
                parent_region = parent_block.get_parent()
                if parent_region is not None:
                    region_id_to_block_ids[parent_region.id()].append(parent_block_id)

        nonlocal next_fake_intermediate

        if name == "tt.func":
            # for function ops: gather and inline
            # the ops from all child blocks
            fn_ops = defaultdict(list)
            for child_block_id in child_block_ids:
                for result, block_fn_ops in op_stack.pop(child_block_id).items():
                    for block_fn_op in block_fn_ops:
                        fn_ops[result].append(block_fn_op)

            # replace the corresponding Intermediates in the
            # child op args with the function args (Params)
            for i, idx in enumerate(block_id_to_block_arg_ids[child_block_ids[0]]):
                replacements[idx] = Param(i)

            for fn_op_list in fn_ops.values():
                for fn_op in fn_op_list:
                    for i in range(len(fn_op.args)):
                        arg = fn_op.args[i]
                        if isinstance(arg, Intermediate) and arg.idx in replacements:
                            fn_op.args[i] = replacements[arg.idx]

            # next function capture starts
            # with empty replacements
            replacements.clear()

            fn_name = op.get_str_attr("sym_name")
            functions[fn_name] = fn_ops
        elif child_block_ids:
            if name in ("scf.if", "scf.for", "scf.while"):
                # for blocked control flow ops: inline the enclosed
                # ops into the parent block + rewire the last op in
                # each child block (yield) to return the scf result
                yield_ops = []
                for block_id in child_block_ids:
                    # the block args used as operands of the ops in the block
                    # (and nested blocks inlined in the current block by now)
                    # are replaced by new fake Intermediates to avoid "this
                    # operand is not returned by anything other op in the fn"
                    # error in the downstream analysis
                    for idx in block_id_to_block_arg_ids[block_id]:
                        next_fake_intermediate -= 1
                        replacements[idx] = Intermediate(next_fake_intermediate)

                    if block_id in op_stack:
                        block_ops = op_stack.pop(block_id)
                        if not block_ops:
                            continue
                        last_ret, last_ops = block_ops.popitem()
                        if all(op.name == "scf.yield" for op in last_ops):
                            # if last_ops are scf.yield, treat them separately
                            yield_ops.extend(last_ops)
                        else:
                            # otherwise, return last_ops to the block
                            block_ops[last_ret] = last_ops
                        for op_result, child_ops in block_ops.items():
                            op_stack[parent_block_id][op_result].extend(child_ops)

                scf_results = [Intermediate(idx) for idx in result_ids]
                for scf_result in scf_results:
                    for yield_op in yield_ops:
                        op_stack[parent_block_id][scf_result].append(yield_op)
            else:
                # TODO(oulgen): add support for tt.reduce
                raise Exception(
                    f"Unknown blocked function: {name}. Can't capture the TTIR."
                )
        else:
            # Plain (region-free) op: record it under each of its results,
            # or under a fresh fake result if it produces none.
            callee = None
            if name == "tt.call":
                callee = op.get_flat_symbol_ref_attr("callee")
            args: List[Union[Param, Intermediate]] = [
                Intermediate(operand) for operand in operand_ids
            ]
            block_ops = op_stack[parent_block_id]
            if result_ids:
                for result_id in result_ids:
                    res = Intermediate(result_id)
                    block_ops[res].append(Op(name, callee, args, res))
            else:
                next_fake_intermediate -= 1
                fake_res = Intermediate(next_fake_intermediate)
                block_ops[fake_res].append(Op(name, callee, args, fake_res))

    ttir_module.walk(mlir_to_functions)

    return functions
314
+
315
+
316
def parse_ttir(ttir, kwargs):
    """
    Given a Triton emitted TTIR text, this function lexes and parses the
    code using a minimal grammar defined inside. During the lexing/parsing,
    we drop any constant value and type information as they are not
    necessary to us.
    Being able to choose what we need makes this not a general purpose TTIR
    parser which further makes parsing much simpler.

    Returns the same ``functions`` mapping shape as ``ttir_to_functions``:
    function name -> (result value -> list of ops).

    Raises ``ModuleNotFoundError`` if ``lark`` is not installed (the caller
    falls back to assuming every input is mutated).
    """
    # TODO(oulgen):
    # - Support closures (e.g. "tt.reduce")

    try:
        import lark  # type: ignore[import-not-found]
        from lark import Lark, Transformer, v_args
    except ModuleNotFoundError:
        warnings.warn(
            "Using slow path for user-defined Triton kernels. `pip install lark` to fix this."
        )
        raise

    # Ops looks like one of the following forms:
    #
    # %14 = tt.addptr %13, %4 : tensor<4x!tt.ptr<f32, 1>>, tensor<4xi32>
    # tt.store %14, %12, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<4xf32>
    # %15 = "tt.atomic_rmw"(%14, %12, %5) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x!tt.ptr<f32, 1>>, tensor<4xf32>, tensor<4xi1>) -> tensor<4xf32> # noqa: B950
    grammar = """
        start: (module_block | loc_line)+

        loc_line: "#loc" /.+/ NEWLINE

        module_block: "module" "{" func_block+ "}" LOC

        func_block: "tt.func" ("public"|"private") FN_NAME "(" /.+/ NEWLINE stmt* "}" LOC -> process_func

        ?stmt: op | if | for | while | condition_stmt | label_stmt | cf_stmt

        if: [assign_lhs "="] "scf.if" args rest stmt* "}" "else" "{" stmt* "}" LOC -> process_if
        for: [assign_lhs "="] "scf.for" args rest stmt* "}" divisibility_annot? LOC -> process_for
        while: [assign_lhs "="] "scf.while" args rest stmt* "}" "do" "{" stmt* "}" LOC -> process_while

        condition_stmt: "scf.condition" "(" arg ")" args rest
        label_stmt: LABEL ":" "// pred:" LABEL
            | LABEL "(" /.+/ NEWLINE
        cf_stmt: "cf" "." NAME /.+/ NEWLINE

        op: OP_NAME LOC
            | [assign_lhs "="] OP_NAME [FN_NAME] args rest? -> process_op

        ?rest: (":" | "{" | "\\"" | "->" | "<" | "=") /.+/ NEWLINE
        divisibility_annot: "{" "tt.divisibility_arg1" /[^}]+/ "}"

        args: | "(" ")" | "("? arg ("," arg)* ")"?

        ?arg: INTERMEDIATE
            | INTERMEDIATE_CONSTANT
            | CONSTANT
            | PARAM
            | "[" args "]"
            | arg_with_index

        ?arg_with_index: arg "#" DIGIT+

        ?assign_lhs: (INTERMEDIATE | INTERMEDIATE_CONSTANT) [":" DIGIT+]

        PARAM.5: "%arg" DIGIT+
        INTERMEDIATE.4: "%" DIGIT+
        INTERMEDIATE_CONSTANT.3: "%" NAME
        CONSTANT: FLOAT | DIGIT+ | NAME ("<" DIGIT+ ">")?
        LABEL: "^bb" DIGIT+

        NAME: (LETTER | DIGIT | "_")+
        NON_CF_NAME: /(?!(cf))/ NAME
        FN_NAME: "@" (NAME | ESCAPED_STRING)
        OP_NAME: "\\""? NON_CF_NAME ("." NAME)+ "\\""?

        LOC.5: "loc(#loc" DIGIT* ")"

        %import common.LETTER
        %import common.DIGIT
        %import common.WS
        %import common.NEWLINE
        %import common.ESCAPED_STRING
        %import common.FLOAT
        %ignore WS
    """

    # Counter for synthesized values; counts down so fake ids are negative.
    next_fake_intermediate = 0

    def convert(token):
        # Convert a lark token/tree into Param/Intermediate (or a list of
        # them for "args" trees), dropping constants and type info.
        if isinstance(token, lark.tree.Tree):
            if token.data == "args":
                res = []
                for a in token.children:
                    c = convert(a)
                    if isinstance(c, list):
                        res.extend(c)
                    else:
                        res.append(c)
                return res
            elif token.data in {"assign_lhs", "arg_with_index"}:
                # Drop length/index qualifier
                return convert(token.children[0])
            else:
                raise AssertionError(f"Tree node with {token.data}")

        if token is None or (
            isinstance(token, lark.lexer.Token)
            and token.type in ("CONSTANT", "INTERMEDIATE_CONSTANT")
        ):
            # Constants (and missing results) become fresh fake values.
            nonlocal next_fake_intermediate
            next_fake_intermediate -= 1
            return Intermediate(next_fake_intermediate)

        assert isinstance(token, lark.lexer.Token)

        if token.type == "INTERMEDIATE":
            return Intermediate(int(token.value[len("%") :]))
        if token.type == "PARAM":
            return Param(int(token.value[len("%arg") :]))

        raise AssertionError(f"{type(token.type)} => {token.value} invalid")

    # In alternative representation, function names are quoted.
    # It should be possible to move this into the grammar alltogether.
    def convert_name(token):
        if token is None:
            return None
        s = token.value
        if len(s) > 2 and s[0] == '"' and s[-1] == '"':
            return s[1:-1]
        return s

    functions: Dict[str, Dict[Intermediate, List[Op]]] = {}

    def extend_dict_list(d1, d2):
        # Merge d2 into d1, concatenating the per-key op lists.
        for key, values in d2.items():
            d1[key].extend(values)

    @v_args(inline=True)
    class TransformOps(Transformer):
        def process_op(self, ret, op_name, fn_name, args, *rest):
            return Op(
                convert_name(op_name),
                convert_name(fn_name),
                convert(args),
                convert(ret),
            )

        def process_func(self, name, _args, *stmts):
            ops: Dict[Intermediate, List[Op]] = defaultdict(list)
            for e in stmts:
                if isinstance(e, Op):
                    ops[e.ret].append(e)
                elif isinstance(e, dict):
                    # Nested scf block already reduced to an op-graph dict.
                    extend_dict_list(ops, e)
            functions[name.value] = ops

        def _process_scf(self, ret, stmts):
            # Rewire scf.yield ops so they produce the scf op's result.
            ret = convert(ret)
            ops: Dict[Intermediate, List[Op]] = defaultdict(list)
            for e in stmts:
                if isinstance(e, Op):
                    if e.name == "scf.yield":
                        ops[ret].append(Op(e.name, None, e.args, ret))
                    else:
                        ops[e.ret].append(e)
                elif isinstance(e, dict):
                    extend_dict_list(ops, e)
            return ops

        def process_if(self, ret, _args, _rest, *stmts):
            return self._process_scf(ret, stmts)

        def process_for(self, ret, _args, _rest, *stmts):
            return self._process_scf(ret, stmts)

        def process_while(self, ret, _args, _rest, *stmts):
            return self._process_scf(ret, stmts)

    parser = Lark(
        grammar, parser="lalr", maybe_placeholders=True, transformer=TransformOps()
    )
    parser.parse(ttir)
    return functions
501
+
502
+
503
class MemoizeWithCycleCheck:
    """Memoizing decorator that also detects (and rejects) recursion.

    While the wrapped function is being evaluated for a key, a ``None``
    sentinel is stored under that key; a re-entrant call with the same key
    observes the sentinel and raises instead of recursing forever.
    """

    def __init__(self, fn):
        self.fn = fn
        self.reset()

    def __call__(self, functions, fn_name, num_args):
        key = (fn_name, num_args)
        try:
            cached = self.cache[key]
        except KeyError:
            # Mark the key as "in progress" before evaluating so a
            # recursive call with the same key is caught below.
            self.cache[key] = None
            cached = self.cache[key] = self.fn(functions, fn_name, num_args)
        if cached is None:
            raise Exception("Recursion is not supported")
        return cached

    def reset(self):
        """Drop all memoized results (and any in-progress sentinels)."""
        self.cache = {}
519
+
520
+
521
@MemoizeWithCycleCheck
def analyze_kernel_mutations(functions, fn_name, num_args):
    """
    Analyzes the graph to detect all sinks from a predefined list of sinks
    by using triton's MemWrite trait list. NOTE: What if triton exposed this?
    From each sink, it traverses the CFG backwards to identify all the input
    pointers that are mutated.

    Returns a list of ``num_args`` booleans: ``mutated[i]`` is True iff the
    i-th parameter of ``fn_name`` may be written to.

    Raises on ops we cannot analyze; memoization raises on recursive calls.
    """
    # Name of mutation op to mutated parameter indices
    # List from Triton Github include/triton/Dialect/Triton/IR/TritonOps.td
    # All the OPs that have MemWrite trait.
    # What if Triton exposed this?
    MUTATION_OPS = {"tt.store": [0], "tt.atomic_cas": [0], "tt.atomic_rmw": [0]}
    # Ops that we want to bail out on
    UNKNOWN_OPS = {"tt.elementwise_inline_asm"}

    stack: List[Union[Param, Intermediate]] = []
    visited = set()
    ops = functions[fn_name]
    # Seed the stack with every value that flows into a memory write.
    for op_list in ops.values():
        for op in op_list:
            if op.name in UNKNOWN_OPS:
                raise Exception(
                    f"ttir analysis hit an op we do not know how to analyze: {op.name}"
                )

            if op.name == "tt.call":
                # Recurse into the callee; only its mutated params taint us.
                assert op.fn_call_name in functions
                mutations = analyze_kernel_mutations(
                    functions, op.fn_call_name, len(op.args)
                )
                stack.extend(arg for arg, mutated in zip(op.args, mutations) if mutated)
            else:
                for idx in MUTATION_OPS.get(op.name, []):
                    stack.append(op.args[idx])

    # The following is an iterative DFS algorithm
    mutated = [False] * num_args
    while stack:
        arg = stack.pop()
        if arg in visited:
            continue

        visited.add(arg)

        if isinstance(arg, Param):
            if arg.idx >= num_args:
                # This is an argument defined in the kernel, not passed in
                continue
            mutated[arg.idx] = True
        elif isinstance(arg, Intermediate) and not arg.fake():
            for op in ops[arg]:
                # Skip arguments to load
                if op.name != "tt.load":
                    stack.extend(op.args)
    return mutated
577
+
578
+
579
def identify_mutated_tensors(kernel, kwargs):
    """
    Given a triton kernel and the arguments for this kernel, this function
    1) Retrieves the TTIR converted version of the kernel from Triton's API.
    2) Parses the TTIR and creates a control flow graph
    3) Analyzes the graph to detect all input tensor mutations

    Returns the list of kwarg names whose tensors may be mutated by the
    kernel. On ANY failure of the analysis pipeline this conservatively
    falls back to "every tensor argument is mutated".
    """

    ttir_module = None
    functions = None
    try:
        from torch._dynamo import config

        if not config.optimize_user_defined_triton_kernels:
            # Raising takes us into the conservative fallback below.
            raise Exception("optimize_user_defined_triton_kernels is False")

        ttir_module, ordered_tensor_names = generate_ttir(kernel, kwargs)

        # extract functions from TTIR
        if hasattr(ttir_module, "walk"):
            # use MLIR bindings exposed by Triton code
            functions = ttir_to_functions(ttir_module)
        else:
            # parse string representation of Triton IR
            functions = parse_ttir(str(ttir_module), kwargs)

        assert functions is not None
        kernel_name = next(iter(functions.keys()))
        # Triton codegen modifies the name
        assert kernel.fn.__name__ in kernel_name
        # Reset the cache between top level invocations
        # The cache for analyze kernel mutations is mainly used for cycle
        # detection, so each top level invocation needs a clean cache
        analyze_kernel_mutations.reset()
        mutations = analyze_kernel_mutations(
            functions, kernel_name, len(ordered_tensor_names)
        )

        return [
            ordered_tensor_names[i] for i, mutated in enumerate(mutations) if mutated
        ]
    except Exception as e:
        import traceback

        # Conservative fallback: warn, dump debug state, and report every
        # tensor kwarg as mutated (correct but pessimizes functionalization).
        warnings.warn(
            "Encountered an exception in identify_mutated_tensors, "
            "assuming every input is mutated:\n"
            "".join(
                traceback.TracebackException.from_exception(e).format()  # noqa: G001
            )
        )
        if ttir_module is not None:
            log.debug("TTIR:\n%s", str(ttir_module))
        if functions is not None:
            log.debug("functions:")
            for name, fn in functions.items():
                log.debug("===\t%s\t===", name)
                for ret, ops in fn.items():
                    log.debug("%s\t=>\t%s", ret, ops)
        return [key for key, value in kwargs.items() if isinstance(value, Tensor)]
639
+
640
+
641
+ ###############################################################################
642
+ # Triton Kernel Wrappers
643
+
644
+
645
+ # Used for wrapping a Triton Kernel
646
class TritonKernelWrapperMutation(HigherOrderOperator):
    """HOP wrapping a user-defined Triton kernel that mutates its inputs."""

    def __init__(self):
        super().__init__("triton_kernel_wrapper_mutation")


# Module-level singleton; py_impl registrations below attach to it.
triton_kernel_wrapper_mutation = TritonKernelWrapperMutation()
652
+
653
+
654
+ # Used for wrapping a Triton Kernel in a functional manner
655
class TritonKernelWrapperFunctional(HigherOrderOperator):
    """Functional HOP form: clones mutated inputs and returns the clones."""

    def __init__(self):
        super().__init__("triton_kernel_wrapper_functional")


# Module-level singleton; py_impl registrations below attach to it.
triton_kernel_wrapper_functional = TritonKernelWrapperFunctional()
661
+
662
+
663
@triton_kernel_wrapper_mutation.py_impl(DispatchKey.CompositeExplicitAutograd)
def triton_kernel_wrapper_mutation_dense(*, kernel_idx, grid, kwargs):
    """Eager implementation: resolve the kernel from the side table and
    actually launch it, mutating tensors in ``kwargs`` in place.

    ``grid`` with a single entry is used directly; multiple entries mean
    one grid per autotuner config, so a selector function is generated.
    """
    from torch._inductor.codegen.wrapper import user_defined_kernel_grid_fn_code

    kernel = kernel_side_table.get_kernel(kernel_idx)

    if len(grid) == 1:
        grid_fn = grid[0]
    else:
        # Generate (and exec) a helper that picks the grid matching the
        # autotuner's chosen config at call time.
        fn_name, code = user_defined_kernel_grid_fn_code(
            kernel.fn.__name__, kernel.configs, grid
        )
        namespace: Dict[str, Any] = {}
        exec(code, namespace)
        grid_fn = namespace[fn_name]

    kernel[grid_fn](**kwargs)
680
+
681
+
682
@triton_kernel_wrapper_mutation.py_impl(FakeTensorMode)
def triton_kernel_wrapper_mutation_fake_tensor_mode(mode, *, kernel_idx, grid, kwargs):
    # Under fake tensors there is nothing to compute: the op only mutates
    # its inputs in place and produces no outputs.
    with mode:
        return None
686
+
687
+
688
def trace_triton_kernel_wrapper(proxy_mode, func_overload, node_args):
    """Record ``func_overload(**node_args)`` as a call_function node.

    The op is first executed for real (with proxy tracing disabled) so its
    concrete output is available; a proxy node is then emitted and the
    output is associated with it via ``track_tensor_tree``.
    """
    with disable_proxy_modes_tracing():
        real_out = func_overload(**node_args)

    tracer = proxy_mode.tracer
    unwrapped_args = pytree.tree_map(tracer.unwrap_proxy, node_args)
    proxy_out = tracer.create_proxy(
        "call_function",
        func_overload,
        (),
        unwrapped_args,
        name=func_overload.__name__ + "_proxy",
    )
    return track_tensor_tree(real_out, proxy_out, constant=None, tracer=tracer)
701
+
702
+
703
@triton_kernel_wrapper_mutation.py_impl(ProxyTorchDispatchMode)
def triton_kernel_wrapper_mutation_proxy_torch_dispatch_mode(
    mode, *, kernel_idx, grid, kwargs
):
    """Proxy-mode implementation: record the mutating op into the FX graph."""
    if mode.enable_tracing:
        trace_triton_kernel_wrapper(
            mode,
            triton_kernel_wrapper_mutation,
            {"kernel_idx": kernel_idx, "grid": grid, "kwargs": kwargs},
        )
    else:
        # Tracing disabled: just re-dispatch without recording a node.
        triton_kernel_wrapper_mutation(kernel_idx=kernel_idx, grid=grid, kwargs=kwargs)

    return None
717
+
718
+
719
@triton_kernel_wrapper_mutation.py_functionalize_impl
def triton_kernel_wrapper_mutation_functionalize(ctx, kernel_idx, grid, kwargs):
    """Functionalization: rewrite the mutating op as the functional op and
    propagate the returned clones back onto the original input tensors."""
    unwrapped_kwargs = ctx.unwrap_tensors(kwargs)
    kernel = kernel_side_table.get_kernel(kernel_idx)
    # TODO(oulgen): Preexisting bug, if two kernel inputs are views of each
    # other, and one gets mutated in kernel, and later another gets mutated,
    # they are no longer equal. Fix this by graph breaking on this condition
    # earlier in dynamo.
    tensors_to_clone = identify_mutated_tensors(kernel, unwrapped_kwargs)
    with ctx.redispatch_to_next():
        unwrapped_outputs = triton_kernel_wrapper_functional(
            kernel_idx=kernel_idx,
            grid=grid,
            kwargs=unwrapped_kwargs,
            tensors_to_clone=tensors_to_clone,
        )

    # The functional op returns one output per cloned (i.e. mutated) input;
    # reflect each of them back onto the corresponding wrapped input.
    assert set(unwrapped_outputs.keys()).issubset(set(kwargs.keys()))
    for key, output_arg in unwrapped_outputs.items():
        if not isinstance(output_arg, Tensor):
            continue
        input_arg = kwargs[key]
        assert isinstance(input_arg, Tensor)

        ctx.replace(input_arg, output_arg)
        # indicate that above replace is hidden from autograd
        ctx.mark_mutation_hidden_from_autograd(input_arg)
        ctx.commit_update(input_arg)
        ctx.sync(input_arg)
        # sync calls replace_ under the hood, so again indicate that
        # this indirect replace is hidden from autograd
        ctx.mark_mutation_hidden_from_autograd(input_arg)
    return None
752
+
753
+
754
@triton_kernel_wrapper_functional.py_impl(DispatchKey.CompositeExplicitAutograd)
def triton_kernel_wrapper_functional_dense(
    *, kernel_idx, grid, kwargs, tensors_to_clone
):
    """Functional form: clone the to-be-mutated inputs, launch the mutating
    op on the clones, and return the clones keyed by kwarg name."""
    # TODO(oulgen): For performance reasons, we want to ensure that these
    # `clone_preserve_strides` calls are never executed at runtime
    # (inductor should always optimize them away).
    # Requires https://github.com/pytorch/pytorch/issues/109240
    kwargs = {
        key: (clone_preserve_strides(val) if key in tensors_to_clone else val)
        for key, val in kwargs.items()
    }
    triton_kernel_wrapper_mutation(kernel_idx=kernel_idx, grid=grid, kwargs=kwargs)
    return {key: val for key, val in kwargs.items() if key in tensors_to_clone}
768
+
769
+
770
@triton_kernel_wrapper_functional.py_impl(FakeTensorMode)
def triton_kernel_wrapper_functional_fake_tensor_mode(
    mode, *, kernel_idx, grid, kwargs, tensors_to_clone
):
    """Fake-tensor form: only the output clones need to be faked; the
    kernel itself is never run."""
    # TODO(oulgen): For performance reasons, we want to ensure that these
    # `clone_preserve_strides` calls are never executed at runtime
    # (inductor should always optimize them away).
    # Requires https://github.com/pytorch/pytorch/issues/109240
    with mode:
        return {
            key: clone_preserve_strides(val)
            for key, val in kwargs.items()
            if key in tensors_to_clone
        }
784
+
785
+
786
@triton_kernel_wrapper_functional.py_impl(ProxyTorchDispatchMode)
def triton_kernel_wrapper_functional_proxy_torch_dispatch_mode(
    mode, *, kernel_idx, grid, kwargs, tensors_to_clone
):
    """Proxy-mode implementation: record the functional op into the FX
    graph (or just re-dispatch it when tracing is disabled)."""
    if mode.enable_tracing:
        return trace_triton_kernel_wrapper(
            mode,
            triton_kernel_wrapper_functional,
            {
                "kernel_idx": kernel_idx,
                "grid": grid,
                "kwargs": kwargs,
                "tensors_to_clone": tensors_to_clone,
            },
        )
    else:
        return triton_kernel_wrapper_functional(
            kernel_idx=kernel_idx,
            grid=grid,
            kwargs=kwargs,
            tensors_to_clone=tensors_to_clone,
        )
808
+
809
+
810
@triton_kernel_wrapper_functional.py_functionalize_impl
def triton_kernel_wrapper_functional_functionalize(
    ctx, kernel_idx, grid, kwargs, tensors_to_clone
):
    """Functionalization of the already-functional op: just unwrap the
    inputs, re-dispatch, and re-wrap the outputs (no mutation to handle)."""
    unwrapped_kwargs = ctx.unwrap_tensors(kwargs)
    with ctx.redispatch_to_next():
        outputs = triton_kernel_wrapper_functional(
            kernel_idx=kernel_idx,
            grid=grid,
            kwargs=unwrapped_kwargs,
            tensors_to_clone=tensors_to_clone,
        )
        return ctx.wrap_tensors(outputs)
823
+
824
+
825
# Route these dispatch keys straight through: the wrappers have no special
# behavior for them. Autograd keys are deliberate fallthroughs because the
# functionalize impls above hide the mutation from autograd.
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonDispatcher)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonTLSSnapshot)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.ADInplaceOrView)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.BackendSelect)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCPU)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCUDA)  # type: ignore[attr-defined]
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCUDA)
triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCPU)

triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonDispatcher)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonTLSSnapshot)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.ADInplaceOrView)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.BackendSelect)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCPU)  # type: ignore[attr-defined]
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCUDA)  # type: ignore[attr-defined]
# NOTE(review): AutogradCUDA was registered twice here; deduplicated.
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCUDA)
triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCPU)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_higher_order_ops/while_loop.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.utils._pytree as pytree
3
+
4
+ from torch._C import DispatchKey
5
+
6
+ from torch._higher_order_ops.utils import (
7
+ _has_potential_branch_input_alias,
8
+ _has_potential_branch_input_mutation,
9
+ _set_compilation_env,
10
+ autograd_not_implemented,
11
+ reenter_make_fx,
12
+ UnsupportedAliasMutationException,
13
+ )
14
+ from torch._ops import HigherOrderOperator
15
+ from torch._subclasses.fake_tensor import FakeTensorMode
16
+ from torch.fx.experimental.proxy_tensor import (
17
+ disable_proxy_modes_tracing,
18
+ ProxyTorchDispatchMode,
19
+ track_tensor_tree,
20
+ )
21
+
22
+
23
+ class WhileLoopOp(HigherOrderOperator):
24
+ def __call__(self, cond_fn, body_fn, operands):
25
+ if not isinstance(cond_fn, torch.fx.GraphModule) or not isinstance(
26
+ body_fn, torch.fx.GraphModule
27
+ ):
28
+ raise RuntimeError(
29
+ "cond_fn and body_fn must be torch.fx.GraphModule, got "
30
+ f"{type(cond_fn)} and {type(body_fn)}"
31
+ )
32
+ if not isinstance(operands, tuple):
33
+ raise RuntimeError("operands must be a tuple, got " f"{type(operands)}")
34
+ if not all(isinstance(t, (torch.Tensor, int, float, bool)) for t in operands):
35
+ raise RuntimeError(
36
+ "operands must be a tuple of tensors, ints, floats, or bools, got "
37
+ f"{operands}"
38
+ )
39
+ return super().__call__(cond_fn, body_fn, operands)
40
+
41
+
42
+ while_loop_op = HigherOrderOperator("while_loop")
43
+
44
+
45
+ def while_loop(cond_fn, body_fn, operands):
46
+ r"""
47
+ Run body_fn(*operands) while cond_fn(*operands) returns a True scalar tensor. Returns the output of body_fn or
48
+ initial operands.
49
+
50
+ .. warning::
51
+ `torch.while_loop` is a prototype feature in PyTorch. It has limited support for input and output types and
52
+ doesn't support training currently. Please look forward to a more stable implementation in a future version of PyTorch.
53
+ Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype
54
+
55
+ `while_loop` is a structured control flow operator. It preserves the loop semantic across the torch.compile and torch.export.
56
+
57
+ `while_loop` is equivalent to the following:
58
+
59
+ def while_loop(cond_fn, body_fn, operands):
60
+ val = operands
61
+ while cond_fn(*val):
62
+ val = body_fn(*val)
63
+ return val
64
+
65
+ Args:
66
+ cond_fn (Callable): A callable function that returns a boolean Scalar tensor.
67
+
68
+ body_fn (Callable): A callable function that takes the same inputs as `cond_fn` and returns a tuple of tensors
69
+
70
+ operands (Tuple of possibly nested dict/list/tuple of tensors): A tuple of inputs to cond_fn and body_fn. It's also
71
+ the initial value of states that are carried across iterations.
72
+
73
+ Example:
74
+
75
+ def cond_fn(iter, x):
76
+ return iter.sum() < 10
77
+
78
+ def body_fn(iter, x):
79
+ return iter + 1, x.sin()
80
+
81
+ while_loop(cond_fn, body_fn, (torch.zeros(1), torch.randn(3, 4)))
82
+
83
+ Restrictions:
84
+
85
+ - body_fn must return tensors with the same metadata (e.g.shape, dtype) as inputs.
86
+
87
+ - body_fn and cond_fn must not in-place mutate the operands. A clone before the mutation is required.
88
+
89
+ - body_fn and cond_fn must not mutate python varialbles (e.g. list/dict) created outside of the body_fn.
90
+
91
+ - body_fn and cond_fn's output cannot aliase any of the inputs. A clone is required.
92
+
93
+ .. warning::
94
+ Temporal Limitations:
95
+
96
+ - 'while_loop' only supports **inference** right now. Autograd will be supported in the future.
97
+
98
+ """
99
+ if torch.compiler.is_dynamo_compiling():
100
+ return while_loop_op(cond_fn, body_fn, operands)
101
+
102
+ def _validate_input(cond_fn, body_fn, operands):
103
+ if not callable(cond_fn) or not callable(body_fn):
104
+ raise RuntimeError("Expect cond_fn and body_fn to be callbale.")
105
+
106
+ if not isinstance(operands, (tuple, list)) or pytree.tree_any(
107
+ lambda t: not isinstance(t, torch.Tensor), operands
108
+ ):
109
+ raise RuntimeError(
110
+ "Expect operands to be a tuple of possibly nested dict/list/tuple that only"
111
+ f"consists of tensor leaves, but got {operands}."
112
+ )
113
+
114
+ _validate_input(cond_fn, body_fn, operands)
115
+
116
+ with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit():
117
+ return torch.compile(while_loop_op, backend="eager", fullgraph=True)(
118
+ cond_fn, body_fn, operands
119
+ )
120
+
121
+
122
@while_loop_op.py_impl(DispatchKey.CompositeExplicitAutograd)
def while_loop_dense(cond_fn, body_fn, operands):
    """Eager (dense tensor) implementation: literally loop until cond_fn is falsy."""

    def _require_bool_scalar(pred):
        # Each iteration's predicate must be a 0-dim bool tensor.
        ok = (
            isinstance(pred, torch.Tensor)
            and pred.size() == torch.Size([])
            and pred.dtype == torch.bool
        )
        if not ok:
            raise RuntimeError(
                f"cond_fn must return a boolean scalar tensor but got {pred}"
            )

    if not isinstance(operands, tuple):
        raise RuntimeError(f"operands must be a tuple but got {type(operands)}")

    carry = operands
    while True:
        pred = cond_fn(*carry)
        # Truthiness is evaluated first (matching the original walrus loop);
        # the bool-scalar-tensor shape check only fires for truthy predicates.
        if not pred:
            break
        _require_bool_scalar(pred)
        step = body_fn(*carry)
        assert isinstance(
            step, tuple
        ), f"body_fn should return a tuple but got {type(step)}"
        assert len(step) == len(
            carry
        ), "body_fn should return the same number of elements as operands"
        carry = step
    return carry
150
+
151
+
152
# Training through while_loop is not implemented yet; register an Autograd
# kernel built by autograd_not_implemented (deferred_error=True presumably
# delays the "not implemented" error until backward is actually attempted —
# TODO confirm against autograd_not_implemented's contract).
while_loop_op.py_impl(DispatchKey.Autograd)(
    autograd_not_implemented(while_loop_op, deferred_error=True)
)
155
+
156
+
157
@while_loop_op.py_impl(ProxyTorchDispatchMode)
def while_loop_tracing(mode, cond_fn, body_fn, operands):
    """Proxy-mode implementation: trace cond_fn/body_fn into subgraphs and
    emit a single call_function node for the while_loop op."""

    def _trace(proxy_mode, op, cond_fn, body_fn, operands):
        pre_dispatch = getattr(proxy_mode, "pre_dispatch", False)
        with disable_proxy_modes_tracing():
            cond_graph = reenter_make_fx(cond_fn, pre_dispatch)(*operands)
            body_graph = reenter_make_fx(body_fn, pre_dispatch)(*operands)

        # Pick the first unused index so repeated while_loops on the same
        # tracer root get distinct submodule names.
        idx = 0
        while hasattr(proxy_mode.tracer.root, f"while_loop_cond_graph_{idx}"):
            idx += 1
        cond_graph_name = f"while_loop_cond_graph_{idx}"
        body_graph_name = f"while_loop_body_graph_{idx}"
        assert not hasattr(proxy_mode.tracer.root, body_graph_name)

        proxy_mode.tracer.root.register_module(cond_graph_name, cond_graph)
        proxy_mode.tracer.root.register_module(body_graph_name, body_graph)

        node_args = (cond_graph, body_graph, operands)
        proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args)
        out_proxy = proxy_mode.tracer.create_proxy(
            "call_function", op, proxy_args, {}, name="while_loop"
        )

        # body_fn returns output with the same pytree and tensor metadata as
        # operands, so one concrete application supplies the tensors to track.
        out = body_fn(*operands)
        return track_tensor_tree(
            out, out_proxy, constant=None, tracer=proxy_mode.tracer
        )

    if mode.enable_tracing:
        return _trace(mode, while_loop_op, cond_fn, body_fn, operands)
    return while_loop_op(cond_fn, body_fn, operands)
199
+
200
+
201
@while_loop_op.py_impl(FakeTensorMode)
def while_loop_fake_tensor_mode(mode, cond_fn, body_fn, operands):
    # The trip count is data-dependent and unknowable under fake tensors.
    # Since body_fn is required to return outputs with the same metadata as
    # its inputs, a single application yields fake outputs of the correct
    # shape/dtype regardless of how many iterations would actually run.
    return body_fn(*operands)
204
+
205
+
206
@while_loop_op.py_functionalize_impl
def while_loop_func(ctx, cond_fn, body_fn, operands):
    """Functionalization implementation.

    Rejects cond_fn/body_fn that mutate or alias their inputs (raising
    UnsupportedAliasMutationException), then redispatches to the next key
    with functionalized callables and unwrapped tensors.
    """
    unwrapped_operands = ctx.unwrap_tensors(operands)
    with ctx.redispatch_to_next():
        functional_cond_fn = ctx.functionalize(cond_fn)
        functional_body_fn = ctx.functionalize(body_fn)
        pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch
        for fn, fn_name in [
            (functional_cond_fn, "cond_fn"),
            (functional_body_fn, "body_fn"),
        ]:
            if _has_potential_branch_input_mutation(
                fn, unwrapped_operands, pre_dispatch=pre_dispatch
            ):
                raise UnsupportedAliasMutationException(
                    f"torch.while_loop's {fn_name} might be modifying the input!"
                )

        # Bug fix: the original second loop iterated only over `fn`
        # (`for fn in [...]`) while still formatting the stale `fn_name`
        # left over from the loop above, so the alias error always blamed
        # "body_fn". Iterate (fn, fn_name) pairs so the message names the
        # function that actually aliases.
        for fn, fn_name in [
            (functional_cond_fn, "cond_fn"),
            (functional_body_fn, "body_fn"),
        ]:
            if _has_potential_branch_input_alias(
                fn, unwrapped_operands, pre_dispatch=pre_dispatch
            ):
                raise UnsupportedAliasMutationException(
                    f"torch.while_loop's {fn_name} might be aliasing the input!"
                )
        ret = while_loop_op(functional_cond_fn, functional_body_fn, unwrapped_operands)
        return ctx.wrap_tensors(ret)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/_nnapi/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (222 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/mkl/__init__.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch


def is_available():
    r"""Return whether PyTorch is built with MKL support."""
    return torch._C.has_mkl


# Verbosity levels accepted by `verbose` (mirroring oneMKL's MKL_VERBOSE).
VERBOSE_OFF = 0
VERBOSE_ON = 1


class verbose:
    """
    On-demand oneMKL verbosing functionality.

    To make it easier to debug performance issues, oneMKL can dump verbose
    messages containing execution information like duration while executing
    the kernel. The verbosing functionality can be invoked via an environment
    variable named `MKL_VERBOSE`. However, this methodology dumps messages in
    all steps. Those are a large amount of verbose messages. Moreover, for
    investigating the performance issues, generally taking verbose messages
    for one single iteration is enough. This on-demand verbosing functionality
    makes it possible to control scope for verbose message dumping. In the
    following example, verbose messages will be dumped out for the second
    inference only.

    .. highlight:: python
    .. code-block:: python

        import torch
        model(data)
        with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
            model(data)

    Args:
        level: Verbose level
            - ``VERBOSE_OFF``: Disable verbosing
            - ``VERBOSE_ON``:  Enable verbosing
    """

    def __init__(self, enable):
        self.enable = enable

    def __enter__(self):
        if self.enable == VERBOSE_OFF:
            # Bug fix: the original returned None here, so
            # `with verbose(VERBOSE_OFF) as v:` bound None while the enabled
            # path bound the manager. Return self on both paths.
            return self
        st = torch._C._verbose.mkl_set_verbose(self.enable)
        assert (
            st
        ), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always restore the quiet default; never suppress exceptions.
        torch._C._verbose.mkl_set_verbose(VERBOSE_OFF)
        return False
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/mps/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.84 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/nnpack/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (2.19 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/backends/openmp/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import torch


def is_available():
    r"""Return whether PyTorch is built with OpenMP support."""
    # Delegates to the compile-time flag exposed by the C extension.
    return torch._C.has_openmp
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/_sharded_tensor/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (723 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/_tools/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (291 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/autograd/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import sys
import torch


def is_available():
    # The native dist-autograd bindings exist only when PyTorch was built
    # with distributed support.
    return hasattr(torch._C, "_dist_autograd_init")


# Initialize the native module eagerly at import time: failing loudly here is
# preferable to obscure errors on first use.
if is_available() and not torch._C._dist_autograd_init():
    raise RuntimeError("Failed to initialize torch.distributed.autograd")

if is_available():
    # Re-export the native API at this module's top level.
    from torch._C._distributed_autograd import (
        get_gradients,
        backward,
        _init,
        _new_context,
        _release_context,
        _get_max_id,
        _is_valid_context,
        _retrieve_context,
        _current_context,
        _get_debug_info,
        DistAutogradContext,
    )
27
+
28
+
29
class context:
    """
    Context manager wrapping forward and backward passes when using
    distributed autograd.  Entering the ``with`` block creates a fresh
    distributed autograd context and yields its ``context_id``, which
    uniquely identifies the distributed backward pass on all workers; each
    worker stores metadata under this id that is required to correctly
    execute the pass.  The context is released on exit.

    Example::
        >>> # xdoctest: +SKIP
        >>> import torch.distributed.autograd as dist_autograd
        >>> with dist_autograd.context() as context_id:
        >>>     t1 = torch.rand((3, 3), requires_grad=True)
        >>>     t2 = torch.rand((3, 3), requires_grad=True)
        >>>     loss = rpc.rpc_sync("worker1", torch.add, args=(t1, t2)).sum()
        >>>     dist_autograd.backward(context_id, [loss])
    """

    def __enter__(self):
        ctx = _new_context()
        self.autograd_context = ctx
        return ctx._context_id()

    def __exit__(self, type, value, traceback):
        _release_context(self.autograd_context._context_id())
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_tensors.cpython-311.pyc ADDED
Binary file (3.51 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/events/api.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ import json
10
+ from dataclasses import asdict, dataclass, field
11
+ from enum import Enum
12
+ from typing import Dict, Union, Optional
13
+
14
+ __all__ = ['EventSource', 'Event', 'NodeState', 'RdzvEvent']
15
+
16
+ EventMetadataValue = Union[str, int, float, bool, None]
17
+
18
+
19
class EventSource(str, Enum):
    """Known identifiers of the event producers."""

    # str-valued members so they serialize as plain strings via json and can
    # be recovered with EventSource[name] on deserialization.
    AGENT = "AGENT"
    WORKER = "WORKER"
24
+
25
+
26
@dataclass
class Event:
    """
    The class represents the generic event that occurs during the torchelastic job execution.

    The event can be any kind of meaningful action.

    Args:
        name: event name.
        source: the event producer, e.g. agent or worker
        timestamp: timestamp in milliseconds when event occurred.
        metadata: additional data that is associated with the event.
    """

    name: str
    source: EventSource
    timestamp: int = 0
    metadata: Dict[str, EventMetadataValue] = field(default_factory=dict)

    def __str__(self):
        return self.serialize()

    @staticmethod
    def deserialize(data: Union[str, "Event"]) -> "Event":
        """Parse a JSON string into an Event; pass an Event through unchanged.

        Raises:
            TypeError: if ``data`` is neither a JSON string nor an Event.
                (The original implementation silently fell through and
                returned None in that case.)
        """
        if isinstance(data, Event):
            return data
        if isinstance(data, str):
            data_dict = json.loads(data)
            # "source" round-trips by member name.
            data_dict["source"] = EventSource[data_dict["source"]]
            return Event(**data_dict)
        raise TypeError(f"Cannot deserialize {type(data)} into an Event")

    def serialize(self) -> str:
        return json.dumps(asdict(self))
59
+
60
+
61
class NodeState(str, Enum):
    """The states that a node can be in rendezvous."""

    # str-valued so the state serializes as a plain string and is recovered
    # with NodeState[name] on deserialization.
    INIT = "INIT"
    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
68
+
69
+
70
@dataclass
class RdzvEvent:
    """
    Dataclass to represent any rendezvous event.

    Args:
        name: Event name. (E.g. Current action being performed)
        run_id: The run id of the rendezvous
        message: The message describing the event
        hostname: Hostname of the node
        pid: The process id of the node
        node_state: The state of the node (INIT, RUNNING, SUCCEEDED, FAILED)
        master_endpoint: The master endpoint for the rendezvous store, if known
        rank: The rank of the node, if known
        local_id: The local_id of the node, if defined in dynamic_rendezvous.py
        error_trace: Error stack trace, if this is an error event.
    """

    name: str
    run_id: str
    message: str
    hostname: str
    pid: int
    node_state: NodeState
    master_endpoint: str = ""
    rank: Optional[int] = None
    local_id: Optional[int] = None
    error_trace: str = ""

    def __str__(self):
        return self.serialize()

    @staticmethod
    def deserialize(data: Union[str, "RdzvEvent"]) -> "RdzvEvent":
        """Parse a JSON string into an RdzvEvent; pass an RdzvEvent through.

        Raises:
            TypeError: if ``data`` is neither a JSON string nor an RdzvEvent.
                (The original implementation silently fell through and
                returned None in that case.)
        """
        if isinstance(data, RdzvEvent):
            return data
        if isinstance(data, str):
            data_dict = json.loads(data)
            # "node_state" round-trips by member name.
            data_dict["node_state"] = NodeState[data_dict["node_state"]]
            return RdzvEvent(**data_dict)
        raise TypeError(f"Cannot deserialize {type(data)} into an RdzvEvent")

    def serialize(self) -> str:
        return json.dumps(asdict(self))
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/events/handlers.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ import logging
10
+ from typing import Dict
11
+
12
+
13
# Registry of shared, preconstructed logging handlers keyed by destination.
_log_handlers: Dict[str, logging.Handler] = {
    "console": logging.StreamHandler(),
    "dynamic_rendezvous": logging.NullHandler(),
    "null": logging.NullHandler(),
}


def get_logging_handler(destination: str = "null") -> logging.Handler:
    """Return the shared handler registered under *destination*.

    Raises KeyError for unknown destinations.
    """
    return _log_handlers[destination]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ import abc
10
+ import time
11
+ import warnings
12
+ from collections import namedtuple
13
+ from functools import wraps
14
+ from typing import Dict, Optional
15
+
16
+ __all__ = ['MetricsConfig', 'MetricHandler', 'ConsoleMetricHandler', 'NullMetricHandler', 'MetricStream',
17
+ 'configure', 'getStream', 'prof', 'profile', 'put_metric', 'publish_metric', 'get_elapsed_time_ms',
18
+ 'MetricData']
19
+
20
+ MetricData = namedtuple("MetricData", ["timestamp", "group_name", "name", "value"])
21
+
22
+
23
+ class MetricsConfig:
24
+ __slots__ = ["params"]
25
+
26
+ def __init__(self, params: Optional[Dict[str, str]] = None):
27
+ self.params = params
28
+ if self.params is None:
29
+ self.params = {}
30
+
31
+
32
+ class MetricHandler(abc.ABC):
33
+ @abc.abstractmethod
34
+ def emit(self, metric_data: MetricData):
35
+ pass
36
+
37
+
38
+ class ConsoleMetricHandler(MetricHandler):
39
+ def emit(self, metric_data: MetricData):
40
+ print(
41
+ f"[{metric_data.timestamp}][{metric_data.group_name}]: {metric_data.name}={metric_data.value}"
42
+ )
43
+
44
+
45
+ class NullMetricHandler(MetricHandler):
46
+ def emit(self, metric_data: MetricData):
47
+ pass
48
+
49
+
50
+ class MetricStream:
51
+ def __init__(self, group_name: str, handler: MetricHandler):
52
+ self.group_name = group_name
53
+ self.handler = handler
54
+
55
+ def add_value(self, metric_name: str, metric_value: int):
56
+ self.handler.emit(
57
+ MetricData(time.time(), self.group_name, metric_name, metric_value)
58
+ )
59
+
60
+
61
+ _metrics_map: Dict[str, MetricHandler] = {}
62
+ _default_metrics_handler: MetricHandler = NullMetricHandler()
63
+
64
+
65
+ # pyre-fixme[9]: group has type `str`; used as `None`.
66
+ def configure(handler: MetricHandler, group: Optional[str] = None):
67
+ if group is None:
68
+ global _default_metrics_handler
69
+ # pyre-fixme[9]: _default_metrics_handler has type `NullMetricHandler`; used
70
+ # as `MetricHandler`.
71
+ _default_metrics_handler = handler
72
+ else:
73
+ _metrics_map[group] = handler
74
+
75
+
76
+ def getStream(group: str):
77
+ if group in _metrics_map:
78
+ handler = _metrics_map[group]
79
+ else:
80
+ handler = _default_metrics_handler
81
+ return MetricStream(group, handler)
82
+
83
+
84
+ def _get_metric_name(fn):
85
+ qualname = fn.__qualname__
86
+ split = qualname.split(".")
87
+ if len(split) == 1:
88
+ module = fn.__module__
89
+ if module:
90
+ return module.split(".")[-1] + "." + split[0]
91
+ else:
92
+ return split[0]
93
+ else:
94
+ return qualname
95
+
96
+
97
+ def prof(fn=None, group: str = "torchelastic"):
98
+ r"""
99
+ @profile decorator publishes duration.ms, count, success, failure metrics for the function that it decorates.
100
+
101
+ The metric name defaults to the qualified name (``class_name.def_name``) of the function.
102
+ If the function does not belong to a class, it uses the leaf module name instead.
103
+
104
+ Usage
105
+
106
+ ::
107
+
108
+ @metrics.prof
109
+ def x():
110
+ pass
111
+
112
+ @metrics.prof(group="agent")
113
+ def y():
114
+ pass
115
+ """
116
+
117
+ def wrap(f):
118
+ @wraps(f)
119
+ def wrapper(*args, **kwargs):
120
+ key = _get_metric_name(f)
121
+ try:
122
+ start = time.time()
123
+ result = f(*args, **kwargs)
124
+ put_metric(f"{key}.success", 1, group)
125
+ except Exception:
126
+ put_metric(f"{key}.failure", 1, group)
127
+ raise
128
+ finally:
129
+ put_metric(f"{key}.duration.ms", get_elapsed_time_ms(start), group) # type: ignore[possibly-undefined]
130
+ return result
131
+
132
+ return wrapper
133
+
134
+ if fn:
135
+ return wrap(fn)
136
+ else:
137
+ return wrap
138
+
139
+
140
+ def profile(group=None):
141
+ """
142
+ @profile decorator adds latency and success/failure metrics to any given function.
143
+
144
+ Usage
145
+
146
+ ::
147
+
148
+ @metrics.profile("my_metric_group")
149
+ def some_function(<arguments>):
150
+ """
151
+ warnings.warn("Deprecated, use @prof instead", DeprecationWarning)
152
+
153
+ def wrap(func):
154
+ @wraps(func)
155
+ def wrapper(*args, **kwargs):
156
+ try:
157
+ start_time = time.time()
158
+ result = func(*args, **kwargs)
159
+ publish_metric(group, f"{func.__name__}.success", 1)
160
+ except Exception:
161
+ publish_metric(group, f"{func.__name__}.failure", 1)
162
+ raise
163
+ finally:
164
+ publish_metric(
165
+ group,
166
+ f"{func.__name__}.duration.ms",
167
+ get_elapsed_time_ms(start_time), # type: ignore[possibly-undefined]
168
+ )
169
+ return result
170
+
171
+ return wrapper
172
+
173
+ return wrap
174
+
175
+
176
+ def put_metric(metric_name: str, metric_value: int, metric_group: str = "torchelastic"):
177
+ """
178
+ Publish a metric data point.
179
+
180
+ Usage
181
+
182
+ ::
183
+
184
+ put_metric("metric_name", 1)
185
+ put_metric("metric_name", 1, "metric_group_name")
186
+ """
187
+ getStream(metric_group).add_value(metric_name, metric_value)
188
+
189
+
190
+ def publish_metric(metric_group: str, metric_name: str, metric_value: int):
191
+ warnings.warn(
192
+ "Deprecated, use put_metric(metric_group)(metric_name, metric_value) instead"
193
+ )
194
+ metric_stream = getStream(metric_group)
195
+ metric_stream.add_value(metric_name, metric_value)
196
+
197
+
198
+ def get_elapsed_time_ms(start_time_in_seconds: float):
199
+ """Return the elapsed time in millis from the given start time."""
200
+ end_time = time.time()
201
+ return int((end_time - start_time_in_seconds) * 1000)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/redirects.cpython-311.pyc ADDED
Binary file (4.54 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ """
10
+ Each host in a distributed PyTorch job runs with a single TorchElastic agent,
11
+ and multiple workers (as children processes of the TorchElastic agent).
12
+ Since the workers are user-provided (your PyTorch script/job), TorchElastic
13
+ has a way to propagate errors on the trainers through the agent and up to the
14
+ scheduler, which ultimately informs the end-user about the state of the job
15
+ and applies any retry policies.
16
+
17
+ TorchElastic categorizes errors into 3 categories:
18
+
19
+ +----------------+----------------+--------------------------------------------------------------+
20
+ | Category | Sub-Category | Description |
21
+ +================+================+==============================================================+
22
+ | User Error | Input Error | invalid inputs to TorchElastic APIs (e.g. min > max nodes) |
23
+ | +----------------+--------------------------------------------------------------+
24
+ | | Worker Failure | any failures on the worker child process |
25
+ +----------------+----------------+--------------------------------------------------------------+
26
+ | Platform Error | n/a | failures caused by the agent |
27
+ +----------------+----------------+--------------------------------------------------------------+
28
+ | Infra Error | n/a | failures outside the domain of the agent and workers |
29
+ | | | (e.g. host failures) |
30
+ +----------------+----------------+--------------------------------------------------------------+
31
+
32
+ All errors other than "Worker Failure" are either raised canonically from the
33
+ agent process or implicitly or explicitly crash the agent process. So the
34
+ standard language (python) provided exception handling strategies apply.
35
+
36
+ Worker Failures are special because the exception/failure originates on a different
37
+ process from the agent so the error needs to be propagated inter-process
38
+ (e.g. the agent cannot simply ``try-catch`` an exception raised on the worker process).
39
+
40
+ TorchElastic agents use :func:`torch.distributed.elastic.multiprocessing.start_processes`
41
+ to launch the workers which has a simple file based inter-process error propagation
42
+ built-in.
43
+
44
+ Any function or binary entrypoint decorated with :func:`record`
45
+ will write uncaught exceptions (with the trace information) to a file specified by the
46
+ environment variable ``TORCHELASTIC_ERROR_FILE``. The parent process (e.g. agent)
47
+ sets this env var on each child it launches, then aggregates the error files for all
48
+ children, and propagates the one with the **smallest** timestamp (e.g. the **first** error).
49
+ """
50
+
51
+ import json
52
+ import os
53
+ import signal
54
+ import socket
55
+ import time
56
+ import warnings
57
+ from dataclasses import dataclass, field
58
+ from datetime import datetime
59
+ from functools import wraps
60
+ from string import Template
61
+ from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
62
+
63
+ from torch.distributed.elastic.utils.logging import get_logger
64
+
65
+ from .error_handler import ErrorHandler # noqa: F401
66
+ from .handlers import get_error_handler # noqa: F401
67
+
68
+ __all__ = ["ProcessFailure", "ChildFailedError", "record", "ErrorHandler", "get_error_handler"]
69
+
70
+ log = get_logger(__name__)
71
+
72
+
73
+ JSON = Dict
74
+
75
+ _EMPTY_ERROR_DATA = {"message": "<NONE>"}
76
+ _NOT_AVAILABLE = "<N/A>"
77
+
78
+ T = TypeVar("T")
79
+
80
+
81
@dataclass
class ProcessFailure:
    """
    Represent the failed process result. When the worker process fails, it may record failure root cause into the file.

    Tries to read the failure timestamp from the provided ``error_file``,
    if the ``error_file`` does not exist, the timestamp is the current
    timestamp (seconds since epoch).

    The ``message`` field is a concise explanation of the failure. If
    the error file exists then the message is obtained from the error file.
    Otherwise one is generated based on the failure signature.

    .. note:: It is assumed that the ``error_file`` is written by
       ``torch.distributed.elastic.multiprocessing.errors.error_handler.ErrorHandler``.
       Otherwise the behavior is undefined.

    """

    local_rank: int
    pid: int
    exitcode: int
    error_file: str
    # The three fields below are derived in __post_init__ from error_file
    # (or set to defaults when the file is absent/unreadable).
    error_file_data: JSON = field(init=False)
    message: str = field(init=False)
    timestamp: int = field(init=False)

    def __post_init__(self):
        self.error_file_data = _EMPTY_ERROR_DATA
        if os.path.isfile(self.error_file):
            try:
                with open(self.error_file) as fp:
                    self.error_file_data = json.load(fp)
                log.debug(
                    "User process failed with error data: %s", json.dumps(self.error_file_data, indent=2)
                )
                self.message, self.timestamp = self._get_error_data(
                    self.error_file_data
                )
            except Exception:
                # A present-but-unparsable reply file is a hard error: log it
                # and re-raise rather than guessing at a failure cause.
                log.exception("Failed to parse reply file: %s", self.error_file)
                raise
        else:
            self._set_no_reply_file()

        # make up an informative message if not already present
        if not self.message:
            # signals typically do not generate an error file message
            if self.exitcode < 0:
                self.message = (
                    f"Signal {-self.exitcode} ({self.signal_name()})"
                    f" received by PID {self.pid}"
                )
            else:
                self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html"

    def _get_error_data(self, error_file_data: Dict[str, Any]) -> Tuple[str, int]:
        # "message" is either a plain string (then the timestamp sits next to
        # it under "timestamp") or a structured dict that carries it under
        # "extraInfo".
        message = error_file_data["message"]
        if isinstance(message, str):
            timestamp = int(error_file_data.get("timestamp", 0))
        else:
            timestamp = int(message["extraInfo"]["timestamp"])
        return (message, timestamp)

    def _set_no_reply_file(self):
        # No reply file on disk: use placeholders and the current time.
        self.error_file = _NOT_AVAILABLE
        self.error_file_data = _EMPTY_ERROR_DATA
        self.message = ""
        self.timestamp = int(time.time())

    def signal_name(self) -> str:
        if self.exitcode < 0:
            # We don't want to kill the parent process trying to find the signal name.
            # if the signal doesn't map to a known name, use not available.
            try:
                return signal.Signals(-self.exitcode).name
            except Exception:
                return _NOT_AVAILABLE
        else:
            return _NOT_AVAILABLE

    def timestamp_isoformat(self):
        """Return timestamp in ISO format (YYYY-MM-DD_HH:MM:SS)."""
        return datetime.fromtimestamp(self.timestamp).isoformat(sep="_")
165
+
166
+
167
+ GlobalRank = int
168
+
169
+ _FAILURE_FORMAT_TEMPLATE = """[${idx}]:
170
+ time : ${time}
171
+ host : ${hostname}
172
+ rank : ${rank} (local_rank: ${local_rank})
173
+ exitcode : ${exitcode} (pid: ${pid})
174
+ error_file: ${error_file}
175
+ traceback : ${message}"""
176
+
177
+ # extra new lines before and after are intentional
178
+ _MSG_FORMAT_TEMPLATE = """
179
+ ${boarder}
180
+ ${title}
181
+ ${section}
182
+ Failures:
183
+ ${other_failures}
184
+ ${section}
185
+ Root Cause (first observed failure):
186
+ ${root_failure}
187
+ ${boarder}"""
188
+
189
+
190
+ class ChildFailedError(Exception):
191
+ """
192
+ Special exception type that can be raised from a function annotated with the
193
+ ``@record`` decorator to have the child process' (root exception) propagate
194
+ up the stack as-is (e.g. without being wrapped in the parent's traceback).
195
+
196
+ Useful in cases where the parent is a simple nanny process
197
+ and the child (worker) processes are actually doing meaningful compute.
198
+ In this case, errors typically occur on the child process as the parent
199
+ is not doing anything non-trivial, and child errors should be propagated
200
+ to the scheduler for accurate root cause diagnostics.
201
+
202
+ .. note:: The propagation relies on error files rather than exception handling to
203
+ support both function and binary launches.
204
+
205
+ Example:
206
+ ::
207
+
208
+ # process tree on a host (container)
209
+ 0: scheduler-init-process:
210
+ |- 1: torchelastic_agent:
211
+ |- 2: trainer_0 (ok)
212
+ |- 3: trainer_1 (fail) -> error.json
213
+ |- ...
214
+ |- n+2: trainer_n (ok)
215
+ |- n+3: other processes
216
+ |- ...
217
+
218
+ In the example above, trainer 1's failure (written into error.json) is
219
+ the root cause and should be reported to the scheduler's init process.
220
+ The torchelastic agent raises a ``ChildFailedError("trainer", {1: "trainer_1/error.json"})``
221
+ upon detecting trainer 1's failure which would propagate the contents
222
+ of trainer 1's error file to the scheduler's init process.
223
+ """
224
+
225
+ def __init__(self, name: str, failures: Dict[GlobalRank, ProcessFailure]):
226
+ self.name = name
227
+ self.failures = failures
228
+ assert (
229
+ self.failures
230
+ ) # does not make sense to create a ChildFaileError with no failures
231
+ super().__init__(self.format_msg())
232
+
233
+ def get_first_failure(self) -> Tuple[GlobalRank, ProcessFailure]:
234
+ rank = min(self.failures.keys(), key=lambda r: self.failures[r].timestamp)
235
+ return rank, self.failures[rank]
236
+
237
+ def format_msg(self, boarder_delim="=", section_delim="-"):
238
+ title = f"{self.name} FAILED"
239
+ root_rank, root_failure = self.get_first_failure()
240
+
241
+ root_failure_fmt: str = ""
242
+ other_failures_fmt: List[str] = []
243
+ width = len(title)
244
+ for idx, (rank, failure) in enumerate(self.failures.items()):
245
+ fmt, w = self._format_failure(idx, rank, failure)
246
+ width = max(width, w)
247
+ if rank == root_rank:
248
+ root_failure_fmt = fmt
249
+ else:
250
+ other_failures_fmt.append(fmt)
251
+
252
+ # upper boundary on width
253
+ width = min(width, 60)
254
+
255
+ return Template(_MSG_FORMAT_TEMPLATE).substitute(
256
+ boarder=boarder_delim * width,
257
+ title=title,
258
+ section=section_delim * width,
259
+ root_failure=root_failure_fmt,
260
+ other_failures="\n".join(other_failures_fmt or [" <NO_OTHER_FAILURES>"]),
261
+ )
262
+
263
+ def _format_failure(
264
+ self, idx: int, rank: int, failure: ProcessFailure
265
+ ) -> Tuple[str, int]:
266
+
267
+ # failure.message is either a str (when the failure does not generate a traceback - e.g. signals)
268
+ # or a dict (json) of the form
269
+ # {"message": $ERROR_MSG, "extraInfo": {"py_callstack": $TRACEBACK, timestamp: $TS}}
270
+ # so the display logic is:
271
+ # 1. if failure.message is not a dict (it is a str) just show it as is
272
+ # 2. else try to get the traceback (py_callstack)
273
+ # 3. if the traceback is not there, use the message
274
+ # 4. if the message is not there show <N/A>
275
+ msg = failure.message
276
+ if isinstance(failure.message, dict):
277
+ msg = (
278
+ failure.message.get("extraInfo", {})
279
+ .get("py_callstack", failure.message.get("message", "<N/A>"))
280
+ .replace("\n", "\n ") # to properly indent the traceback
281
+ )
282
+
283
+ fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
284
+ idx=idx,
285
+ time=failure.timestamp_isoformat(),
286
+ hostname=socket.getfqdn(),
287
+ rank=rank,
288
+ local_rank=failure.local_rank,
289
+ exitcode=failure.exitcode,
290
+ pid=failure.pid,
291
+ error_file=failure.error_file,
292
+ message=msg,
293
+ )
294
+ width = 0
295
+ for line in fmt.split("\n"):
296
+ width = max(width, len(line))
297
+ return fmt, width
298
+
299
+
300
+ def record(
301
+ fn: Callable[..., T], error_handler: Optional[ErrorHandler] = None
302
+ ) -> Callable[..., T]:
303
+ """
304
+ Syntactic sugar to record errors/exceptions that happened in the decorated
305
+ function using the provided ``error_handler``.
306
+
307
+ Using this decorator is equivalent to:
308
+
309
+ ::
310
+
311
+ error_handler = get_error_handler()
312
+ error_handler.initialize()
313
+ try:
314
+ foobar()
315
+ except ChildFailedError as e:
316
+ _, failure = e.get_first_failure()
317
+ error_handler.dump_error_file(failure.error_file, failure.exitcode)
318
+ raise
319
+ except Exception as e:
320
+ error_handler.record(e)
321
+ raise
322
+
323
+ .. important:: use this decorator once per process at the top level method,
324
+ typically this is the main method.
325
+
326
+ Example
327
+
328
+ ::
329
+
330
+ @record
331
+ def main():
332
+ pass
333
+
334
+ if __name__=="__main__":
335
+ main()
336
+
337
+ """
338
+ if not error_handler:
339
+ error_handler = get_error_handler()
340
+
341
+ def wrap(f):
342
+ @wraps(f)
343
+ def wrapper(*args, **kwargs):
344
+ assert error_handler is not None # assertion for mypy type checker
345
+ error_handler.initialize()
346
+ try:
347
+ return f(*args, **kwargs)
348
+ except SystemExit as se:
349
+ # For run_path based entrypoints, SystemExit with code = 0 will never exit.
350
+ # Handling it here by returning a value:
351
+ if se.code == 0:
352
+ return None
353
+ else:
354
+ raise
355
+ except ChildFailedError as e:
356
+ rank, failure = e.get_first_failure()
357
+ if failure.error_file != _NOT_AVAILABLE:
358
+ error_handler.dump_error_file(failure.error_file, failure.exitcode)
359
+ else:
360
+ log.info(
361
+ (
362
+ "local_rank %s FAILED with no error file."
363
+ " Decorate your entrypoint fn with @record for traceback info."
364
+ " See: https://pytorch.org/docs/stable/elastic/errors.html",
365
+ rank
366
+ )
367
+ )
368
+ raise
369
+ except Exception as e:
370
+ error_handler.record_exception(e)
371
+ raise
372
+
373
+ return wrapper
374
+
375
+ return wrap(fn)
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/handlers.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ # Multiprocessing error-reporting module
9
+
10
+
11
+ from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler
12
+
13
+ __all__ = ['get_error_handler']
14
+
15
+ def get_error_handler():
16
+ return ErrorHandler()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/handlers.cpython-311.pyc ADDED
Binary file (937 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/subprocess_handler.cpython-311.pyc ADDED
Binary file (3.72 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ from typing import Dict, Tuple
9
+
10
+ from torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler import (
11
+ SubprocessHandler,
12
+ )
13
+
14
+ __all__ = ["get_subprocess_handler"]
15
+
16
+
17
+ def get_subprocess_handler(
18
+ entrypoint: str,
19
+ args: Tuple,
20
+ env: Dict[str, str],
21
+ stdout: str,
22
+ stderr: str,
23
+ local_rank_id: int,
24
+ ):
25
+ return SubprocessHandler(
26
+ entrypoint=entrypoint,
27
+ args=args,
28
+ env=env,
29
+ stdout=stdout,
30
+ stderr=stderr,
31
+ local_rank_id=local_rank_id,
32
+ )
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/__init__.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Expiration timers are set up on the same process as the agent and
9
+ used from your script to deal with stuck workers. When you go into
10
+ a code-block that has the potential to get stuck you can acquire
11
+ an expiration timer, which instructs the timer server to kill the
12
+ process if it does not release the timer by the self-imposed expiration
13
+ deadline.
14
+
15
+ Usage::
16
+
17
+ import torchelastic.timer as timer
18
+ import torchelastic.agent.server as agent
19
+
20
+ def main():
21
+ start_method = "spawn"
22
+ message_queue = mp.get_context(start_method).Queue()
23
+ server = timer.LocalTimerServer(message, max_interval=0.01)
24
+ server.start() # non-blocking
25
+
26
+ spec = WorkerSpec(
27
+ fn=trainer_func,
28
+ args=(message_queue,),
29
+ ...<OTHER_PARAMS...>)
30
+ agent = agent.LocalElasticAgent(spec, start_method)
31
+ agent.run()
32
+
33
+ def trainer_func(message_queue):
34
+ timer.configure(timer.LocalTimerClient(message_queue))
35
+ with timer.expires(after=60): # 60 second expiry
36
+ # do some work
37
+
38
+ In the example above if ``trainer_func`` takes more than 60 seconds to
39
+ complete, then the worker process is killed and the agent retries the worker group.
40
+ """
41
+
42
+ from .api import TimerClient, TimerRequest, TimerServer, configure, expires # noqa: F401
43
+ from .local_timer import LocalTimerClient, LocalTimerServer # noqa: F401
44
+ from .file_based_local_timer import FileTimerClient, FileTimerServer, FileTimerRequest # noqa: F401
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/__pycache__/local_timer.cpython-311.pyc ADDED
Binary file (7.63 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/elastic/timer/local_timer.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import logging
7
+ import multiprocessing as mp
8
+ import os
9
+ import signal
10
+ import time
11
+ from queue import Empty
12
+ from typing import Any, Dict, List, Set, Tuple
13
+
14
+ from .api import RequestQueue, TimerClient, TimerRequest, TimerServer
15
+
16
+ __all__ = ['LocalTimerClient', 'MultiprocessingRequestQueue', 'LocalTimerServer']
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ class LocalTimerClient(TimerClient):
21
+ """
22
+ Client side of ``LocalTimerServer``. This client is meant to be used
23
+ on the same host that the ``LocalTimerServer`` is running on and uses
24
+ pid to uniquely identify a worker. This is particularly useful in situations
25
+ where one spawns a subprocess (trainer) per GPU on a host with multiple
26
+ GPU devices.
27
+ """
28
+
29
+ def __init__(self, mp_queue):
30
+ super().__init__()
31
+ self._mp_queue = mp_queue
32
+
33
+ def acquire(self, scope_id, expiration_time):
34
+ pid = os.getpid()
35
+ acquire_request = TimerRequest(pid, scope_id, expiration_time)
36
+ self._mp_queue.put(acquire_request)
37
+
38
+ def release(self, scope_id):
39
+ pid = os.getpid()
40
+ release_request = TimerRequest(pid, scope_id, -1)
41
+ self._mp_queue.put(release_request)
42
+
43
+
44
+ class MultiprocessingRequestQueue(RequestQueue):
45
+ """
46
+ A ``RequestQueue`` backed by python ``multiprocessing.Queue``
47
+ """
48
+
49
+ def __init__(self, mp_queue: mp.Queue):
50
+ super().__init__()
51
+ self._mp_queue = mp_queue
52
+
53
+ def size(self) -> int:
54
+ return self._mp_queue.qsize()
55
+
56
+ def get(self, size, timeout: float) -> List[TimerRequest]:
57
+ requests = []
58
+ wait = timeout
59
+ for _ in range(0, size):
60
+ start = time.time()
61
+
62
+ try:
63
+ r = self._mp_queue.get(block=True, timeout=wait)
64
+ except Empty:
65
+ break
66
+
67
+ requests.append(r)
68
+ wait = wait - (time.time() - start)
69
+ if wait <= 0:
70
+ break
71
+
72
+ return requests
73
+
74
+
75
+ class LocalTimerServer(TimerServer):
76
+ """
77
+ Server that works with ``LocalTimerClient``. Clients are expected to be
78
+ subprocesses to the parent process that is running this server. Each host
79
+ in the job is expected to start its own timer server locally and each
80
+ server instance manages timers for local workers (running on processes
81
+ on the same host).
82
+ """
83
+
84
+ def __init__(
85
+ self, mp_queue: mp.Queue, max_interval: float = 60, daemon: bool = True
86
+ ):
87
+ super().__init__(MultiprocessingRequestQueue(mp_queue), max_interval, daemon)
88
+ self._timers: Dict[Tuple[Any, str], TimerRequest] = {}
89
+
90
+ def register_timers(self, timer_requests: List[TimerRequest]) -> None:
91
+ for request in timer_requests:
92
+ pid = request.worker_id
93
+ scope_id = request.scope_id
94
+ expiration_time = request.expiration_time
95
+
96
+ # negative expiration is a proxy for a release call
97
+ if expiration_time < 0:
98
+ self._timers.pop((pid, scope_id), None)
99
+ else:
100
+ self._timers[(pid, scope_id)] = request
101
+
102
+ def clear_timers(self, worker_ids: Set[int]) -> None:
103
+ for (pid, scope_id) in list(self._timers.keys()):
104
+ if pid in worker_ids:
105
+ self._timers.pop((pid, scope_id))
106
+
107
+ def get_expired_timers(self, deadline: float) -> Dict[Any, List[TimerRequest]]:
108
+ # pid -> [timer_requests...]
109
+ expired_timers: Dict[Any, List[TimerRequest]] = {}
110
+ for request in self._timers.values():
111
+ if request.expiration_time <= deadline:
112
+ expired_scopes = expired_timers.setdefault(request.worker_id, [])
113
+ expired_scopes.append(request)
114
+ return expired_timers
115
+
116
+ def _reap_worker(self, worker_id: int) -> bool:
117
+ try:
118
+ os.kill(worker_id, signal.SIGKILL)
119
+ return True
120
+ except ProcessLookupError:
121
+ log.info("Process with pid=%s does not exist. Skipping", worker_id)
122
+ return True
123
+ except Exception:
124
+ log.exception("Error terminating pid=%s", worker_id)
125
+ return False
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import torch
2
+ if torch.distributed.rpc.is_available():
3
+ from .api.remote_module import RemoteModule
4
+ from .functional import * # noqa: F403
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/api/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/jit/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (225 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/nn/jit/templates/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/optim/__pycache__/post_localSGD_optimizer.cpython-311.pyc ADDED
Binary file (6.34 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/pipeline/sync/__pycache__/copy.cpython-311.pyc ADDED
Binary file (5.95 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/pipeline/sync/__pycache__/worker.cpython-311.pyc ADDED
Binary file (7.46 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (225 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/api.cpython-311.pyc ADDED
Binary file (5.2 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/loss.cpython-311.pyc ADDED
Binary file (21 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/__pycache__/style.cpython-311.pyc ADDED
Binary file (28 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/distributed/tensor/parallel/api.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates
2
+ from typing import Dict, Union
3
+
4
+ import torch
5
+ import torch.distributed._tensor.random as random
6
+ import torch.nn as nn
7
+ from torch.distributed._tensor import (
8
+ DeviceMesh,
9
+ )
10
+ from torch.distributed._tensor.random import (
11
+ is_rng_supported_mesh,
12
+ TensorParallelRNGTracker,
13
+ )
14
+ from torch.distributed.tensor.parallel._utils import _validate_tp_mesh_dim
15
+ from torch.distributed.tensor.parallel.style import (
16
+ ParallelStyle,
17
+ )
18
+
19
+
20
+ __all__ = [
21
+ "parallelize_module",
22
+ ]
23
+
24
+
25
+ def parallelize_module( # type: ignore[return]
26
+ module: nn.Module,
27
+ device_mesh: DeviceMesh,
28
+ parallelize_plan: Union[ParallelStyle, Dict[str, ParallelStyle]],
29
+ ) -> nn.Module:
30
+ """
31
+ Apply Tensor Parallelism in PyTorch by parallelizing modules or sub-modules based on a user-specified plan.
32
+
33
+ We parallelize module or sub_modules based on a parallelize_plan. The parallelize_plan contains
34
+ :class:`ParallelStyle`, which indicates how user wants the module or sub_module
35
+ to be parallelized.
36
+
37
+ User can also specify different parallel style per module fully qualified name (FQN).
38
+
39
+ Note that ``parallelize_module`` only accepts a 1-D :class:`DeviceMesh`, if you have a 2-D or N-D :class:`DeviceMesh`,
40
+ slice the DeviceMesh to a 1-D sub DeviceMesh first then pass to this API(i.e. ``device_mesh[\"tp\"]``)
41
+
42
+ Args:
43
+ module (:class:`nn.Module`):
44
+ Module to be parallelized.
45
+ device_mesh (:class:`DeviceMesh`):
46
+ Object which describes the mesh topology
47
+ of devices for the DTensor.
48
+ parallelize_plan (Union[:class:`ParallelStyle`, Dict[str, :class:`ParallelStyle`]]):
49
+ The plan used to parallelize the module. It can be either a
50
+ :class:`ParallelStyle` object which contains how
51
+ we prepare input/output for Tensor Parallelism or it can be a
52
+ dict of module FQN and its corresponding :class:`ParallelStyle` object.
53
+ Return:
54
+ A :class:`nn.Module` object parallelized.
55
+
56
+ Example::
57
+ >>> # xdoctest: +SKIP("distributed")
58
+ >>> from torch.distributed.tensor.parallel import parallelize_module, ColwiseParallel
59
+ >>> from torch.distributed.device_mesh import init_device_mesh
60
+ >>>
61
+ >>> # Define the module.
62
+ >>> m = Model(...)
63
+ >>> tp_mesh = init_device_mesh("cuda", (8,))
64
+ >>> m = parallelize_module(m, tp_mesh, {"w1": ColwiseParallel(), "w2": RowwiseParallel()})
65
+ >>>
66
+
67
+ .. note:: For complex module architecture like Attention, MLP layers, we recommend composing
68
+ different ParallelStyles together (i.e. ``ColwiseParallel`` and ``RowwiseParallel``) and pass
69
+ as a parallelize_plan, to achieves the desired sharding computation.
70
+ """
71
+ torch._C._log_api_usage_once("torch.distributed.tensor.parallel.parallelize_module")
72
+
73
+ _validate_tp_mesh_dim(device_mesh)
74
+
75
+ # instantiate a TP RNG state tracker if it's not there
76
+ if is_rng_supported_mesh(device_mesh) and not isinstance(
77
+ random._rng_tracker, TensorParallelRNGTracker
78
+ ):
79
+ random._rng_tracker = TensorParallelRNGTracker(device_mesh.device_type)
80
+ # TODO: we should allow user to pass in the default seed from a config
81
+ random._rng_tracker._manual_seed(device_mesh, base_seed=1234)
82
+ # By default we execute random ops in non-tensor-parallel region. If users want
83
+ # to execute in tensor-parallel region, they can manually set this field to True
84
+ # after parallelizing the model.
85
+ random._rng_tracker.distribute_region_enabled = False
86
+
87
+ if isinstance(parallelize_plan, ParallelStyle):
88
+ return parallelize_plan._apply(module, device_mesh)
89
+ elif isinstance(parallelize_plan, dict):
90
+ for module_path, parallelize_style in parallelize_plan.items():
91
+ sub_module = module.get_submodule(module_path)
92
+ parent_module = module
93
+ if "." in module_path:
94
+ parent_module_path = ".".join(module_path.split(".")[:-1])
95
+ parent_module = module.get_submodule(parent_module_path)
96
+ module_path = module_path.split(".")[-1]
97
+ parent_module.register_module( # type: ignore[call-arg] # pyre-ignore[20]
98
+ module_path,
99
+ parallelize_module( # type: ignore[arg-type]
100
+ sub_module, device_mesh, parallelize_style # type: ignore[arg-type] # pyre-ignore[6]
101
+ ),
102
+ )
103
+ return module
104
+ else:
105
+ raise RuntimeError( # pyre-ignore[7]
106
+ "Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for"
107
+ f" parallelize_plan, {type(parallelize_plan)} found!"
108
+ )