Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/dynamic_tf_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/dynamic_tf_policy_v2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/eager_tf_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/eager_tf_policy_v2.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy_map.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy_template.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/rnn_sequencing.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/sample_batch.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/tf_mixins.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/tf_policy_template.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/torch_policy.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/view_requirement.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/__init__.py +115 -0
- .venv/lib/python3.11/site-packages/ray/tune/automl/__init__.py +1 -0
- .venv/lib/python3.11/site-packages/ray/tune/automl/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/callback.py +512 -0
- .venv/lib/python3.11/site-packages/ray/tune/constants.py +32 -0
- .venv/lib/python3.11/site-packages/ray/tune/context.py +113 -0
- .venv/lib/python3.11/site-packages/ray/tune/error.py +48 -0
- .venv/lib/python3.11/site-packages/ray/tune/examples/cifar10_pytorch.py +285 -0
- .venv/lib/python3.11/site-packages/ray/tune/examples/lightgbm_example.py +105 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/class_cache.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/cluster_info.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/experiment_state.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/insufficient_resources_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/placement_groups.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/class_cache.py +68 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/cluster_info.py +12 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/experiment_state.py +287 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/insufficient_resources_manager.py +167 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/placement_groups.py +131 -0
- .venv/lib/python3.11/site-packages/ray/tune/execution/tune_controller.py +2181 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/__init__.py +4 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/config_parser.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/experiment.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/trial.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/config_parser.py +210 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/experiment.py +445 -0
- .venv/lib/python3.11/site-packages/ray/tune/experiment/trial.py +1073 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/keras.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/lightgbm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/pytorch_lightning.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/ray_train.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/xgboost.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/tune/integration/keras.py +28 -0
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/dynamic_tf_policy.cpython-311.pyc
ADDED
|
Binary file (60.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/dynamic_tf_policy_v2.cpython-311.pyc
ADDED
|
Binary file (45 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/eager_tf_policy.cpython-311.pyc
ADDED
|
Binary file (49.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/eager_tf_policy_v2.cpython-311.pyc
ADDED
|
Binary file (42.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy.cpython-311.pyc
ADDED
|
Binary file (74.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy_map.cpython-311.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/policy_template.cpython-311.pyc
ADDED
|
Binary file (22.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/rnn_sequencing.cpython-311.pyc
ADDED
|
Binary file (29.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/sample_batch.cpython-311.pyc
ADDED
|
Binary file (77.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/tf_mixins.cpython-311.pyc
ADDED
|
Binary file (20.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/tf_policy_template.cpython-311.pyc
ADDED
|
Binary file (18.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/torch_policy.cpython-311.pyc
ADDED
|
Binary file (58.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/rllib/policy/__pycache__/view_requirement.cpython-311.pyc
ADDED
|
Binary file (7.91 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/__init__.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# isort: off
|
| 2 |
+
# Try import ray[tune] core requirements (defined in setup.py)
|
| 3 |
+
try:
|
| 4 |
+
import fsspec # noqa: F401
|
| 5 |
+
import pandas # noqa: F401
|
| 6 |
+
import pyarrow # noqa: F401
|
| 7 |
+
import requests # noqa: F401
|
| 8 |
+
except ImportError as exc:
|
| 9 |
+
raise ImportError(
|
| 10 |
+
"Can't import ray.tune as some dependencies are missing. "
|
| 11 |
+
'Run `pip install "ray[tune]"` to fix.'
|
| 12 |
+
) from exc
|
| 13 |
+
# isort: on
|
| 14 |
+
|
| 15 |
+
from ray.air.result import Result
|
| 16 |
+
from ray.tune.analysis import ExperimentAnalysis
|
| 17 |
+
from ray.tune.callback import Callback
|
| 18 |
+
from ray.tune.context import TuneContext, get_context
|
| 19 |
+
from ray.tune.error import TuneError
|
| 20 |
+
from ray.tune.execution.placement_groups import PlacementGroupFactory
|
| 21 |
+
from ray.tune.experiment import Experiment
|
| 22 |
+
from ray.tune.impl.config import CheckpointConfig, FailureConfig, RunConfig
|
| 23 |
+
from ray.tune.progress_reporter import (
|
| 24 |
+
CLIReporter,
|
| 25 |
+
JupyterNotebookReporter,
|
| 26 |
+
ProgressReporter,
|
| 27 |
+
)
|
| 28 |
+
from ray.tune.registry import register_env, register_trainable
|
| 29 |
+
from ray.tune.result_grid import ResultGrid
|
| 30 |
+
from ray.tune.schedulers import create_scheduler
|
| 31 |
+
from ray.tune.search import create_searcher, grid_search
|
| 32 |
+
from ray.tune.search.sample import (
|
| 33 |
+
choice,
|
| 34 |
+
lograndint,
|
| 35 |
+
loguniform,
|
| 36 |
+
qlograndint,
|
| 37 |
+
qloguniform,
|
| 38 |
+
qrandint,
|
| 39 |
+
qrandn,
|
| 40 |
+
quniform,
|
| 41 |
+
randint,
|
| 42 |
+
randn,
|
| 43 |
+
sample_from,
|
| 44 |
+
uniform,
|
| 45 |
+
)
|
| 46 |
+
from ray.tune.stopper import Stopper
|
| 47 |
+
from ray.tune.syncer import SyncConfig
|
| 48 |
+
from ray.tune.trainable import Trainable
|
| 49 |
+
from ray.tune.trainable.trainable_fn_utils import Checkpoint, get_checkpoint, report
|
| 50 |
+
from ray.tune.trainable.util import with_parameters, with_resources
|
| 51 |
+
from ray.tune.tune import run, run_experiments
|
| 52 |
+
from ray.tune.tune_config import ResumeConfig, TuneConfig
|
| 53 |
+
from ray.tune.tuner import Tuner
|
| 54 |
+
|
| 55 |
+
__all__ = [
|
| 56 |
+
"Trainable",
|
| 57 |
+
"Callback",
|
| 58 |
+
"TuneError",
|
| 59 |
+
"grid_search",
|
| 60 |
+
"register_env",
|
| 61 |
+
"register_trainable",
|
| 62 |
+
"run",
|
| 63 |
+
"run_experiments",
|
| 64 |
+
"with_parameters",
|
| 65 |
+
"with_resources",
|
| 66 |
+
"Stopper",
|
| 67 |
+
"Experiment",
|
| 68 |
+
"sample_from",
|
| 69 |
+
"uniform",
|
| 70 |
+
"quniform",
|
| 71 |
+
"choice",
|
| 72 |
+
"randint",
|
| 73 |
+
"lograndint",
|
| 74 |
+
"qrandint",
|
| 75 |
+
"qlograndint",
|
| 76 |
+
"randn",
|
| 77 |
+
"qrandn",
|
| 78 |
+
"loguniform",
|
| 79 |
+
"qloguniform",
|
| 80 |
+
"ExperimentAnalysis",
|
| 81 |
+
"CLIReporter",
|
| 82 |
+
"JupyterNotebookReporter",
|
| 83 |
+
"ProgressReporter",
|
| 84 |
+
"ResultGrid",
|
| 85 |
+
"create_searcher",
|
| 86 |
+
"create_scheduler",
|
| 87 |
+
"PlacementGroupFactory",
|
| 88 |
+
"Tuner",
|
| 89 |
+
"TuneConfig",
|
| 90 |
+
"ResumeConfig",
|
| 91 |
+
"RunConfig",
|
| 92 |
+
"CheckpointConfig",
|
| 93 |
+
"FailureConfig",
|
| 94 |
+
"Result",
|
| 95 |
+
"Checkpoint",
|
| 96 |
+
"get_checkpoint",
|
| 97 |
+
"report",
|
| 98 |
+
"get_context",
|
| 99 |
+
"TuneContext",
|
| 100 |
+
# TODO(justinvyu): [Deprecated]
|
| 101 |
+
"SyncConfig",
|
| 102 |
+
]
|
| 103 |
+
|
| 104 |
+
report.__module__ = "ray.tune"
|
| 105 |
+
get_checkpoint.__module__ = "ray.tune"
|
| 106 |
+
get_context.__module__ = "ray.tune"
|
| 107 |
+
TuneContext.__module__ = "ray.tune"
|
| 108 |
+
Checkpoint.__module__ = "ray.tune"
|
| 109 |
+
Result.__module__ = "ray.tune"
|
| 110 |
+
RunConfig.__module__ = "ray.tune"
|
| 111 |
+
CheckpointConfig.__module__ = "ray.tune"
|
| 112 |
+
FailureConfig.__module__ = "ray.tune"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# DO NOT ADD ANYTHING AFTER THIS LINE.
|
.venv/lib/python3.11/site-packages/ray/tune/automl/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
raise DeprecationWarning("`ray.tune.automl` is deprecated in Ray 2.6.")
|
.venv/lib/python3.11/site-packages/ray/tune/automl/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (275 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/callback.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import glob
|
| 2 |
+
import warnings
|
| 3 |
+
from abc import ABCMeta
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import ray.tune
|
| 8 |
+
from ray.tune.utils.util import _atomic_save, _load_newest_checkpoint
|
| 9 |
+
from ray.util.annotations import DeveloperAPI, PublicAPI
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from ray.tune.experiment import Trial
|
| 13 |
+
from ray.tune.stopper import Stopper
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class _CallbackMeta(ABCMeta):
|
| 17 |
+
"""A helper metaclass to ensure container classes (e.g. CallbackList) have
|
| 18 |
+
implemented all the callback methods (e.g. `on_*`).
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def __new__(mcs, name: str, bases: Tuple[type], attrs: Dict[str, Any]) -> type:
|
| 22 |
+
cls = super().__new__(mcs, name, bases, attrs)
|
| 23 |
+
|
| 24 |
+
if mcs.need_check(cls, name, bases, attrs):
|
| 25 |
+
mcs.check(cls, name, bases, attrs)
|
| 26 |
+
|
| 27 |
+
return cls
|
| 28 |
+
|
| 29 |
+
@classmethod
|
| 30 |
+
def need_check(
|
| 31 |
+
mcs, cls: type, name: str, bases: Tuple[type], attrs: Dict[str, Any]
|
| 32 |
+
) -> bool:
|
| 33 |
+
return attrs.get("IS_CALLBACK_CONTAINER", False)
|
| 34 |
+
|
| 35 |
+
@classmethod
|
| 36 |
+
def check(
|
| 37 |
+
mcs, cls: type, name: str, bases: Tuple[type], attrs: Dict[str, Any]
|
| 38 |
+
) -> None:
|
| 39 |
+
methods = set()
|
| 40 |
+
for base in bases:
|
| 41 |
+
methods.update(
|
| 42 |
+
attr_name
|
| 43 |
+
for attr_name, attr in vars(base).items()
|
| 44 |
+
if mcs.need_override_by_subclass(attr_name, attr)
|
| 45 |
+
)
|
| 46 |
+
overridden = {
|
| 47 |
+
attr_name
|
| 48 |
+
for attr_name, attr in attrs.items()
|
| 49 |
+
if mcs.need_override_by_subclass(attr_name, attr)
|
| 50 |
+
}
|
| 51 |
+
missing = methods.difference(overridden)
|
| 52 |
+
if missing:
|
| 53 |
+
raise TypeError(
|
| 54 |
+
f"Found missing callback method: {missing} "
|
| 55 |
+
f"in class {cls.__module__}.{cls.__qualname__}."
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
@classmethod
|
| 59 |
+
def need_override_by_subclass(mcs, attr_name: str, attr: Any) -> bool:
|
| 60 |
+
return (
|
| 61 |
+
(
|
| 62 |
+
attr_name.startswith("on_")
|
| 63 |
+
and not attr_name.startswith("on_trainer_init")
|
| 64 |
+
)
|
| 65 |
+
or attr_name == "setup"
|
| 66 |
+
) and callable(attr)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@PublicAPI(stability="beta")
|
| 70 |
+
class Callback(metaclass=_CallbackMeta):
|
| 71 |
+
"""Tune base callback that can be extended and passed to a ``TrialRunner``
|
| 72 |
+
|
| 73 |
+
Tune callbacks are called from within the ``TrialRunner`` class. There are
|
| 74 |
+
several hooks that can be used, all of which are found in the submethod
|
| 75 |
+
definitions of this base class.
|
| 76 |
+
|
| 77 |
+
The parameters passed to the ``**info`` dict vary between hooks. The
|
| 78 |
+
parameters passed are described in the docstrings of the methods.
|
| 79 |
+
|
| 80 |
+
This example will print a metric each time a result is received:
|
| 81 |
+
|
| 82 |
+
.. testcode::
|
| 83 |
+
|
| 84 |
+
from ray import train, tune
|
| 85 |
+
from ray.tune import Callback
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class MyCallback(Callback):
|
| 89 |
+
def on_trial_result(self, iteration, trials, trial, result,
|
| 90 |
+
**info):
|
| 91 |
+
print(f"Got result: {result['metric']}")
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def train_func(config):
|
| 95 |
+
for i in range(10):
|
| 96 |
+
tune.report(metric=i)
|
| 97 |
+
|
| 98 |
+
tuner = tune.Tuner(
|
| 99 |
+
train_func,
|
| 100 |
+
run_config=train.RunConfig(
|
| 101 |
+
callbacks=[MyCallback()]
|
| 102 |
+
)
|
| 103 |
+
)
|
| 104 |
+
tuner.fit()
|
| 105 |
+
|
| 106 |
+
.. testoutput::
|
| 107 |
+
:hide:
|
| 108 |
+
|
| 109 |
+
...
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# File templates for any artifacts written by this callback
|
| 113 |
+
# These files should live in the `trial.local_path` for each trial.
|
| 114 |
+
# TODO(ml-team): Make this more visible to users to override. Internal use for now.
|
| 115 |
+
_SAVED_FILE_TEMPLATES = []
|
| 116 |
+
|
| 117 |
+
# arguments here match Experiment.public_spec
|
| 118 |
+
def setup(
|
| 119 |
+
self,
|
| 120 |
+
stop: Optional["Stopper"] = None,
|
| 121 |
+
num_samples: Optional[int] = None,
|
| 122 |
+
total_num_samples: Optional[int] = None,
|
| 123 |
+
**info,
|
| 124 |
+
):
|
| 125 |
+
"""Called once at the very beginning of training.
|
| 126 |
+
|
| 127 |
+
Any Callback setup should be added here (setting environment
|
| 128 |
+
variables, etc.)
|
| 129 |
+
|
| 130 |
+
Arguments:
|
| 131 |
+
stop: Stopping criteria.
|
| 132 |
+
If ``time_budget_s`` was passed to ``train.RunConfig``, a
|
| 133 |
+
``TimeoutStopper`` will be passed here, either by itself
|
| 134 |
+
or as a part of a ``CombinedStopper``.
|
| 135 |
+
num_samples: Number of times to sample from the
|
| 136 |
+
hyperparameter space. Defaults to 1. If `grid_search` is
|
| 137 |
+
provided as an argument, the grid will be repeated
|
| 138 |
+
`num_samples` of times. If this is -1, (virtually) infinite
|
| 139 |
+
samples are generated until a stopping condition is met.
|
| 140 |
+
total_num_samples: Total number of samples factoring
|
| 141 |
+
in grid search samplers.
|
| 142 |
+
**info: Kwargs dict for forward compatibility.
|
| 143 |
+
"""
|
| 144 |
+
pass
|
| 145 |
+
|
| 146 |
+
def on_step_begin(self, iteration: int, trials: List["Trial"], **info):
|
| 147 |
+
"""Called at the start of each tuning loop step.
|
| 148 |
+
|
| 149 |
+
Arguments:
|
| 150 |
+
iteration: Number of iterations of the tuning loop.
|
| 151 |
+
trials: List of trials.
|
| 152 |
+
**info: Kwargs dict for forward compatibility.
|
| 153 |
+
"""
|
| 154 |
+
pass
|
| 155 |
+
|
| 156 |
+
def on_step_end(self, iteration: int, trials: List["Trial"], **info):
|
| 157 |
+
"""Called at the end of each tuning loop step.
|
| 158 |
+
|
| 159 |
+
The iteration counter is increased before this hook is called.
|
| 160 |
+
|
| 161 |
+
Arguments:
|
| 162 |
+
iteration: Number of iterations of the tuning loop.
|
| 163 |
+
trials: List of trials.
|
| 164 |
+
**info: Kwargs dict for forward compatibility.
|
| 165 |
+
"""
|
| 166 |
+
pass
|
| 167 |
+
|
| 168 |
+
def on_trial_start(
|
| 169 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 170 |
+
):
|
| 171 |
+
"""Called after starting a trial instance.
|
| 172 |
+
|
| 173 |
+
Arguments:
|
| 174 |
+
iteration: Number of iterations of the tuning loop.
|
| 175 |
+
trials: List of trials.
|
| 176 |
+
trial: Trial that just has been started.
|
| 177 |
+
**info: Kwargs dict for forward compatibility.
|
| 178 |
+
|
| 179 |
+
"""
|
| 180 |
+
pass
|
| 181 |
+
|
| 182 |
+
def on_trial_restore(
|
| 183 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 184 |
+
):
|
| 185 |
+
"""Called after restoring a trial instance.
|
| 186 |
+
|
| 187 |
+
Arguments:
|
| 188 |
+
iteration: Number of iterations of the tuning loop.
|
| 189 |
+
trials: List of trials.
|
| 190 |
+
trial: Trial that just has been restored.
|
| 191 |
+
**info: Kwargs dict for forward compatibility.
|
| 192 |
+
"""
|
| 193 |
+
pass
|
| 194 |
+
|
| 195 |
+
def on_trial_save(
|
| 196 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 197 |
+
):
|
| 198 |
+
"""Called after receiving a checkpoint from a trial.
|
| 199 |
+
|
| 200 |
+
Arguments:
|
| 201 |
+
iteration: Number of iterations of the tuning loop.
|
| 202 |
+
trials: List of trials.
|
| 203 |
+
trial: Trial that just saved a checkpoint.
|
| 204 |
+
**info: Kwargs dict for forward compatibility.
|
| 205 |
+
"""
|
| 206 |
+
pass
|
| 207 |
+
|
| 208 |
+
def on_trial_result(
|
| 209 |
+
self,
|
| 210 |
+
iteration: int,
|
| 211 |
+
trials: List["Trial"],
|
| 212 |
+
trial: "Trial",
|
| 213 |
+
result: Dict,
|
| 214 |
+
**info,
|
| 215 |
+
):
|
| 216 |
+
"""Called after receiving a result from a trial.
|
| 217 |
+
|
| 218 |
+
The search algorithm and scheduler are notified before this
|
| 219 |
+
hook is called.
|
| 220 |
+
|
| 221 |
+
Arguments:
|
| 222 |
+
iteration: Number of iterations of the tuning loop.
|
| 223 |
+
trials: List of trials.
|
| 224 |
+
trial: Trial that just sent a result.
|
| 225 |
+
result: Result that the trial sent.
|
| 226 |
+
**info: Kwargs dict for forward compatibility.
|
| 227 |
+
"""
|
| 228 |
+
pass
|
| 229 |
+
|
| 230 |
+
def on_trial_complete(
|
| 231 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 232 |
+
):
|
| 233 |
+
"""Called after a trial instance completed.
|
| 234 |
+
|
| 235 |
+
The search algorithm and scheduler are notified before this
|
| 236 |
+
hook is called.
|
| 237 |
+
|
| 238 |
+
Arguments:
|
| 239 |
+
iteration: Number of iterations of the tuning loop.
|
| 240 |
+
trials: List of trials.
|
| 241 |
+
trial: Trial that just has been completed.
|
| 242 |
+
**info: Kwargs dict for forward compatibility.
|
| 243 |
+
"""
|
| 244 |
+
pass
|
| 245 |
+
|
| 246 |
+
def on_trial_recover(
|
| 247 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 248 |
+
):
|
| 249 |
+
"""Called after a trial instance failed (errored) but the trial is scheduled
|
| 250 |
+
for retry.
|
| 251 |
+
|
| 252 |
+
The search algorithm and scheduler are not notified.
|
| 253 |
+
|
| 254 |
+
Arguments:
|
| 255 |
+
iteration: Number of iterations of the tuning loop.
|
| 256 |
+
trials: List of trials.
|
| 257 |
+
trial: Trial that just has errored.
|
| 258 |
+
**info: Kwargs dict for forward compatibility.
|
| 259 |
+
"""
|
| 260 |
+
pass
|
| 261 |
+
|
| 262 |
+
def on_trial_error(
|
| 263 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 264 |
+
):
|
| 265 |
+
"""Called after a trial instance failed (errored).
|
| 266 |
+
|
| 267 |
+
The search algorithm and scheduler are notified before this
|
| 268 |
+
hook is called.
|
| 269 |
+
|
| 270 |
+
Arguments:
|
| 271 |
+
iteration: Number of iterations of the tuning loop.
|
| 272 |
+
trials: List of trials.
|
| 273 |
+
trial: Trial that just has errored.
|
| 274 |
+
**info: Kwargs dict for forward compatibility.
|
| 275 |
+
"""
|
| 276 |
+
pass
|
| 277 |
+
|
| 278 |
+
def on_checkpoint(
|
| 279 |
+
self,
|
| 280 |
+
iteration: int,
|
| 281 |
+
trials: List["Trial"],
|
| 282 |
+
trial: "Trial",
|
| 283 |
+
checkpoint: "ray.tune.Checkpoint",
|
| 284 |
+
**info,
|
| 285 |
+
):
|
| 286 |
+
"""Called after a trial saved a checkpoint with Tune.
|
| 287 |
+
|
| 288 |
+
Arguments:
|
| 289 |
+
iteration: Number of iterations of the tuning loop.
|
| 290 |
+
trials: List of trials.
|
| 291 |
+
trial: Trial that just has errored.
|
| 292 |
+
checkpoint: Checkpoint object that has been saved
|
| 293 |
+
by the trial.
|
| 294 |
+
**info: Kwargs dict for forward compatibility.
|
| 295 |
+
"""
|
| 296 |
+
pass
|
| 297 |
+
|
| 298 |
+
def on_experiment_end(self, trials: List["Trial"], **info):
|
| 299 |
+
"""Called after experiment is over and all trials have concluded.
|
| 300 |
+
|
| 301 |
+
Arguments:
|
| 302 |
+
trials: List of trials.
|
| 303 |
+
**info: Kwargs dict for forward compatibility.
|
| 304 |
+
"""
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
def get_state(self) -> Optional[Dict]:
|
| 308 |
+
"""Get the state of the callback.
|
| 309 |
+
|
| 310 |
+
This method should be implemented by subclasses to return a dictionary
|
| 311 |
+
representation of the object's current state.
|
| 312 |
+
|
| 313 |
+
This is called automatically by Tune to periodically checkpoint callback state.
|
| 314 |
+
Upon :ref:`Tune experiment restoration <tune-experiment-level-fault-tolerance>`,
|
| 315 |
+
callback state will be restored via :meth:`~ray.tune.Callback.set_state`.
|
| 316 |
+
|
| 317 |
+
.. testcode::
|
| 318 |
+
|
| 319 |
+
from typing import Dict, List, Optional
|
| 320 |
+
|
| 321 |
+
from ray.tune import Callback
|
| 322 |
+
from ray.tune.experiment import Trial
|
| 323 |
+
|
| 324 |
+
class MyCallback(Callback):
|
| 325 |
+
def __init__(self):
|
| 326 |
+
self._trial_ids = set()
|
| 327 |
+
|
| 328 |
+
def on_trial_start(
|
| 329 |
+
self, iteration: int, trials: List["Trial"], trial: "Trial", **info
|
| 330 |
+
):
|
| 331 |
+
self._trial_ids.add(trial.trial_id)
|
| 332 |
+
|
| 333 |
+
def get_state(self) -> Optional[Dict]:
|
| 334 |
+
return {"trial_ids": self._trial_ids.copy()}
|
| 335 |
+
|
| 336 |
+
def set_state(self, state: Dict) -> Optional[Dict]:
|
| 337 |
+
self._trial_ids = state["trial_ids"]
|
| 338 |
+
|
| 339 |
+
Returns:
|
| 340 |
+
dict: State of the callback. Should be `None` if the callback does not
|
| 341 |
+
have any state to save (this is the default).
|
| 342 |
+
"""
|
| 343 |
+
return None
|
| 344 |
+
|
| 345 |
+
def set_state(self, state: Dict):
|
| 346 |
+
"""Set the state of the callback.
|
| 347 |
+
|
| 348 |
+
This method should be implemented by subclasses to restore the callback's
|
| 349 |
+
state based on the given dict state.
|
| 350 |
+
|
| 351 |
+
This is used automatically by Tune to restore checkpoint callback state
|
| 352 |
+
on :ref:`Tune experiment restoration <tune-experiment-level-fault-tolerance>`.
|
| 353 |
+
|
| 354 |
+
See :meth:`~ray.tune.Callback.get_state` for an example implementation.
|
| 355 |
+
|
| 356 |
+
Args:
|
| 357 |
+
state: State of the callback.
|
| 358 |
+
"""
|
| 359 |
+
pass
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
@DeveloperAPI
|
| 363 |
+
class CallbackList(Callback):
|
| 364 |
+
"""Call multiple callbacks at once."""
|
| 365 |
+
|
| 366 |
+
IS_CALLBACK_CONTAINER = True
|
| 367 |
+
CKPT_FILE_TMPL = "callback-states-{}.pkl"
|
| 368 |
+
|
| 369 |
+
def __init__(self, callbacks: List[Callback]):
|
| 370 |
+
self._callbacks = callbacks
|
| 371 |
+
|
| 372 |
+
def setup(self, **info):
|
| 373 |
+
for callback in self._callbacks:
|
| 374 |
+
try:
|
| 375 |
+
callback.setup(**info)
|
| 376 |
+
except TypeError as e:
|
| 377 |
+
if "argument" in str(e):
|
| 378 |
+
warnings.warn(
|
| 379 |
+
"Please update `setup` method in callback "
|
| 380 |
+
f"`{callback.__class__}` to match the method signature"
|
| 381 |
+
" in `ray.tune.callback.Callback`.",
|
| 382 |
+
FutureWarning,
|
| 383 |
+
)
|
| 384 |
+
callback.setup()
|
| 385 |
+
else:
|
| 386 |
+
raise e
|
| 387 |
+
|
| 388 |
+
def on_step_begin(self, **info):
|
| 389 |
+
for callback in self._callbacks:
|
| 390 |
+
callback.on_step_begin(**info)
|
| 391 |
+
|
| 392 |
+
def on_step_end(self, **info):
|
| 393 |
+
for callback in self._callbacks:
|
| 394 |
+
callback.on_step_end(**info)
|
| 395 |
+
|
| 396 |
+
def on_trial_start(self, **info):
|
| 397 |
+
for callback in self._callbacks:
|
| 398 |
+
callback.on_trial_start(**info)
|
| 399 |
+
|
| 400 |
+
def on_trial_restore(self, **info):
|
| 401 |
+
for callback in self._callbacks:
|
| 402 |
+
callback.on_trial_restore(**info)
|
| 403 |
+
|
| 404 |
+
def on_trial_save(self, **info):
|
| 405 |
+
for callback in self._callbacks:
|
| 406 |
+
callback.on_trial_save(**info)
|
| 407 |
+
|
| 408 |
+
def on_trial_result(self, **info):
|
| 409 |
+
for callback in self._callbacks:
|
| 410 |
+
callback.on_trial_result(**info)
|
| 411 |
+
|
| 412 |
+
def on_trial_complete(self, **info):
|
| 413 |
+
for callback in self._callbacks:
|
| 414 |
+
callback.on_trial_complete(**info)
|
| 415 |
+
|
| 416 |
+
def on_trial_recover(self, **info):
|
| 417 |
+
for callback in self._callbacks:
|
| 418 |
+
callback.on_trial_recover(**info)
|
| 419 |
+
|
| 420 |
+
def on_trial_error(self, **info):
|
| 421 |
+
for callback in self._callbacks:
|
| 422 |
+
callback.on_trial_error(**info)
|
| 423 |
+
|
| 424 |
+
def on_checkpoint(self, **info):
|
| 425 |
+
for callback in self._callbacks:
|
| 426 |
+
callback.on_checkpoint(**info)
|
| 427 |
+
|
| 428 |
+
def on_experiment_end(self, **info):
|
| 429 |
+
for callback in self._callbacks:
|
| 430 |
+
callback.on_experiment_end(**info)
|
| 431 |
+
|
| 432 |
+
def get_state(self) -> Optional[Dict]:
|
| 433 |
+
"""Gets the state of all callbacks contained within this list.
|
| 434 |
+
If there are no stateful callbacks, then None will be returned in order
|
| 435 |
+
to avoid saving an unnecessary callback checkpoint file."""
|
| 436 |
+
state = {}
|
| 437 |
+
any_stateful_callbacks = False
|
| 438 |
+
for i, callback in enumerate(self._callbacks):
|
| 439 |
+
callback_state = callback.get_state()
|
| 440 |
+
if callback_state:
|
| 441 |
+
any_stateful_callbacks = True
|
| 442 |
+
state[i] = callback_state
|
| 443 |
+
if not any_stateful_callbacks:
|
| 444 |
+
return None
|
| 445 |
+
return state
|
| 446 |
+
|
| 447 |
+
def set_state(self, state: Dict):
|
| 448 |
+
"""Sets the state for all callbacks contained within this list.
|
| 449 |
+
Skips setting state for all stateless callbacks where `get_state`
|
| 450 |
+
returned None."""
|
| 451 |
+
for i, callback in enumerate(self._callbacks):
|
| 452 |
+
callback_state = state.get(i, None)
|
| 453 |
+
if callback_state:
|
| 454 |
+
callback.set_state(callback_state)
|
| 455 |
+
|
| 456 |
+
def save_to_dir(self, checkpoint_dir: str, session_str: str = "default"):
|
| 457 |
+
"""Save the state of the callback list to the checkpoint_dir.
|
| 458 |
+
|
| 459 |
+
Args:
|
| 460 |
+
checkpoint_dir: directory where the checkpoint is stored.
|
| 461 |
+
session_str: Unique identifier of the current run session (ex: timestamp).
|
| 462 |
+
"""
|
| 463 |
+
state_dict = self.get_state()
|
| 464 |
+
|
| 465 |
+
if state_dict:
|
| 466 |
+
file_name = self.CKPT_FILE_TMPL.format(session_str)
|
| 467 |
+
tmp_file_name = f".tmp-{file_name}"
|
| 468 |
+
_atomic_save(
|
| 469 |
+
state=state_dict,
|
| 470 |
+
checkpoint_dir=checkpoint_dir,
|
| 471 |
+
file_name=file_name,
|
| 472 |
+
tmp_file_name=tmp_file_name,
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
def restore_from_dir(self, checkpoint_dir: str):
|
| 476 |
+
"""Restore the state of the list of callbacks from the checkpoint_dir.
|
| 477 |
+
|
| 478 |
+
You should check if it's possible to restore with `can_restore`
|
| 479 |
+
before calling this method.
|
| 480 |
+
|
| 481 |
+
Args:
|
| 482 |
+
checkpoint_dir: directory where the checkpoint is stored.
|
| 483 |
+
|
| 484 |
+
Raises:
|
| 485 |
+
RuntimeError: if unable to find checkpoint.
|
| 486 |
+
NotImplementedError: if the `set_state` method is not implemented.
|
| 487 |
+
"""
|
| 488 |
+
state_dict = _load_newest_checkpoint(
|
| 489 |
+
checkpoint_dir, self.CKPT_FILE_TMPL.format("*")
|
| 490 |
+
)
|
| 491 |
+
if not state_dict:
|
| 492 |
+
raise RuntimeError(
|
| 493 |
+
"Unable to find checkpoint in {}.".format(checkpoint_dir)
|
| 494 |
+
)
|
| 495 |
+
self.set_state(state_dict)
|
| 496 |
+
|
| 497 |
+
def can_restore(self, checkpoint_dir: str) -> bool:
|
| 498 |
+
"""Check if the checkpoint_dir contains the saved state for this callback list.
|
| 499 |
+
|
| 500 |
+
Returns:
|
| 501 |
+
can_restore: True if the checkpoint_dir contains a file of the
|
| 502 |
+
format `CKPT_FILE_TMPL`. False otherwise.
|
| 503 |
+
"""
|
| 504 |
+
return any(
|
| 505 |
+
glob.iglob(Path(checkpoint_dir, self.CKPT_FILE_TMPL.format("*")).as_posix())
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
+
def __len__(self) -> int:
|
| 509 |
+
return len(self._callbacks)
|
| 510 |
+
|
| 511 |
+
def __getitem__(self, i: int) -> "Callback":
|
| 512 |
+
return self._callbacks[i]
|
.venv/lib/python3.11/site-packages/ray/tune/constants.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================
|
| 2 |
+
# Environment Variables
|
| 3 |
+
# ==================================================
|
| 4 |
+
|
| 5 |
+
# NOTE: When adding a new environment variable, please track it in this list.
|
| 6 |
+
TUNE_ENV_VARS = {
|
| 7 |
+
"RAY_AIR_LOCAL_CACHE_DIR",
|
| 8 |
+
"TUNE_DISABLE_AUTO_CALLBACK_LOGGERS",
|
| 9 |
+
"TUNE_DISABLE_AUTO_INIT",
|
| 10 |
+
"TUNE_DISABLE_DATED_SUBDIR",
|
| 11 |
+
"TUNE_DISABLE_STRICT_METRIC_CHECKING",
|
| 12 |
+
"TUNE_DISABLE_SIGINT_HANDLER",
|
| 13 |
+
"TUNE_FORCE_TRIAL_CLEANUP_S",
|
| 14 |
+
"TUNE_FUNCTION_THREAD_TIMEOUT_S",
|
| 15 |
+
"TUNE_GLOBAL_CHECKPOINT_S",
|
| 16 |
+
"TUNE_MAX_LEN_IDENTIFIER",
|
| 17 |
+
"TUNE_MAX_PENDING_TRIALS_PG",
|
| 18 |
+
"TUNE_PLACEMENT_GROUP_PREFIX",
|
| 19 |
+
"TUNE_PLACEMENT_GROUP_RECON_INTERVAL",
|
| 20 |
+
"TUNE_PRINT_ALL_TRIAL_ERRORS",
|
| 21 |
+
"TUNE_RESULT_DIR",
|
| 22 |
+
"TUNE_RESULT_BUFFER_LENGTH",
|
| 23 |
+
"TUNE_RESULT_DELIM",
|
| 24 |
+
"TUNE_RESULT_BUFFER_MAX_TIME_S",
|
| 25 |
+
"TUNE_RESULT_BUFFER_MIN_TIME_S",
|
| 26 |
+
"TUNE_WARN_THRESHOLD_S",
|
| 27 |
+
"TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S",
|
| 28 |
+
"TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER",
|
| 29 |
+
"TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S",
|
| 30 |
+
"TUNE_STATE_REFRESH_PERIOD",
|
| 31 |
+
"TUNE_RESTORE_RETRY_NUM",
|
| 32 |
+
}
|
.venv/lib/python3.11/site-packages/ray/tune/context.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import threading
|
| 2 |
+
from typing import Any, Dict, Optional
|
| 3 |
+
|
| 4 |
+
from ray.train._internal import session
|
| 5 |
+
from ray.train.constants import _v2_migration_warnings_enabled
|
| 6 |
+
from ray.train.context import TrainContext as TrainV1Context
|
| 7 |
+
from ray.train.utils import _copy_doc
|
| 8 |
+
from ray.tune.execution.placement_groups import PlacementGroupFactory
|
| 9 |
+
from ray.util.annotations import Deprecated, PublicAPI
|
| 10 |
+
|
| 11 |
+
# The context singleton on this process.
|
| 12 |
+
_tune_context: Optional["TuneContext"] = None
|
| 13 |
+
_tune_context_lock = threading.Lock()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE = (
|
| 17 |
+
"`{}` is deprecated for Ray Tune because there is no concept of worker ranks "
|
| 18 |
+
"for Ray Tune, so these methods only make sense to use in the context of "
|
| 19 |
+
"a Ray Train worker."
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@PublicAPI(stability="beta")
|
| 24 |
+
class TuneContext(TrainV1Context):
|
| 25 |
+
"""Context to access metadata within Ray Tune functions."""
|
| 26 |
+
|
| 27 |
+
# NOTE: These methods are deprecated on the TrainContext, but are still
|
| 28 |
+
# available on the TuneContext. Re-defining them here to avoid the
|
| 29 |
+
# deprecation warnings.
|
| 30 |
+
|
| 31 |
+
@_copy_doc(session.get_trial_name)
|
| 32 |
+
def get_trial_name(self) -> str:
|
| 33 |
+
return session.get_trial_name()
|
| 34 |
+
|
| 35 |
+
@_copy_doc(session.get_trial_id)
|
| 36 |
+
def get_trial_id(self) -> str:
|
| 37 |
+
return session.get_trial_id()
|
| 38 |
+
|
| 39 |
+
@_copy_doc(session.get_trial_resources)
|
| 40 |
+
def get_trial_resources(self) -> PlacementGroupFactory:
|
| 41 |
+
return session.get_trial_resources()
|
| 42 |
+
|
| 43 |
+
@_copy_doc(session.get_trial_dir)
|
| 44 |
+
def get_trial_dir(self) -> str:
|
| 45 |
+
return session.get_trial_dir()
|
| 46 |
+
|
| 47 |
+
# Deprecated APIs
|
| 48 |
+
|
| 49 |
+
@Deprecated
|
| 50 |
+
def get_metadata(self) -> Dict[str, Any]:
|
| 51 |
+
raise DeprecationWarning(
|
| 52 |
+
"`get_metadata` is deprecated for Ray Tune, as it has never been usable."
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
@Deprecated(
|
| 56 |
+
message=_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_world_size"),
|
| 57 |
+
warning=_v2_migration_warnings_enabled(),
|
| 58 |
+
)
|
| 59 |
+
@_copy_doc(TrainV1Context.get_world_size)
|
| 60 |
+
def get_world_size(self) -> int:
|
| 61 |
+
return session.get_world_size()
|
| 62 |
+
|
| 63 |
+
@Deprecated(
|
| 64 |
+
message=_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_world_rank"),
|
| 65 |
+
warning=_v2_migration_warnings_enabled(),
|
| 66 |
+
)
|
| 67 |
+
@_copy_doc(TrainV1Context.get_world_rank)
|
| 68 |
+
def get_world_rank(self) -> int:
|
| 69 |
+
return session.get_world_rank()
|
| 70 |
+
|
| 71 |
+
@Deprecated(
|
| 72 |
+
message=_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_local_rank"),
|
| 73 |
+
warning=_v2_migration_warnings_enabled(),
|
| 74 |
+
)
|
| 75 |
+
@_copy_doc(TrainV1Context.get_local_rank)
|
| 76 |
+
def get_local_rank(self) -> int:
|
| 77 |
+
return session.get_local_rank()
|
| 78 |
+
|
| 79 |
+
@Deprecated(
|
| 80 |
+
message=_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format(
|
| 81 |
+
"get_local_world_size"
|
| 82 |
+
),
|
| 83 |
+
warning=_v2_migration_warnings_enabled(),
|
| 84 |
+
)
|
| 85 |
+
@_copy_doc(TrainV1Context.get_local_world_size)
|
| 86 |
+
def get_local_world_size(self) -> int:
|
| 87 |
+
return session.get_local_world_size()
|
| 88 |
+
|
| 89 |
+
@Deprecated(
|
| 90 |
+
message=_TRAIN_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_node_rank"),
|
| 91 |
+
warning=_v2_migration_warnings_enabled(),
|
| 92 |
+
)
|
| 93 |
+
@_copy_doc(TrainV1Context.get_node_rank)
|
| 94 |
+
def get_node_rank(self) -> int:
|
| 95 |
+
return session.get_node_rank()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@PublicAPI(stability="beta")
|
| 99 |
+
def get_context() -> TuneContext:
|
| 100 |
+
"""Get or create a singleton Ray Tune context.
|
| 101 |
+
|
| 102 |
+
The context is only available in a tune function passed to the `ray.tune.Tuner`.
|
| 103 |
+
|
| 104 |
+
See the :class:`~ray.tune.TuneContext` API reference to see available methods.
|
| 105 |
+
"""
|
| 106 |
+
global _tune_context
|
| 107 |
+
|
| 108 |
+
with _tune_context_lock:
|
| 109 |
+
if _tune_context is None:
|
| 110 |
+
# TODO(justinvyu): This default should be a dummy context
|
| 111 |
+
# that is only used for testing / running outside of Tune.
|
| 112 |
+
_tune_context = TuneContext()
|
| 113 |
+
return _tune_context
|
.venv/lib/python3.11/site-packages/ray/tune/error.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.util.annotations import PublicAPI
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@PublicAPI
|
| 5 |
+
class TuneError(Exception):
|
| 6 |
+
"""General error class raised by ray.tune."""
|
| 7 |
+
|
| 8 |
+
pass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class _AbortTrialExecution(TuneError):
|
| 12 |
+
"""Error that indicates a trial should not be retried."""
|
| 13 |
+
|
| 14 |
+
pass
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class _SubCategoryTuneError(TuneError):
|
| 18 |
+
"""The more specific TuneError that happens for a certain Tune
|
| 19 |
+
subroutine. For example starting/stopping a trial.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, traceback_str: str):
|
| 23 |
+
self.traceback_str = traceback_str
|
| 24 |
+
|
| 25 |
+
def __str__(self):
|
| 26 |
+
return self.traceback_str
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class _TuneStopTrialError(_SubCategoryTuneError):
|
| 30 |
+
"""Error that happens when stopping a tune trial."""
|
| 31 |
+
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class _TuneStartTrialError(_SubCategoryTuneError):
|
| 36 |
+
"""Error that happens when starting a tune trial."""
|
| 37 |
+
|
| 38 |
+
pass
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class _TuneNoNextExecutorEventError(_SubCategoryTuneError):
|
| 42 |
+
"""Error that happens when waiting to get the next event to
|
| 43 |
+
handle from RayTrialExecutor.
|
| 44 |
+
|
| 45 |
+
Note: RayTaskError will be raised by itself and will not be using
|
| 46 |
+
this category. This category is for everything else."""
|
| 47 |
+
|
| 48 |
+
pass
|
.venv/lib/python3.11/site-packages/ray/tune/examples/cifar10_pytorch.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa
|
| 2 |
+
# fmt: off
|
| 3 |
+
|
| 4 |
+
# __import_begin__
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
from typing import Dict
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
import torch.optim as optim
|
| 14 |
+
import torchvision
|
| 15 |
+
import torchvision.transforms as transforms
|
| 16 |
+
from filelock import FileLock
|
| 17 |
+
from torch.utils.data import random_split
|
| 18 |
+
|
| 19 |
+
import ray
|
| 20 |
+
from ray import train, tune
|
| 21 |
+
from ray.train import Checkpoint
|
| 22 |
+
from ray.tune.schedulers import ASHAScheduler
|
| 23 |
+
|
| 24 |
+
# __import_end__
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# __load_data_begin__
|
| 28 |
+
DATA_DIR = tempfile.mkdtemp()
|
| 29 |
+
|
| 30 |
+
def load_data(data_dir):
|
| 31 |
+
transform = transforms.Compose([
|
| 32 |
+
transforms.ToTensor(),
|
| 33 |
+
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
|
| 34 |
+
])
|
| 35 |
+
|
| 36 |
+
# We add FileLock here because multiple workers will want to
|
| 37 |
+
# download data, and this may cause overwrites since
|
| 38 |
+
# DataLoader is not threadsafe.
|
| 39 |
+
with FileLock(os.path.expanduser("~/.data.lock")):
|
| 40 |
+
trainset = torchvision.datasets.CIFAR10(
|
| 41 |
+
root=data_dir, train=True, download=True, transform=transform)
|
| 42 |
+
|
| 43 |
+
testset = torchvision.datasets.CIFAR10(
|
| 44 |
+
root=data_dir, train=False, download=True, transform=transform)
|
| 45 |
+
|
| 46 |
+
return trainset, testset
|
| 47 |
+
# __load_data_end__
|
| 48 |
+
|
| 49 |
+
def load_test_data():
|
| 50 |
+
# Loads a fake dataset for testing so it doesn't rely on external download.
|
| 51 |
+
trainset = torchvision.datasets.FakeData(
|
| 52 |
+
128, (3, 32, 32), num_classes=10, transform=transforms.ToTensor()
|
| 53 |
+
)
|
| 54 |
+
testset = torchvision.datasets.FakeData(
|
| 55 |
+
16, (3, 32, 32), num_classes=10, transform=transforms.ToTensor()
|
| 56 |
+
)
|
| 57 |
+
return trainset, testset
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# __net_begin__
|
| 61 |
+
class Net(nn.Module):
|
| 62 |
+
def __init__(self, l1=120, l2=84):
|
| 63 |
+
super(Net, self).__init__()
|
| 64 |
+
self.conv1 = nn.Conv2d(3, 6, 5)
|
| 65 |
+
self.pool = nn.MaxPool2d(2, 2)
|
| 66 |
+
self.conv2 = nn.Conv2d(6, 16, 5)
|
| 67 |
+
self.fc1 = nn.Linear(16 * 5 * 5, l1)
|
| 68 |
+
self.fc2 = nn.Linear(l1, l2)
|
| 69 |
+
self.fc3 = nn.Linear(l2, 10)
|
| 70 |
+
|
| 71 |
+
def forward(self, x):
|
| 72 |
+
x = self.pool(F.relu(self.conv1(x)))
|
| 73 |
+
x = self.pool(F.relu(self.conv2(x)))
|
| 74 |
+
x = x.view(-1, 16 * 5 * 5)
|
| 75 |
+
x = F.relu(self.fc1(x))
|
| 76 |
+
x = F.relu(self.fc2(x))
|
| 77 |
+
x = self.fc3(x)
|
| 78 |
+
return x
|
| 79 |
+
# __net_end__
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# __train_begin__
|
| 83 |
+
def train_cifar(config):
|
| 84 |
+
net = Net(config["l1"], config["l2"])
|
| 85 |
+
|
| 86 |
+
device = "cpu"
|
| 87 |
+
if torch.cuda.is_available():
|
| 88 |
+
device = "cuda:0"
|
| 89 |
+
if torch.cuda.device_count() > 1:
|
| 90 |
+
net = nn.DataParallel(net)
|
| 91 |
+
net.to(device)
|
| 92 |
+
|
| 93 |
+
criterion = nn.CrossEntropyLoss()
|
| 94 |
+
optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)
|
| 95 |
+
|
| 96 |
+
# Load existing checkpoint through `get_checkpoint()` API.
|
| 97 |
+
if train.get_checkpoint():
|
| 98 |
+
loaded_checkpoint = train.get_checkpoint()
|
| 99 |
+
with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
|
| 100 |
+
model_state, optimizer_state = torch.load(
|
| 101 |
+
os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
|
| 102 |
+
)
|
| 103 |
+
net.load_state_dict(model_state)
|
| 104 |
+
optimizer.load_state_dict(optimizer_state)
|
| 105 |
+
|
| 106 |
+
if config["smoke_test"]:
|
| 107 |
+
trainset, testset = load_test_data()
|
| 108 |
+
else:
|
| 109 |
+
trainset, testset = load_data(DATA_DIR)
|
| 110 |
+
|
| 111 |
+
test_abs = int(len(trainset) * 0.8)
|
| 112 |
+
train_subset, val_subset = random_split(
|
| 113 |
+
trainset, [test_abs, len(trainset) - test_abs])
|
| 114 |
+
|
| 115 |
+
trainloader = torch.utils.data.DataLoader(
|
| 116 |
+
train_subset,
|
| 117 |
+
batch_size=int(config["batch_size"]),
|
| 118 |
+
shuffle=True,
|
| 119 |
+
num_workers=0 if config["smoke_test"] else 8,
|
| 120 |
+
)
|
| 121 |
+
valloader = torch.utils.data.DataLoader(
|
| 122 |
+
val_subset,
|
| 123 |
+
batch_size=int(config["batch_size"]),
|
| 124 |
+
shuffle=True,
|
| 125 |
+
num_workers=0 if config["smoke_test"] else 8,
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
for epoch in range(10): # loop over the dataset multiple times
|
| 129 |
+
running_loss = 0.0
|
| 130 |
+
epoch_steps = 0
|
| 131 |
+
for i, data in enumerate(trainloader):
|
| 132 |
+
# get the inputs; data is a list of [inputs, labels]
|
| 133 |
+
inputs, labels = data
|
| 134 |
+
inputs, labels = inputs.to(device), labels.to(device)
|
| 135 |
+
|
| 136 |
+
# zero the parameter gradients
|
| 137 |
+
optimizer.zero_grad()
|
| 138 |
+
|
| 139 |
+
# forward + backward + optimize
|
| 140 |
+
outputs = net(inputs)
|
| 141 |
+
loss = criterion(outputs, labels)
|
| 142 |
+
loss.backward()
|
| 143 |
+
optimizer.step()
|
| 144 |
+
|
| 145 |
+
# print statistics
|
| 146 |
+
running_loss += loss.item()
|
| 147 |
+
epoch_steps += 1
|
| 148 |
+
if i % 2000 == 1999: # print every 2000 mini-batches
|
| 149 |
+
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
|
| 150 |
+
running_loss / epoch_steps))
|
| 151 |
+
running_loss = 0.0
|
| 152 |
+
|
| 153 |
+
# Validation loss
|
| 154 |
+
val_loss = 0.0
|
| 155 |
+
val_steps = 0
|
| 156 |
+
total = 0
|
| 157 |
+
correct = 0
|
| 158 |
+
for i, data in enumerate(valloader, 0):
|
| 159 |
+
with torch.no_grad():
|
| 160 |
+
inputs, labels = data
|
| 161 |
+
inputs, labels = inputs.to(device), labels.to(device)
|
| 162 |
+
|
| 163 |
+
outputs = net(inputs)
|
| 164 |
+
_, predicted = torch.max(outputs.data, 1)
|
| 165 |
+
total += labels.size(0)
|
| 166 |
+
correct += (predicted == labels).sum().item()
|
| 167 |
+
|
| 168 |
+
loss = criterion(outputs, labels)
|
| 169 |
+
val_loss += loss.cpu().numpy()
|
| 170 |
+
val_steps += 1
|
| 171 |
+
|
| 172 |
+
# Here we save a checkpoint. It is automatically registered with
|
| 173 |
+
# Ray Tune and will potentially be accessed through in ``get_checkpoint()``
|
| 174 |
+
# in future iterations.
|
| 175 |
+
# Note to save a file like checkpoint, you still need to put it under a directory
|
| 176 |
+
# to construct a checkpoint.
|
| 177 |
+
with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
|
| 178 |
+
path = os.path.join(temp_checkpoint_dir, "checkpoint.pt")
|
| 179 |
+
torch.save(
|
| 180 |
+
(net.state_dict(), optimizer.state_dict()), path
|
| 181 |
+
)
|
| 182 |
+
checkpoint = Checkpoint.from_directory(temp_checkpoint_dir)
|
| 183 |
+
train.report(
|
| 184 |
+
{"loss": (val_loss / val_steps), "accuracy": correct / total},
|
| 185 |
+
checkpoint=checkpoint,
|
| 186 |
+
)
|
| 187 |
+
print("Finished Training")
|
| 188 |
+
# __train_end__
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# __test_acc_begin__
|
| 192 |
+
def test_best_model(config: Dict, checkpoint: "Checkpoint", smoke_test=False):
|
| 193 |
+
best_trained_model = Net(config["l1"], config["l2"])
|
| 194 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 195 |
+
best_trained_model.to(device)
|
| 196 |
+
|
| 197 |
+
with checkpoint.as_directory() as checkpoint_dir:
|
| 198 |
+
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint.pt")
|
| 199 |
+
model_state, optimizer_state = torch.load(checkpoint_path)
|
| 200 |
+
best_trained_model.load_state_dict(model_state)
|
| 201 |
+
|
| 202 |
+
if smoke_test:
|
| 203 |
+
_, testset = load_test_data()
|
| 204 |
+
else:
|
| 205 |
+
_, testset = load_data(DATA_DIR)
|
| 206 |
+
|
| 207 |
+
testloader = torch.utils.data.DataLoader(
|
| 208 |
+
testset, batch_size=4, shuffle=False, num_workers=2)
|
| 209 |
+
|
| 210 |
+
correct = 0
|
| 211 |
+
total = 0
|
| 212 |
+
with torch.no_grad():
|
| 213 |
+
for data in testloader:
|
| 214 |
+
images, labels = data
|
| 215 |
+
images, labels = images.to(device), labels.to(device)
|
| 216 |
+
outputs = best_trained_model(images)
|
| 217 |
+
_, predicted = torch.max(outputs.data, 1)
|
| 218 |
+
total += labels.size(0)
|
| 219 |
+
correct += (predicted == labels).sum().item()
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
print("Best trial test set accuracy: {}".format(correct / total))
|
| 223 |
+
|
| 224 |
+
# __test_acc_end__
|
| 225 |
+
|
| 226 |
+
# __main_begin__
|
| 227 |
+
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2, smoke_test=False):
|
| 228 |
+
config = {
|
| 229 |
+
"l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
|
| 230 |
+
"l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
|
| 231 |
+
"lr": tune.loguniform(1e-4, 1e-1),
|
| 232 |
+
"batch_size": tune.choice([2, 4, 8, 16]),
|
| 233 |
+
"smoke_test": smoke_test,
|
| 234 |
+
}
|
| 235 |
+
scheduler = ASHAScheduler(
|
| 236 |
+
max_t=max_num_epochs,
|
| 237 |
+
grace_period=1,
|
| 238 |
+
reduction_factor=2)
|
| 239 |
+
|
| 240 |
+
tuner = tune.Tuner(
|
| 241 |
+
tune.with_resources(
|
| 242 |
+
tune.with_parameters(train_cifar),
|
| 243 |
+
resources={"cpu": 2, "gpu": gpus_per_trial},
|
| 244 |
+
),
|
| 245 |
+
tune_config=tune.TuneConfig(
|
| 246 |
+
metric="loss",
|
| 247 |
+
mode="min",
|
| 248 |
+
num_samples=num_samples,
|
| 249 |
+
scheduler=scheduler
|
| 250 |
+
),
|
| 251 |
+
param_space=config,
|
| 252 |
+
)
|
| 253 |
+
results = tuner.fit()
|
| 254 |
+
best_result = results.get_best_result("loss", "min")
|
| 255 |
+
print("Best trial config: {}".format(best_result.config))
|
| 256 |
+
print("Best trial final validation loss: {}".format(
|
| 257 |
+
best_result.metrics["loss"]))
|
| 258 |
+
print("Best trial final validation accuracy: {}".format(
|
| 259 |
+
best_result.metrics["accuracy"]))
|
| 260 |
+
|
| 261 |
+
test_best_model(best_result.config, best_result.checkpoint, smoke_test=smoke_test)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# __main_end__
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
if __name__ == "__main__":
|
| 268 |
+
import argparse
|
| 269 |
+
|
| 270 |
+
parser = argparse.ArgumentParser()
|
| 271 |
+
parser.add_argument(
|
| 272 |
+
"--smoke-test", action="store_true", help="Finish quickly for testing")
|
| 273 |
+
parser.add_argument(
|
| 274 |
+
"--ray-address",
|
| 275 |
+
help="Address of Ray cluster for seamless distributed execution.",
|
| 276 |
+
required=False)
|
| 277 |
+
args, _ = parser.parse_known_args()
|
| 278 |
+
|
| 279 |
+
if args.smoke_test:
|
| 280 |
+
ray.init(num_cpus=2)
|
| 281 |
+
main(num_samples=1, max_num_epochs=1, gpus_per_trial=0, smoke_test=True)
|
| 282 |
+
else:
|
| 283 |
+
ray.init(args.ray_address)
|
| 284 |
+
# Change this to activate training on GPUs
|
| 285 |
+
main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)
|
.venv/lib/python3.11/site-packages/ray/tune/examples/lightgbm_example.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import lightgbm as lgb
|
| 2 |
+
import sklearn.datasets
|
| 3 |
+
import sklearn.metrics
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
|
| 6 |
+
from ray import tune
|
| 7 |
+
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
|
| 8 |
+
from ray.tune.schedulers import ASHAScheduler
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def train_breast_cancer(config: dict):
|
| 12 |
+
# This is a simple training function to be passed into Tune
|
| 13 |
+
|
| 14 |
+
# Load dataset
|
| 15 |
+
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
|
| 16 |
+
|
| 17 |
+
# Split into train and test set
|
| 18 |
+
train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)
|
| 19 |
+
|
| 20 |
+
# Build input Datasets for LightGBM
|
| 21 |
+
train_set = lgb.Dataset(train_x, label=train_y)
|
| 22 |
+
test_set = lgb.Dataset(test_x, label=test_y)
|
| 23 |
+
|
| 24 |
+
# Train the classifier, using the Tune callback
|
| 25 |
+
lgb.train(
|
| 26 |
+
config,
|
| 27 |
+
train_set,
|
| 28 |
+
valid_sets=[test_set],
|
| 29 |
+
valid_names=["eval"],
|
| 30 |
+
verbose_eval=False,
|
| 31 |
+
callbacks=[
|
| 32 |
+
TuneReportCheckpointCallback(
|
| 33 |
+
{
|
| 34 |
+
"binary_error": "eval-binary_error",
|
| 35 |
+
"binary_logloss": "eval-binary_logloss",
|
| 36 |
+
}
|
| 37 |
+
)
|
| 38 |
+
],
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def train_breast_cancer_cv(config: dict):
|
| 43 |
+
# This is a simple training function to be passed into Tune, using
|
| 44 |
+
# lightgbm's cross validation functionality
|
| 45 |
+
|
| 46 |
+
# Load dataset
|
| 47 |
+
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
|
| 48 |
+
|
| 49 |
+
train_set = lgb.Dataset(data, label=target)
|
| 50 |
+
|
| 51 |
+
# Run CV, using the Tune callback
|
| 52 |
+
lgb.cv(
|
| 53 |
+
config,
|
| 54 |
+
train_set,
|
| 55 |
+
verbose_eval=False,
|
| 56 |
+
stratified=True,
|
| 57 |
+
# Checkpointing is not supported for CV
|
| 58 |
+
# LightGBM aggregates metrics over folds automatically
|
| 59 |
+
# with the cv_agg key. Both mean and standard deviation
|
| 60 |
+
# are provided.
|
| 61 |
+
callbacks=[
|
| 62 |
+
TuneReportCheckpointCallback(
|
| 63 |
+
{
|
| 64 |
+
"binary_error": "cv_agg-binary_error-mean",
|
| 65 |
+
"binary_logloss": "cv_agg-binary_logloss-mean",
|
| 66 |
+
"binary_error_stdv": "cv_agg-binary_error-stdv",
|
| 67 |
+
"binary_logloss_stdv": "cv_agg-binary_logloss-stdv",
|
| 68 |
+
},
|
| 69 |
+
frequency=0,
|
| 70 |
+
)
|
| 71 |
+
],
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
import argparse
|
| 77 |
+
|
| 78 |
+
parser = argparse.ArgumentParser()
|
| 79 |
+
parser.add_argument(
|
| 80 |
+
"--use-cv", action="store_true", help="Use `lgb.cv` instead of `lgb.train`."
|
| 81 |
+
)
|
| 82 |
+
args, _ = parser.parse_known_args()
|
| 83 |
+
|
| 84 |
+
config = {
|
| 85 |
+
"objective": "binary",
|
| 86 |
+
"metric": ["binary_error", "binary_logloss"],
|
| 87 |
+
"verbose": -1,
|
| 88 |
+
"boosting_type": tune.grid_search(["gbdt", "dart"]),
|
| 89 |
+
"num_leaves": tune.randint(10, 1000),
|
| 90 |
+
"learning_rate": tune.loguniform(1e-8, 1e-1),
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
tuner = tune.Tuner(
|
| 94 |
+
train_breast_cancer if not args.use_cv else train_breast_cancer_cv,
|
| 95 |
+
tune_config=tune.TuneConfig(
|
| 96 |
+
metric="binary_error",
|
| 97 |
+
mode="min",
|
| 98 |
+
num_samples=2,
|
| 99 |
+
scheduler=ASHAScheduler(),
|
| 100 |
+
),
|
| 101 |
+
param_space=config,
|
| 102 |
+
)
|
| 103 |
+
results = tuner.fit()
|
| 104 |
+
|
| 105 |
+
print("Best hyperparameters found were: ", results.get_best_result().config)
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (191 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/class_cache.cpython-311.pyc
ADDED
|
Binary file (2.78 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/cluster_info.cpython-311.pyc
ADDED
|
Binary file (824 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/experiment_state.cpython-311.pyc
ADDED
|
Binary file (13.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/insufficient_resources_manager.cpython-311.pyc
ADDED
|
Binary file (7.59 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/__pycache__/placement_groups.cpython-311.pyc
ADDED
|
Binary file (5.54 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/execution/class_cache.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from ray.air.constants import COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV
|
| 5 |
+
from ray.train.constants import (
|
| 6 |
+
ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR,
|
| 7 |
+
RAY_CHDIR_TO_TRIAL_DIR,
|
| 8 |
+
)
|
| 9 |
+
from ray.train.v2._internal.constants import (
|
| 10 |
+
ENV_VARS_TO_PROPAGATE as TRAIN_ENV_VARS_TO_PROPAGATE,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
DEFAULT_ENV_VARS = {
|
| 14 |
+
# https://github.com/ray-project/ray/issues/28197
|
| 15 |
+
"PL_DISABLE_FORK": "1"
|
| 16 |
+
}
|
| 17 |
+
ENV_VARS_TO_PROPAGATE = (
|
| 18 |
+
{
|
| 19 |
+
COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV,
|
| 20 |
+
RAY_CHDIR_TO_TRIAL_DIR,
|
| 21 |
+
ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR,
|
| 22 |
+
"AWS_ACCESS_KEY_ID",
|
| 23 |
+
"AWS_SECRET_ACCESS_KEY",
|
| 24 |
+
"AWS_SECURITY_TOKEN",
|
| 25 |
+
"AWS_SESSION_TOKEN",
|
| 26 |
+
}
|
| 27 |
+
# Propagate the Ray Train environment variables from the driver process
|
| 28 |
+
# to the trainable process so that Tune + Train v2 can be used together.
|
| 29 |
+
| TRAIN_ENV_VARS_TO_PROPAGATE
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class _ActorClassCache:
    """Caches the result of wrapping trainable classes with ``ray.remote``.

    ``ray.remote`` is a registration call: it serializes the class and
    uploads it to the cluster's key-value store, from where an arbitrary
    worker fetches it later. Registration consumes no scheduling resources,
    but repeating it for the same class would re-upload the same serialized
    object over and over and bloat the store. This cache guarantees each
    trainable class is registered exactly once per driver process.

    Multiple distinct trainables may coexist in the system at once; each
    gets its own cache entry.
    """

    def __init__(self):
        # Maps trainable class -> ray.remote-wrapped actor class.
        self._cache = {}

    def get(self, trainable_cls):
        """Return the wrapped trainable_cls, calling ``ray.remote`` on first use."""
        # Recompute the runtime env on every call so the propagated values
        # reflect the driver's current environment.
        env_vars = dict(DEFAULT_ENV_VARS)
        env_vars.update(
            (name, os.environ[name])
            for name in ENV_VARS_TO_PROPAGATE
            if name in os.environ
        )

        runtime_env = {"env_vars": env_vars}
        remote_cls = self._cache.get(trainable_cls)
        if remote_cls is None:
            remote_cls = ray.remote(runtime_env=runtime_env)(trainable_cls)
            self._cache[trainable_cls] = remote_cls
        return remote_cls
|
.venv/lib/python3.11/site-packages/ray/tune/execution/cluster_info.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@lru_cache()
|
| 6 |
+
def _is_ray_cluster():
|
| 7 |
+
"""Checks if the bootstrap config file exists.
|
| 8 |
+
|
| 9 |
+
This will always exist if using an autoscaling cluster/started
|
| 10 |
+
with the ray cluster launcher.
|
| 11 |
+
"""
|
| 12 |
+
return Path("~/ray_bootstrap_config.yaml").expanduser().exists()
|
.venv/lib/python3.11/site-packages/ray/tune/execution/experiment_state.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fnmatch
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Callable, Dict, Optional, Union
|
| 8 |
+
|
| 9 |
+
import pyarrow.fs
|
| 10 |
+
|
| 11 |
+
from ray.train._internal.storage import (
|
| 12 |
+
StorageContext,
|
| 13 |
+
_download_from_fs_path,
|
| 14 |
+
_list_at_fs_path,
|
| 15 |
+
get_fs_and_path,
|
| 16 |
+
)
|
| 17 |
+
from ray.tune.experiment.trial import Trial
|
| 18 |
+
from ray.tune.impl.out_of_band_serialize_dataset import out_of_band_serialize_dataset
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Suffix appended to slow-sync warning messages; ``{threshold}`` is filled
# in via ``str.format`` with the active slow-sync threshold (seconds).
_SLOW_SYNC_WARNING = (
    "This could be due to a large number of trials, "
    "large logfiles from lots of reported metrics, or throttling from the "
    "remote storage if uploading too frequently.\n"
    "You may want to consider switching the `RunConfig(storage_filesystem)`"
    " to a more performant storage backend such as s3fs for a "
    "S3 storage path.\n"
    "You can suppress this error by setting the environment variable "
    "TUNE_WARN_SLOW_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a higher "
    "value than the current threshold ({threshold})."
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _find_newest_experiment_checkpoint(
    experiment_path: str, fs: Optional[pyarrow.fs.FileSystem] = None
) -> Optional[str]:
    """Returns file name of most recently created experiment checkpoint.

    Args:
        experiment_path: Local or remote path to the experiment directory
            containing at least one experiment checkpoint file.
        fs: Optional filesystem to use; resolved from the path otherwise.

    Returns:
        str: The local or remote path to the latest experiment checkpoint file
            based on timestamp. None if no experiment checkpoints were found.
    """
    # Imported locally to avoid a circular import with the controller module.
    from ray.tune.execution.tune_controller import TuneController

    fs, experiment_fs_path = get_fs_and_path(experiment_path, storage_filesystem=fs)
    # Checkpoint filenames embed a sortable timestamp, so the lexicographic
    # maximum is the most recent one.
    candidates = fnmatch.filter(
        _list_at_fs_path(fs=fs, fs_path=experiment_fs_path),
        TuneController.CKPT_FILE_TMPL.format("*"),
    )
    if not candidates:
        return None
    return Path(experiment_fs_path, max(candidates)).as_posix()
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class _ExperimentCheckpointManager:
    """Helper class for managing experiment-level checkpoints.

    This class implements the ``checkpoint()`` method used to checkpoint
    experiment state. When called, this will serialize and write to disk
    the state of the trial runner, trial executor, and search algorithm, to
    a specified checkpoint file.

    The checkpoint period is automatically adjusted to
    ``max(10, time_per_checkpoint * 19)``. This means that at most 5% of the
    time (1/20) will be used for writing checkpoints, while 95% of the time
    (19/20) will be used to handle the rest of the training loop.
    """

    def __init__(
        self,
        *,
        storage: Optional[StorageContext],
        checkpoint_period: Union[int, float, str],
        sync_every_n_trial_checkpoints: Optional[int] = None,
    ):
        # Storage context providing the staging path, fs path, and syncer.
        self._storage = storage

        # -inf so the very first checkpoint attempt is never throttled.
        self._last_save_time = float("-inf")
        # None until the first sync has been launched.
        self._last_sync_time = None

        # Dynamic checkpointing period
        self._auto_checkpoint_enabled = checkpoint_period == "auto"
        if self._auto_checkpoint_enabled:
            self._checkpoint_period = 10.0  # Initial value
        else:
            self._checkpoint_period = float(checkpoint_period)

        # TODO(justinvyu): This is a non-performant workaround to force sync
        # every num_to_keep checkpoints in order to maintain consistency
        # between the experiment state's view of the latest checkpoint,
        # and the actual latest checkpoint that was uploaded.
        self._sync_every_n_trial_checkpoints = sync_every_n_trial_checkpoints
        self._trial_num_checkpoints_since_last_sync: Dict[Trial, int] = Counter()
        self._should_force_sync_up: bool = False

        # Thresholds (seconds) controlling the two warning messages below;
        # both are overridable via environment variables.
        self._excessive_sync_threshold = float(
            os.environ.get(
                "TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S", "5"
            )
        )
        self._slow_sync_threshold = float(
            os.environ.get(
                "TUNE_WARN_SLOW_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S", "30"
            )
        )

    @property
    def auto_checkpoint_enabled(self):
        # True iff ``checkpoint_period="auto"`` was requested.
        return self._auto_checkpoint_enabled

    def _update_auto_checkpoint_time(self, time_taken: float):
        """Adapt the checkpoint period based on how long the last snapshot took."""
        if self._auto_checkpoint_enabled:
            # Multiplying this time by 19 means we spend ~5% of the time
            # writing global checkpoints and 95% of the time processing trials
            self._checkpoint_period = max(10.0, time_taken * 19)
            logger.debug(
                f"Experiment state snapshotting took "
                f"{time_taken:.2f} seconds. "
                f"Adjusting snapshotting period to "
                f"{self._checkpoint_period:.2f} seconds."
            )

    def sync_up_experiment_state(
        self,
        save_fn: Callable[[], None],
        force: bool = False,
        wait: bool = False,
    ):
        """Saves execution state to the experiment directory on the storage path.
        This includes an experiment checkpoint file that contains trial statuses
        and the searcher state.

        Overwrites the current session checkpoint, which starts when self
        is instantiated. Throttle depends on self._checkpoint_period.

        Args:
            save_fn: Function to call to actually save data to the driver
                staging path. The files in the driver staging path will be
                uploaded to the storage path.
            force: Forces an experiment checkpoint and launches a sync to storage.
                This happens regardless of checkpoint_period
            wait: Waits for the sync up to complete before returning.
        """
        driver_staging_path = self._storage.experiment_driver_staging_path

        # A pending "force" request (set by on_trial_checkpoint) also forces.
        force = force or self._should_force_sync_up

        now = time.monotonic()
        if now - self._last_save_time < self._checkpoint_period and not force:
            return

        # Checkpoint
        checkpoint_time_start = time.monotonic()

        # NOTE: This context manager is for Datasets captured in a trial config.
        # This is the case when *tuning over datasets*.
        # If the datasets have already been full executed, then serializing
        # block refs means that this checkpoint is not usable in a new Ray cluster.
        # This context will serialize the dataset execution plan instead, if available.
        with out_of_band_serialize_dataset():
            save_fn()

        def wait_for_sync():
            # Block on the syncer; failures are logged, not raised, so an
            # upload error never crashes the training loop.
            try:
                self._storage.syncer.wait()
            except Exception:
                logger.error(
                    "Saving experiment state to storage at "
                    f"'{self._storage.experiment_fs_path}' failed with exception: ",
                    exc_info=True,
                )

        # When forcing, drain any in-flight sync first so the new sync below
        # is guaranteed to launch (and time how long the drain takes).
        if force:
            start_time = time.monotonic()
            wait_for_sync()
            wait_time = time.monotonic() - start_time
            if wait_time > self._slow_sync_threshold:
                logger.warning(
                    "Saving the experiment state (which holds a global view "
                    "of trial statuses and is used to restore the experiment) "
                    f"took ~{wait_time:.2f} seconds, which may be a performance "
                    "bottleneck.\n"
                    f"{_SLOW_SYNC_WARNING.format(threshold=self._slow_sync_threshold)}"
                )

        time_since_last_sync = (
            time.monotonic() - self._last_sync_time
            if self._last_sync_time is not None
            else None
        )
        # May return False if a previous sync is still in flight.
        launched_sync = self._storage.syncer.sync_up(
            driver_staging_path, self._storage.experiment_fs_path
        )
        if launched_sync:
            if (
                time_since_last_sync is not None
                and time_since_last_sync < self._excessive_sync_threshold
                and self._should_force_sync_up
            ):
                logger.warning(
                    "Experiment state snapshotting has been triggered multiple "
                    f"times in the last {self._excessive_sync_threshold} seconds "
                    "and may become a bottleneck. "
                    "A snapshot is forced if `CheckpointConfig(num_to_keep)` is set, "
                    "and a trial has checkpointed >= `num_to_keep` times "
                    "since the last snapshot.\n"
                    "You may want to consider increasing the "
                    "`CheckpointConfig(num_to_keep)` or decreasing the frequency of "
                    "saving checkpoints.\n"
                    "You can suppress this warning by setting the environment variable "
                    "TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S "
                    "to a smaller value than the current threshold "
                    f"({self._excessive_sync_threshold}). "
                    "Set it to 0 to completely suppress this warning."
                )

            self._last_sync_time = time.monotonic()

            # We just synced, so reset the force flag
            self._trial_num_checkpoints_since_last_sync.clear()
            self._should_force_sync_up = False
        else:
            if (
                time_since_last_sync is not None
                and time_since_last_sync > self._slow_sync_threshold
            ):
                logger.warning(
                    "Saving the experiment state (which holds a global view "
                    "of trial statuses and is used to restore the experiment) "
                    f"has already taken {time_since_last_sync:.2f} seconds, "
                    "which may cause consistency issues upon restoration if your "
                    "driver script ungracefully exits.\n"
                    f"{_SLOW_SYNC_WARNING.format(threshold=self._slow_sync_threshold)}"
                )

        if wait:
            wait_for_sync()

        checkpoint_time_taken = time.monotonic() - checkpoint_time_start

        # Adjust dynamic checkpointing
        self._update_auto_checkpoint_time(time_taken=checkpoint_time_taken)

        # Finish
        self._last_save_time = time.monotonic()

    def sync_down_experiment_state(self) -> None:
        """Download experiment state files from storage to the driver staging dir."""
        fs = self._storage.storage_filesystem
        filepaths = _list_at_fs_path(fs=fs, fs_path=self._storage.experiment_fs_path)
        # TODO(ekl) we should refactor our restore code to read the necessary data
        # directly from the storage context. As a temporary hack, restore all the
        # serialized files from the root dir where other modules expect them to be.
        matches = [
            path
            for path in filepaths
            if path.endswith(".json") or path.endswith(".pkl")
        ]
        for relpath in matches:
            fs_path = Path(self._storage.experiment_fs_path, relpath).as_posix()
            local_path = Path(
                self._storage.experiment_driver_staging_path, relpath
            ).as_posix()
            _download_from_fs_path(fs=fs, fs_path=fs_path, local_path=local_path)
        logger.debug(
            f"Copied {matches} from:\n(fs, path) = "
            f"({self._storage.storage_filesystem.type_name}, "
            f"{self._storage.experiment_fs_path})\n"
            f"-> {self._storage.experiment_driver_staging_path}"
        )

    def on_trial_checkpoint(self, trial: Trial):
        """Count a trial checkpoint; force a sync after every N of them.

        No-op unless ``sync_every_n_trial_checkpoints`` was configured.
        """
        if not self._sync_every_n_trial_checkpoints:
            return

        self._trial_num_checkpoints_since_last_sync[trial] += 1

        if (
            self._trial_num_checkpoints_since_last_sync[trial]
            >= self._sync_every_n_trial_checkpoints
        ):
            self._should_force_sync_up = True
|
.venv/lib/python3.11/site-packages/ray/tune/execution/insufficient_resources_manager.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
from typing import Dict, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray.tune.execution.cluster_info import _is_ray_cluster
|
| 9 |
+
from ray.tune.experiment import Trial
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Ideally we want to use @cache; but it's only available for python 3.9.
|
| 15 |
+
# Caching is only helpful/correct for no autoscaler case.
|
| 16 |
+
@lru_cache()
def _get_cluster_resources_no_autoscaler() -> Dict:
    """Return total cluster resources, cached for the process lifetime.

    Caching is only correct when no autoscaler is running (the total cannot
    change), which is the only case in which this helper is used.
    """
    return ray.cluster_resources()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _get_trial_cpu_and_gpu(trial: Trial) -> Tuple[int, int]:
|
| 22 |
+
cpu = trial.placement_group_factory.required_resources.get("CPU", 0)
|
| 23 |
+
gpu = trial.placement_group_factory.required_resources.get("GPU", 0)
|
| 24 |
+
return cpu, gpu
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _can_fulfill_no_autoscaler(trial: "Trial") -> bool:
    """Calculates if there is enough resources for a PENDING trial.

    For no autoscaler case.
    """
    assert trial.status == Trial.PENDING
    asked_cpus, asked_gpus = _get_trial_cpu_and_gpu(trial)

    available = _get_cluster_resources_no_autoscaler()
    return (
        asked_cpus <= available.get("CPU", 0)
        and asked_gpus <= available.get("GPU", 0)
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@lru_cache()
def _get_insufficient_resources_warning_threshold() -> float:
    """Seconds to wait with no progress before warning about resources.

    Overridable via environment variables; the result is cached for the
    process lifetime.
    """
    if _is_ray_cluster():
        return float(
            os.environ.get(
                "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER", "60"
            )
        )
    else:
        # Use a generous default so that we don't prematurely determine that
        # a cluster cannot fulfill the resources requirements.
        # NOTE(review): an earlier comment here said "10s", but the actual
        # default is 60s.
        # TODO(xwjiang): Change it back once #18608 is resolved.
        return float(os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "60"))
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Warning message templates. The *_START parts take a {wait_time} placeholder;
# the *_INSUFFICIENT parts take {asked_*}/{cluster_*} placeholders.
MSG_TRAIN_START = (
    "Training has not started in the last {wait_time:.0f} seconds. "
    "This could be due to the cluster not having enough resources available. "
)
MSG_TRAIN_INSUFFICIENT = (
    "You asked for {asked_cpus} CPUs and {asked_gpus} GPUs, but the cluster only "
    "has {cluster_cpus} CPUs and {cluster_gpus} GPUs available. "
)
MSG_TRAIN_END = (
    "Stop the training and adjust the required resources (e.g. via the "
    "`ScalingConfig` or `resources_per_trial`, or `num_workers` for rllib), "
    "or add more resources to your cluster."
)

MSG_TUNE_START = (
    "No trial is running and no new trial has been started within "
    "the last {wait_time:.0f} seconds. "
    "This could be due to the cluster not having enough resources available. "
)
MSG_TUNE_INSUFFICIENT = (
    "You asked for {asked_cpus} CPUs and {asked_gpus} GPUs per trial, "
    "but the cluster only has {cluster_cpus} CPUs and {cluster_gpus} GPUs available. "
)
MSG_TUNE_END = (
    "Stop the tuning and adjust the required resources (e.g. via the "
    "`ScalingConfig` or `resources_per_trial`, or `num_workers` for rllib), "
    "or add more resources to your cluster."
)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# TODO(xwjiang): Consider having a help page with more detailed instructions.
|
| 86 |
+
@lru_cache()
def _get_insufficient_resources_warning_msg(
    for_train: bool = False, trial: Optional[Trial] = None
) -> str:
    """Assemble the full insufficient-resources warning message.

    NOTE(review): ``lru_cache`` keys on the ``trial`` argument, so cached
    entries keep those Trial objects alive for the process lifetime —
    presumably acceptable since only a representative trial is passed, but
    worth confirming.
    """
    msg = "Ignore this message if the cluster is autoscaling. "

    # Pick the template set depending on whether this is a Train or Tune run.
    if for_train:
        start = MSG_TRAIN_START
        insufficient = MSG_TRAIN_INSUFFICIENT
        end = MSG_TRAIN_END
    else:
        start = MSG_TUNE_START
        insufficient = MSG_TUNE_INSUFFICIENT
        end = MSG_TUNE_END

    msg += start.format(wait_time=_get_insufficient_resources_warning_threshold())

    # Only include concrete numbers when a representative trial is given.
    if trial:
        asked_cpus, asked_gpus = _get_trial_cpu_and_gpu(trial)
        cluster_resources = _get_cluster_resources_no_autoscaler()

        msg += insufficient.format(
            asked_cpus=asked_cpus,
            asked_gpus=asked_gpus,
            cluster_cpus=cluster_resources.get("CPU", 0),
            cluster_gpus=cluster_resources.get("GPU", 0),
        )

    msg += end

    return msg
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class _InsufficientResourcesManager:
    """Insufficient resources manager.

    Makes best effort, conservative guesses about if Tune loop is stuck due to
    infeasible resources. If so, outputs usability messages for users to
    act upon.
    """

    def __init__(self, for_train: bool = False):
        # The information tracked across the life time of Tune loop.
        # -1 is a sentinel meaning "progress was made recently".
        self._no_running_trials_since = -1
        self._last_trial_num = -1
        # Selects Train- vs Tune-flavored warning messages.
        self._for_train = for_train

    def on_no_available_trials(self, all_trials):
        """Tracks information across the life of Tune loop and makes guesses
        about if Tune loop is stuck due to infeasible resources.
        If so, outputs certain warning messages.
        The logic should be conservative, non-intrusive and informative.
        For example, rate limiting is applied so that the message is not
        spammy.
        """
        # This is approximately saying we are not making progress.
        if len(all_trials) == self._last_trial_num:
            if self._no_running_trials_since == -1:
                # Start the no-progress timer.
                self._no_running_trials_since = time.monotonic()
            elif (
                time.monotonic() - self._no_running_trials_since
                > _get_insufficient_resources_warning_threshold()
            ):
                can_fulfill_any = any(
                    trial.status == Trial.PENDING and _can_fulfill_no_autoscaler(trial)
                    for trial in all_trials
                )

                if can_fulfill_any:
                    # If one trial can be fulfilled, it will be fulfilled eventually
                    self._no_running_trials_since = -1
                    return

                # Otherwise, can fulfill none
                msg = _get_insufficient_resources_warning_msg(
                    for_train=self._for_train, trial=all_trials[0]
                )
                logger.warning(msg)
                # Restart the timer so the warning is rate-limited rather
                # than emitted on every loop iteration.
                self._no_running_trials_since = time.monotonic()
        else:
            self._no_running_trials_since = -1
        self._last_trial_num = len(all_trials)
|
.venv/lib/python3.11/site-packages/ray/tune/execution/placement_groups.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
+
|
| 4 |
+
from ray.air.execution.resources.request import ResourceRequest
|
| 5 |
+
from ray.util.annotations import DeveloperAPI, PublicAPI
|
| 6 |
+
from ray.util.placement_group import placement_group
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@PublicAPI(stability="beta")
class PlacementGroupFactory(ResourceRequest):
    """Wrapper class that creates placement groups for trials.

    This function should be used to define resource requests for Ray Tune
    trials. It holds the parameters to create
    :ref:`placement groups <ray-placement-group-doc-ref>`.
    At a minimum, this will hold at least one bundle specifying the
    resource requirements for each trial:

    .. code-block:: python

        from ray import tune

        tuner = tune.Tuner(
            tune.with_resources(
                train,
                resources=tune.PlacementGroupFactory([
                    {"CPU": 1, "GPU": 0.5, "custom_resource": 2}
                ])
            )
        )
        tuner.fit()

    If the trial itself schedules further remote workers, the resource
    requirements should be specified in additional bundles. You can also
    pass the placement strategy for these bundles, e.g. to enforce
    co-located placement:

    .. code-block:: python

        from ray import tune

        tuner = tune.Tuner(
            tune.with_resources(
                train,
                resources=tune.PlacementGroupFactory([
                    {"CPU": 1, "GPU": 0.5, "custom_resource": 2},
                    {"CPU": 2},
                    {"CPU": 2},
                ], strategy="PACK")
            )
        )
        tuner.fit()

    The example above will reserve 1 CPU, 0.5 GPUs and 2 custom_resources
    for the trainable itself, and reserve another 2 bundles of 2 CPUs each.
    The trial will only start when all these resources are available. This
    could be used e.g. if you had one learner running in the main trainable
    that schedules two remote workers that need access to 2 CPUs each.

    If the trainable itself doesn't require resources.
    You can specify it as:

    .. code-block:: python

        from ray import tune

        tuner = tune.Tuner(
            tune.with_resources(
                train,
                resources=tune.PlacementGroupFactory([
                    {},
                    {"CPU": 2},
                    {"CPU": 2},
                ], strategy="PACK")
            )
        )
        tuner.fit()

    Args:
        bundles: A list of bundles which
            represent the resources requirements.
        strategy: The strategy to create the placement group.

         - "PACK": Packs Bundles into as few nodes as possible.
         - "SPREAD": Places Bundles across distinct nodes as even as possible.
         - "STRICT_PACK": Packs Bundles into one node. The group is
           not allowed to span multiple nodes.
         - "STRICT_SPREAD": Packs Bundles across distinct nodes.
        *args: Passed to the call of ``placement_group()``
        **kwargs: Passed to the call of ``placement_group()``

    """

    def __call__(self, *args, **kwargs):
        # Deprecated entry point; kept for backward compatibility with code
        # that called the factory directly.
        warnings.warn(
            "Calling PlacementGroupFactory objects is deprecated. Use "
            "`to_placement_group()` instead.",
            DeprecationWarning,
        )
        # Kwargs bound at construction time override call-time kwargs.
        kwargs.update(self._bound.kwargs)
        # Call with bounded *args and **kwargs
        return placement_group(*self._bound.args, **kwargs)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@DeveloperAPI
def resource_dict_to_pg_factory(spec: Optional[Dict[str, float]] = None):
    """Translates resource dict into PlacementGroupFactory.

    Accepts both lower- and upper-case ``cpu``/``gpu`` keys; any remaining
    keys (or an explicit ``custom_resources`` dict) become custom resources
    in a single-bundle placement group.
    """
    working = dict(spec) if spec else {"cpu": 1}

    # Both casings are removed from the dict; the lowercase value wins
    # if both are present.
    cpus = working.pop("cpu", working.pop("CPU", 0.0))
    gpus = working.pop("gpu", working.pop("GPU", 0.0))
    memory = working.pop("memory", 0.0)

    # An explicit custom_resources mapping takes precedence as the bundle base.
    bundle = dict(working.pop("custom_resources", {}))

    # Otherwise, every leftover key is treated as a custom resource.
    if not bundle:
        bundle = working

    bundle["CPU"] = cpus
    bundle["GPU"] = gpus
    bundle["memory"] = memory

    return PlacementGroupFactory([bundle])
|
.venv/lib/python3.11/site-packages/ray/tune/execution/tune_controller.py
ADDED
|
@@ -0,0 +1,2181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import time
|
| 6 |
+
import traceback
|
| 7 |
+
import warnings
|
| 8 |
+
from collections import defaultdict, deque
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from functools import partial
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
| 13 |
+
|
| 14 |
+
import ray
|
| 15 |
+
from ray.air import ResourceRequest
|
| 16 |
+
from ray.air.constants import TIME_THIS_ITER_S
|
| 17 |
+
from ray.air.execution import PlacementGroupResourceManager, ResourceManager
|
| 18 |
+
from ray.air.execution._internal import RayActorManager, TrackedActor
|
| 19 |
+
from ray.exceptions import RayActorError, RayTaskError
|
| 20 |
+
from ray.train import CheckpointConfig
|
| 21 |
+
from ray.train._internal.session import _FutureTrainingResult, _TrainingResult
|
| 22 |
+
from ray.train._internal.storage import StorageContext
|
| 23 |
+
from ray.tune.callback import Callback, CallbackList
|
| 24 |
+
from ray.tune.error import TuneError, _AbortTrialExecution, _TuneStopTrialError
|
| 25 |
+
from ray.tune.execution.class_cache import _ActorClassCache
|
| 26 |
+
from ray.tune.execution.experiment_state import (
|
| 27 |
+
_ExperimentCheckpointManager,
|
| 28 |
+
_find_newest_experiment_checkpoint,
|
| 29 |
+
)
|
| 30 |
+
from ray.tune.execution.insufficient_resources_manager import (
|
| 31 |
+
_InsufficientResourcesManager,
|
| 32 |
+
)
|
| 33 |
+
from ray.tune.execution.placement_groups import PlacementGroupFactory
|
| 34 |
+
from ray.tune.experiment import Experiment, Trial
|
| 35 |
+
from ray.tune.experiment.trial import (
|
| 36 |
+
_change_working_directory,
|
| 37 |
+
_get_trainable_kwargs,
|
| 38 |
+
_Location,
|
| 39 |
+
_noop_logger_creator,
|
| 40 |
+
_TrialInfo,
|
| 41 |
+
)
|
| 42 |
+
from ray.tune.result import (
|
| 43 |
+
DEBUG_METRICS,
|
| 44 |
+
DEFAULT_METRIC,
|
| 45 |
+
DONE,
|
| 46 |
+
RESULT_DUPLICATE,
|
| 47 |
+
SHOULD_CHECKPOINT,
|
| 48 |
+
STDERR_FILE,
|
| 49 |
+
STDOUT_FILE,
|
| 50 |
+
TRIAL_INFO,
|
| 51 |
+
)
|
| 52 |
+
from ray.tune.schedulers import FIFOScheduler, TrialScheduler
|
| 53 |
+
from ray.tune.search import BasicVariantGenerator, SearchAlgorithm
|
| 54 |
+
from ray.tune.stopper import NoopStopper, Stopper
|
| 55 |
+
from ray.tune.tune_config import ResumeConfig
|
| 56 |
+
from ray.tune.utils import flatten_dict, warn_if_slow
|
| 57 |
+
from ray.tune.utils.log import Verbosity, _dedup_logs, has_verbosity
|
| 58 |
+
from ray.tune.utils.object_cache import _ObjectCache
|
| 59 |
+
from ray.tune.utils.resource_updater import _ResourceUpdater
|
| 60 |
+
from ray.tune.utils.serialization import TuneFunctionDecoder, TuneFunctionEncoder
|
| 61 |
+
from ray.util.annotations import DeveloperAPI
|
| 62 |
+
from ray.util.debug import log_once
|
| 63 |
+
|
| 64 |
+
logger = logging.getLogger(__name__)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@DeveloperAPI
|
| 68 |
+
class TuneController:
|
| 69 |
+
CKPT_FILE_TMPL = "experiment_state-{}.json"
|
| 70 |
+
RAISE = "RAISE"
|
| 71 |
+
|
    def __init__(
        self,
        *,
        search_alg: Optional[SearchAlgorithm] = None,
        placeholder_resolvers: Optional[Dict[Tuple, Any]] = None,
        scheduler: Optional[TrialScheduler] = None,
        stopper: Optional[Stopper] = None,
        resume_config: Optional[ResumeConfig] = None,
        fail_fast: bool = False,
        checkpoint_period: Union[str, int] = None,
        callbacks: Optional[List[Callback]] = None,
        metric: Optional[str] = None,
        trial_checkpoint_config: Optional[CheckpointConfig] = None,
        storage: Optional[StorageContext] = None,
        reuse_actors: bool = False,
        resource_manager_factory: Optional[Callable[[], ResourceManager]] = None,
        _trainer_api: bool = False,
    ):
        """Initialize the controller driving a Tune experiment.

        Sets up the actor manager, trial bookkeeping containers, result
        buffering, checkpointing, and (optionally) restores a previous run.

        Args:
            search_alg: Search algorithm producing trials. Defaults to
                ``BasicVariantGenerator``.
            placeholder_resolvers: Mapping used to resolve placeholder refs
                in trial configs before trials are added.
            scheduler: Trial scheduler. Defaults to ``FIFOScheduler``.
            stopper: Experiment-level stopper. Defaults to ``NoopStopper``.
            resume_config: If given, restore controller/trial state from the
                latest experiment checkpoint.
            fail_fast: ``False``, ``True``, or the string ``"RAISE"``
                (case-insensitive) to re-raise trial errors immediately.
            checkpoint_period: Experiment checkpoint period in seconds, or
                ``"auto"``. Defaults to env var ``TUNE_GLOBAL_CHECKPOINT_S``.
            callbacks: User callbacks, wrapped into a ``CallbackList``.
            metric: Default metric name for the run.
            trial_checkpoint_config: Per-trial checkpoint configuration.
            storage: Storage context for experiment/trial artifacts.
            reuse_actors: Whether to cache and reuse trial actors.
            resource_manager_factory: Factory for the resource manager;
                defaults to ``PlacementGroupResourceManager``.
            _trainer_api: Internal flag — whether this run comes through the
                Trainer API (affects insufficient-resource warnings).
        """
        if resource_manager_factory:
            resource_manager = resource_manager_factory()
        else:
            resource_manager = PlacementGroupResourceManager()

        self._actor_manager = RayActorManager(resource_manager=resource_manager)

        # Cache of wrapped trainable actor classes (avoids re-wrapping).
        self._class_cache = _ActorClassCache()

        # Resource status
        self._resource_updater = _ResourceUpdater(None)

        # Actor <-> Trial mappings
        self._actor_to_trial: Dict[TrackedActor, Trial] = {}
        self._trial_to_actor: Dict[Trial, TrackedActor] = {}

        # Resources <-> Trial
        self._resources_to_pending_trials: Dict[
            ResourceRequest, Set[Trial]
        ] = defaultdict(set)

        # Keep track of actor states
        self._pending_trials: Set[Trial] = set()
        self._pending_trials_list: List[Trial] = []

        self._running_trials: Set[Trial] = set()

        self._paused_trials: Set[Trial] = set()

        self._stopped_trials: Set[Trial] = set()
        self._failed_trials: Set[Trial] = set()

        self._resetting_trials: Set[Trial] = set()
        self._staged_trials: Set[Trial] = set()

        # Removed actors
        self._started_actors: Set[TrackedActor] = set()

        # Map of tracked actors -> timestamp
        # The timestamp is when we requested the stop.
        # We track these actors here to force a
        # cleanup after some time (as they might be hanging).
        # Todo: This timeout logic should be moved into the actor manager.
        # This map is populated whenever we request an actor stop:
        # - Regular STOP decision
        # - Removing an actor because its trial REUSEs a different trial's actor
        # - Removing a cached actor because it's not needed anymore
        # Actors are only tracked in this map if they actually started (not if they
        # were only requested but never started).
        # Actors are removed from this map:
        # - When the STOP resolved and the actor actually stopped
        # - When they are forcefully cleaned up after the timeout.
        self._stopping_actors: Dict[TrackedActor, float] = {}
        self._earliest_stopping_actor: float = float("inf")
        self._actor_cleanup_timeout: int = int(
            os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "600")
        )
        self._actor_force_cleanup_timeout: int = 10

        # Reuse actors
        self._reuse_actors = reuse_actors
        self._actor_cache = _ObjectCache(may_keep_one=True)

        # Trial metadata for experiment checkpoints
        self._trials_to_cache: Set[Trial] = set()
        self._trial_metadata: Dict[str, str] = {}

        # TRAINING
        # Result buffering knobs, all overridable via environment variables.
        self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
        self._buffer_min_time_s = float(os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0)
        )

        # Legacy TrialRunner init
        self._search_alg = search_alg or BasicVariantGenerator()
        self._placeholder_resolvers = placeholder_resolvers
        self._scheduler_alg = scheduler or FIFOScheduler()
        self._callbacks = CallbackList(callbacks or [])
        self._insufficient_resources_manager = _InsufficientResourcesManager(
            for_train=_trainer_api
        )
        self._pending_trial_queue_times = {}

        self._max_pending_trials = _get_max_pending_trials(self._search_alg)

        self._storage = storage
        self._metric = metric

        self._total_time = 0
        self._iteration = 0
        self._has_errored = False
        self._fail_fast = fail_fast
        if isinstance(self._fail_fast, str):
            self._fail_fast = self._fail_fast.upper()
            if self._fail_fast == self.RAISE:
                warnings.warn(
                    "fail_fast='raise' detected. Be careful when using this "
                    "mode as resources (such as Ray processes, "
                    "file descriptors, and temporary files) may not be "
                    "cleaned up properly. To use "
                    "a safer mode, use fail_fast=True."
                )
            else:
                raise ValueError(
                    "fail_fast must be one of {bool, RAISE}. " f"Got {self._fail_fast}."
                )

        self._print_trial_errors = bool(
            int(os.environ.get("TUNE_PRINT_ALL_TRIAL_ERRORS", "1"))
        )

        self._trials: List[Trial] = []
        self._live_trials: Set[Trial] = set()  # Set of non-terminated trials
        self._cached_trial_decisions = {}
        self._queued_trial_decisions = {}

        self._stop_queue = []
        self._should_stop_experiment = False  # used by TuneServer

        self._stopper = stopper or NoopStopper()

        self._start_time = time.time()

        # Human-readable session id; also keys the experiment state file name.
        self._session_str = datetime.fromtimestamp(self._start_time).strftime(
            "%Y-%m-%d_%H-%M-%S"
        )

        if checkpoint_period is None:
            checkpoint_period = os.getenv("TUNE_GLOBAL_CHECKPOINT_S", "auto")

        self._checkpoint_period = checkpoint_period
        self._trial_checkpoint_config = trial_checkpoint_config or CheckpointConfig()
        self._checkpoint_manager = self._create_checkpoint_manager()

        self._resumed = False

        if resume_config is not None:
            # Use the metadata file to restore TuneController state
            try:
                self.resume(resume_config=resume_config)
                self._resumed = True
            except Exception as e:
                if has_verbosity(Verbosity.V3_TRIAL_DETAILS):
                    logger.error(str(e))
                logger.exception("Failed to restore the run state.")
                if self._fail_fast:
                    raise
                logger.info("Restarting experiment.")
        else:
            logger.debug("Starting a new experiment.")
| 242 |
+
def _wrapped(self):
|
| 243 |
+
"""Return wrapped tune controller to be passed to scheduler/searchers."""
|
| 244 |
+
return TrialRunnerWrapper(
|
| 245 |
+
self,
|
| 246 |
+
trial_executor=_FakeRayTrialExecutor(self),
|
| 247 |
+
runner_whitelist_attr={
|
| 248 |
+
"search_alg",
|
| 249 |
+
"get_trials",
|
| 250 |
+
"get_live_trials",
|
| 251 |
+
"_set_trial_status",
|
| 252 |
+
"pause_trial",
|
| 253 |
+
"stop_trial",
|
| 254 |
+
"_schedule_trial_save",
|
| 255 |
+
},
|
| 256 |
+
executor_whitelist_attr={
|
| 257 |
+
"has_resources_for_trial",
|
| 258 |
+
"pause_trial",
|
| 259 |
+
"save",
|
| 260 |
+
"_resource_updater",
|
| 261 |
+
},
|
| 262 |
+
)
|
| 263 |
+
|
    @property
    def resumed(self):
        """Whether this controller was successfully restored from a previous run."""
        return self._resumed
    @property
    def search_alg(self):
        """The search algorithm generating trials for this run."""
        return self._search_alg
    @property
    def scheduler_alg(self):
        """The trial scheduler used by this run."""
        return self._scheduler_alg
| 276 |
+
def setup_experiments(
|
| 277 |
+
self, experiments: List[Experiment], total_num_samples: int
|
| 278 |
+
) -> None:
|
| 279 |
+
"""Obtains any necessary information from experiments.
|
| 280 |
+
|
| 281 |
+
Mainly used to setup callbacks.
|
| 282 |
+
|
| 283 |
+
Args:
|
| 284 |
+
experiments: List of Experiments
|
| 285 |
+
to use.
|
| 286 |
+
total_num_samples: Total number of samples
|
| 287 |
+
factoring in grid search samplers.
|
| 288 |
+
"""
|
| 289 |
+
experiment = experiments[0]
|
| 290 |
+
spec = experiment.public_spec if experiment else {}
|
| 291 |
+
spec["total_num_samples"] = total_num_samples
|
| 292 |
+
self._callbacks.setup(**spec)
|
| 293 |
+
|
    def end_experiment_callbacks(self) -> None:
        """Calls ``on_experiment_end`` method in callbacks.

        Invoked once at the end of the run with the full trial list.
        """
        self._callbacks.on_experiment_end(trials=self._trials)
    @property
    def experiment_state_file_name(self) -> str:
        """File name of the experiment state JSON, keyed by the session string."""
        return self.CKPT_FILE_TMPL.format(self._session_str)
| 302 |
+
@property
|
| 303 |
+
def experiment_state_path(self) -> str:
|
| 304 |
+
"""Returns the local experiment checkpoint path."""
|
| 305 |
+
return Path(
|
| 306 |
+
self._storage.experiment_driver_staging_path,
|
| 307 |
+
self.experiment_state_file_name,
|
| 308 |
+
).as_posix()
|
| 309 |
+
|
    @property
    def experiment_path(self) -> str:
        """Path of the experiment directory on the (possibly remote) storage FS."""
        return self._storage.experiment_fs_path
    def _create_checkpoint_manager(self):
        """Build the experiment checkpoint manager from current settings.

        Note: ``num_to_keep`` is passed as the trial-checkpoint sync cadence;
        presumably intentional coupling — confirm in
        ``_ExperimentCheckpointManager``.
        """
        return _ExperimentCheckpointManager(
            storage=self._storage,
            checkpoint_period=self._checkpoint_period,
            sync_every_n_trial_checkpoints=self._trial_checkpoint_config.num_to_keep,
        )
    def save_to_dir(self):
        """Save TuneController state to the local staging experiment directory.

        This includes:
        - trial states
        - TuneController internal state (all the serializable attributes)
        - the searcher state
        - the callback states
        """
        # Get state from trial executor and runner
        runner_state = {
            # Trials
            "trial_data": list(self._get_trial_checkpoints().values()),
            # Experiment data
            "runner_data": self.__getstate__(),
            # Metadata
            "stats": {"start_time": self._start_time},
        }

        driver_staging_path = self._storage.experiment_driver_staging_path
        os.makedirs(driver_staging_path, exist_ok=True)
        # TuneFunctionEncoder serializes embedded callables so the state can
        # round-trip through JSON (decoded with TuneFunctionDecoder in
        # `resume`). NOTE(review): `_get_trial_checkpoints` and `__getstate__`
        # are defined outside this view; payload shape is determined there.
        with open(
            Path(driver_staging_path, self.experiment_state_file_name),
            "w",
        ) as f:
            json.dump(runner_state, f, cls=TuneFunctionEncoder)

        self._search_alg.save_to_dir(driver_staging_path, session_str=self._session_str)
        self._callbacks.save_to_dir(driver_staging_path, session_str=self._session_str)
    def checkpoint(self, force: bool = False, wait: bool = False):
        """Checkpoint experiment state and sync it to storage.

        Args:
            force: Passed through to the checkpoint manager; presumably
                checkpoints regardless of the configured period — confirm in
                ``_ExperimentCheckpointManager.sync_up_experiment_state``.
            wait: Passed through; presumably blocks until the sync finished.
        """
        self._checkpoint_manager.sync_up_experiment_state(
            save_fn=self.save_to_dir, force=force, wait=wait
        )
    def _requeue_restored_trials(
        self, trials: List[Trial], resume_config: ResumeConfig
    ):
        """Re-add restored trials to this controller per the resume policy.

        Each trial's fate is picked from the ``resume_config`` field matching
        its saved status (``errored`` / ``finished`` / ``unfinished``):

        - RESUME: re-queue the same trial (trial ID kept), clearing recorded
          error files and setting it back to PENDING.
        - RESTART: add a fresh copy via ``trial.reset()`` with no restore path.
        - SKIP: re-add the trial marked TERMINATED so it does not run again;
          trials already in ERROR keep their ERROR status.

        Raises:
            ValueError: If an unknown resume type is encountered.
        """
        # Set trial statuses according to the resume configuration.
        # Most recently active trials are processed first.
        for trial in sorted(
            trials, key=lambda t: t.run_metadata.last_result_time, reverse=True
        ):
            if trial.status == Trial.ERROR:
                resume_type = resume_config.errored
            elif trial.status == Trial.TERMINATED:
                resume_type = resume_config.finished
            else:  # Unfinished (PENDING, RUNNING, PAUSED)
                resume_type = resume_config.unfinished

            trial_to_add = None
            if resume_type == ResumeConfig.ResumeType.RESUME:
                # Keep trial ID on resume
                trial_to_add = trial
                trial_to_add.run_metadata.error_filename = None
                trial_to_add.run_metadata.pickled_error_filename = None
                trial_to_add.set_status(Trial.PENDING)
            elif resume_type == ResumeConfig.ResumeType.RESTART:
                trial_to_add = trial.reset()
                trial_to_add.restore_path = None
            elif resume_type == ResumeConfig.ResumeType.SKIP:
                trial_to_add = trial
                if trial_to_add.status != Trial.ERROR:
                    # Set the status to terminated to skip it.
                    # Keep errored trial status as ERROR.
                    trial_to_add.set_status(Trial.TERMINATED)
            else:
                raise ValueError(f"Unknown resume type: {resume_type}")
            assert trial_to_add is not None

            self.add_trial(trial_to_add)
    def _restore_trials(self, experiment_state: Dict) -> List[Trial]:
        """Rebuild Trial objects from a decoded experiment-state dict.

        Args:
            experiment_state: Decoded experiment state; its ``trial_data``
                entries are ``(json_state, runtime_metadata)`` pairs.

        Returns:
            The restored trials, with their storage contexts re-pointed at
            this controller's current storage location.
        """
        trials = []
        for trial_json_state, trial_runtime_metadata in experiment_state["trial_data"]:
            trial = Trial.from_json_state(trial_json_state)
            trial.restore_run_metadata(trial_runtime_metadata)

            # The following properties may be updated on restoration
            # Ex: moved local/cloud experiment directory

            # Propagate updated storage ctx properties to the trial's restored copy.
            # Shallow copy is intentional: only top-level attributes change.
            new_storage = copy.copy(trial.storage)
            new_storage.storage_filesystem = self._storage.storage_filesystem
            new_storage.storage_fs_path = self._storage.storage_fs_path
            new_storage.experiment_dir_name = self._storage.experiment_dir_name

            # ATTN: `trial.set_storage` is used intentionally, since it
            # also updates the absolute paths and filesystem of tracked checkpoints.
            trial.set_storage(new_storage)

            # Avoid creating logdir in client mode for returned trial results,
            # since the dir might not be creatable locally.
            # TODO(ekl) this is kind of a hack.
            if not ray.util.client.ray.is_connected():
                trial.init_local_path()  # Create logdir if it does not exist

            trials.append(trial)

        # NOTE: The restored run should reuse the same driver staging directory.
        # NOTE(review): assumes at least one restored trial exists — an empty
        # `trial_data` list would raise IndexError here; confirm upstream.
        self._storage._timestamp = trials[0].storage._timestamp

        return trials
    def resume(self, resume_config: ResumeConfig):
        """Resumes all checkpointed trials from previous run.

        Requires user to manually re-register their objects. Also stops
        all ongoing trials.

        Steps: (1) restore controller state from the newest experiment state
        file, (2) rebuild trials, (3) restore searcher/callback state from
        the synced-down staging dir, (4) re-queue trials per ``resume_config``.

        Raises:
            ValueError: If no experiment state file is found in the
                experiment directory.
        """
        # 1. Restore TuneController state
        # Find newest state file
        newest_state_path = _find_newest_experiment_checkpoint(
            self._storage.experiment_fs_path, fs=self._storage.storage_filesystem
        )

        if newest_state_path is None:
            raise ValueError(
                f"Tried to resume experiment from directory "
                f"'{self._storage.experiment_fs_path}', but no "
                f"experiment state file of the form '{TuneController.CKPT_FILE_TMPL}' "
                "was found. This is expected if you are launching a new experiment."
            )

        logger.info(
            "Restoring the run from the latest experiment state file: "
            f"{Path(newest_state_path).name}"
        )
        # State was written with TuneFunctionEncoder (see `save_to_dir`), so
        # decode with the matching TuneFunctionDecoder.
        with self._storage.storage_filesystem.open_input_stream(newest_state_path) as f:
            experiment_state = json.loads(f.readall(), cls=TuneFunctionDecoder)

        self.__setstate__(experiment_state["runner_data"])

        # 2. Get the trial states that the run left off at.
        trials = self._restore_trials(experiment_state)

        # 3. Restore search algorithm and callback state
        # Download the search algorithm and callback state to the driver staging dir.
        self._checkpoint_manager.sync_down_experiment_state()

        driver_staging_dir = self._storage.experiment_driver_staging_path
        if self._search_alg.has_checkpoint(driver_staging_dir):
            self._search_alg.restore_from_dir(driver_staging_dir)

        if self._callbacks.can_restore(driver_staging_dir):
            self._callbacks.restore_from_dir(driver_staging_dir)

        # 4. Re-queue trials as needed, depending on their status.
        self._requeue_restored_trials(trials, resume_config)
| 470 |
+
def update_max_pending_trials(self, max_pending_trials: Optional[int] = None):
|
| 471 |
+
self._max_pending_trials = max_pending_trials or _get_max_pending_trials(
|
| 472 |
+
self._search_alg
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
def update_pending_trial_resources(
|
| 476 |
+
self, resources: Union[dict, PlacementGroupFactory]
|
| 477 |
+
):
|
| 478 |
+
"""Update trial resources when resuming from checkpoint.
|
| 479 |
+
|
| 480 |
+
Only updating the pending ones.
|
| 481 |
+
"""
|
| 482 |
+
assert resources
|
| 483 |
+
if isinstance(resources, dict) and "gpu" not in resources:
|
| 484 |
+
resources["gpu"] = 0
|
| 485 |
+
for trial in self._trials:
|
| 486 |
+
if trial.status == Trial.PENDING:
|
| 487 |
+
trial.update_resources(resources=resources)
|
| 488 |
+
|
| 489 |
+
def is_finished(self):
    """Returns whether all trials have finished running."""
    # Check the (usually small) live set first for a fast negative answer;
    # only then confirm with a full pass over all trials and the searcher.
    if self._live_trials and not all(
        t.is_finished() for t in self._live_trials
    ):
        return False
    if not all(t.is_finished() for t in self._trials):
        return False
    return self._search_alg.is_finished()
|
| 500 |
+
|
| 501 |
+
def get_trial(self, tid):
    """Return the first trial with trial id ``tid``, or None if absent."""
    for candidate in self._trials:
        if candidate.trial_id == tid:
            return candidate
    return None
|
| 504 |
+
|
| 505 |
+
def get_trials(self):
    """Returns the list of trials managed by this TrialRunner.

    Note that the caller usually should not mutate trial state directly.
    """
    # NOTE(review): this returns the internal list itself, not a copy.
    return self._trials
|
| 511 |
+
|
| 512 |
+
def get_live_trials(self):
    """Returns the set of trials that are not in Trial.TERMINATED state."""
    # NOTE(review): this returns the internal set itself, not a copy.
    return self._live_trials
|
| 515 |
+
|
| 516 |
+
def add_trial(self, trial: Trial):
    """Adds a new trial to this TrialRunner.

    Trials may be added at any time.

    Args:
        trial: Trial to queue.
    """
    # If the config map has had all the references replaced with placeholders,
    # resolve them before adding the trial.
    if self._placeholder_resolvers:
        trial.resolve_config_placeholders(self._placeholder_resolvers)

    # With trial.config resolved, create placement group factory if needed.
    trial.create_placement_group_factory()

    self._trials.append(trial)
    if trial.status != Trial.TERMINATED:
        self._live_trials.add(trial)
    with warn_if_slow("scheduler.on_trial_add"):
        self._scheduler_alg.on_trial_add(self._wrapped(), trial)
    # Trial metadata must be flushed on the next experiment checkpoint.
    self._mark_trial_to_checkpoint(trial)

    logger.debug(f"Adding trial {trial} with status {trial.status}")

    # Per-status sets for O(1) membership checks elsewhere in the controller.
    status_str_map = {
        Trial.PENDING: self._pending_trials,
        Trial.RUNNING: self._running_trials,
        Trial.PAUSED: self._paused_trials,
        Trial.TERMINATED: self._stopped_trials,
        Trial.ERROR: self._failed_trials,
    }

    status_str_map[trial.status].add(trial)

    if trial.status == Trial.PENDING:
        # List preserves FIFO order; set keyed by resources enables fast
        # matching of pending trials to cached actors.
        self._pending_trials_list.append(trial)
        self._resources_to_pending_trials[trial.placement_group_factory].add(trial)
|
| 554 |
+
|
| 555 |
+
def _update_trial_queue(self, blocking: bool = False, timeout: int = 600) -> bool:
|
| 556 |
+
"""Adds next trials to queue if possible.
|
| 557 |
+
|
| 558 |
+
Note that the timeout is currently unexposed to the user.
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
blocking: Blocks until either a trial is available
|
| 562 |
+
or is_finished (timeout or search algorithm finishes).
|
| 563 |
+
timeout: Seconds before blocking times out.
|
| 564 |
+
|
| 565 |
+
Returns:
|
| 566 |
+
Boolean indicating if a new trial was created or not.
|
| 567 |
+
"""
|
| 568 |
+
trial = self._search_alg.next_trial()
|
| 569 |
+
if blocking and not trial:
|
| 570 |
+
start = time.time()
|
| 571 |
+
# Checking `is_finished` instead of _search_alg.is_finished
|
| 572 |
+
# is fine because blocking only occurs if all trials are
|
| 573 |
+
# finished and search_algorithm is not yet finished
|
| 574 |
+
while (
|
| 575 |
+
not trial and not self.is_finished() and time.time() - start < timeout
|
| 576 |
+
):
|
| 577 |
+
logger.debug("Blocking for next trial...")
|
| 578 |
+
trial = self._search_alg.next_trial()
|
| 579 |
+
time.sleep(1)
|
| 580 |
+
|
| 581 |
+
if trial:
|
| 582 |
+
self.add_trial(trial)
|
| 583 |
+
return True
|
| 584 |
+
|
| 585 |
+
return False
|
| 586 |
+
|
| 587 |
+
def _used_resources_string(self) -> str:
|
| 588 |
+
allocated_resources = self._actor_manager.get_live_actors_resources()
|
| 589 |
+
|
| 590 |
+
return self._resource_updater.debug_string(allocated_resources)
|
| 591 |
+
|
| 592 |
+
def on_step_begin(self):
    """Refresh the cluster resource snapshot before a control-loop step."""
    self._resource_updater.update_avail_resources()
|
| 594 |
+
|
| 595 |
+
def on_step_end(self):
    """Opportunistically clean up cached and stopping actors after a step."""
    # force_all=False: only evict/kill what is already eligible.
    self._cleanup_cached_actors(force_all=False)
    self._cleanup_stopping_actors(force_all=False)
|
| 598 |
+
|
| 599 |
+
def _cleanup_cached_actors(self, force_all: bool = False):
    """Evict cached (reusable) actors that are no longer needed.

    Args:
        force_all: If True, evict every cached actor regardless of demand.
    """
    if (
        self._search_alg.is_finished()
        and not self._staged_trials
        and self._actor_cache.total_max_objects == 0
    ):
        # If there are no more trials coming in, no trials are pending execution,
        # and we don't explicitly want to cache objects, we can evict the full
        # cache.
        force_all = True

    for tracked_actor in self._actor_cache.flush_cached_objects(
        force_all=force_all
    ):
        logger.debug(f"Cleaning up cached actor: {tracked_actor}")
        # Unset termination callbacks as no trial is associated
        tracked_actor.set_on_stop(None)
        tracked_actor.set_on_error(None)
        self._remove_actor(tracked_actor=tracked_actor)
|
| 618 |
+
|
| 619 |
+
def _cleanup_stopping_actors(self, force_all: bool = False):
    """Force-kill actors whose graceful stop has timed out.

    Args:
        force_all: If True, process all stopping actors now instead of only
            those past ``self._actor_cleanup_timeout``.
    """
    now = time.monotonic()

    if (
        not force_all
        and now - self._earliest_stopping_actor <= self._actor_cleanup_timeout
    ):
        # If the earliest actor to timeout has not reached the timeout, return
        return

    # This is a bit costly, so we want to avoid running it too often
    # (oldest stop request first).
    times = deque(
        sorted(
            [
                (timestamp, tracked_actor)
                for tracked_actor, timestamp in self._stopping_actors.items()
            ],
            key=lambda item: item[0],
        )
    )

    while times and (
        force_all or time.monotonic() - times[0][0] > self._actor_cleanup_timeout
    ):
        if (
            time.monotonic() - times[0][0] < self._actor_force_cleanup_timeout
        ) and self._actor_manager.is_actor_started(tracked_actor=times[0][1]):
            # Even if force_all=True, we give the actors time to clean up
            self._actor_manager.next(timeout=1)
            continue

        _, tracked_actor = times.popleft()

        if tracked_actor not in self._stopping_actors:
            # Actor stopping has been handled by the block above
            continue

        if self._actor_manager.is_actor_started(tracked_actor=tracked_actor):
            logger.debug(f"Forcefully killing actor: {tracked_actor}")
            self._actor_manager.remove_actor(tracked_actor=tracked_actor, kill=True)
        self._stopping_actors.pop(tracked_actor)

    # Remember the next deadline so the early-return above stays cheap.
    if times:
        self._earliest_stopping_actor = times[0][0]
    else:
        self._earliest_stopping_actor = float("inf")
|
| 665 |
+
|
| 666 |
+
def step(self):
    """Run one iteration of the experiment control loop.

    Order matters: ask the searcher for trials, start actors, process one
    actor-manager event, maybe stop the experiment, then checkpoint.

    Raises:
        TuneError: If called after all trials have finished.
    """
    if self.is_finished():
        raise TuneError("Called step when all trials finished?")

    with warn_if_slow("on_step_begin"):
        self.on_step_begin()

    with warn_if_slow("callbacks.on_step_begin"):
        self._callbacks.on_step_begin(
            iteration=self._iteration, trials=self._trials
        )

    # Ask searcher for more trials
    self._maybe_update_trial_queue()

    # Start actors for added trials
    self._maybe_add_actors()

    # Handle one event
    if not self._actor_manager.next(timeout=0.1):
        # If there are no actors running, warn about potentially
        # insufficient resources
        if not self._actor_manager.num_live_actors:
            self._insufficient_resources_manager.on_no_available_trials(
                self.get_trials()
            )

    # Maybe stop whole experiment
    self._stop_experiment_if_needed()

    # Maybe save experiment state
    try:
        self.checkpoint()
    except Exception as e:
        # Log before re-raising so the failure cause reaches the driver log.
        logger.warning(f"Trial controller checkpointing failed: {str(e)}")
        raise e

    self._iteration += 1

    with warn_if_slow("on_step_end"):
        self.on_step_end()
    with warn_if_slow("callbacks.on_step_end"):
        self._callbacks.on_step_end(iteration=self._iteration, trials=self._trials)
|
| 709 |
+
|
| 710 |
+
def _set_trial_status(self, trial: Trial, status: str):
    """Set trial to a specific status.

    This will keep track of trials with specific statuses in sets.

    For PENDING and PAUSED trials we also keep a list of trials to be able
    to retain FIFO ordering. See ``_maybe_add_actors`` for details.

    Lastly we also keep a mapping from resources to pending/paused trials
    to be able to efficiently start trials for cached actors.
    """
    current_status = trial.status

    if current_status == status:
        logger.debug(f"Trial {trial} already has status {status}. Skipping update.")
        return

    status_str_map = {
        Trial.PENDING: self._pending_trials,
        Trial.RUNNING: self._running_trials,
        Trial.PAUSED: self._paused_trials,
        Trial.TERMINATED: self._stopped_trials,
        Trial.ERROR: self._failed_trials,
    }

    logger.debug(
        f"Setting status for trial {trial} from {current_status} to {status}"
    )

    # Internal-consistency checks: the trial must be tracked under its
    # current status and not yet under the new one.
    assert trial in status_str_map[current_status], (trial, current_status)
    assert trial not in status_str_map[status], (trial, status)

    status_str_map[current_status].remove(trial)
    status_str_map[status].add(trial)

    # We keep a log for pending trials for FIFO scheduling.
    # We do not need to remove from this list as we will just discard
    # items that are in this list but not in the respective set.
    if status == Trial.PENDING:
        self._pending_trials_list.append(trial)
        self._resources_to_pending_trials[trial.placement_group_factory].add(trial)
    else:
        self._resources_to_pending_trials[trial.placement_group_factory].discard(
            trial
        )

    trial.set_status(status)
|
| 757 |
+
|
| 758 |
+
def _get_trial_checkpoints(self) -> Dict[str, str]:
|
| 759 |
+
for trial in self._trials_to_cache:
|
| 760 |
+
self._trial_metadata[trial.trial_id] = trial.get_json_state()
|
| 761 |
+
self._trials_to_cache.clear()
|
| 762 |
+
return self._trial_metadata
|
| 763 |
+
|
| 764 |
+
def _mark_trial_to_checkpoint(self, trial: Trial):
    """Mark a trial's metadata dirty so it is serialized on the next save."""
    self._trials_to_cache.add(trial)
|
| 766 |
+
|
| 767 |
+
###
|
| 768 |
+
# UPDATE TRIALS
|
| 769 |
+
def _maybe_update_trial_queue(self):
|
| 770 |
+
"""Ask the searcher for more trials."""
|
| 771 |
+
if self._search_alg.is_finished():
|
| 772 |
+
return
|
| 773 |
+
|
| 774 |
+
dont_wait_for_trial = (
|
| 775 |
+
self._pending_trials or self._running_trials or self._paused_trials
|
| 776 |
+
)
|
| 777 |
+
|
| 778 |
+
while len(self._pending_trials) < self._max_pending_trials:
|
| 779 |
+
if not self._update_trial_queue(blocking=not dont_wait_for_trial):
|
| 780 |
+
break
|
| 781 |
+
dont_wait_for_trial = True
|
| 782 |
+
|
| 783 |
+
def _cleanup_trials(self):
    """Stop all trial actors and tear down the actor manager at experiment end."""
    logger.debug("CLEANING UP all trials")

    # Snapshot keys: _schedule_trial_stop mutates _actor_to_trial.
    for tracked_actor in list(self._actor_to_trial):
        trial = self._actor_to_trial[tracked_actor]
        logger.debug(
            f"Scheduling trial stop at end of experiment (trial {trial}): "
            f"{tracked_actor}"
        )
        self._schedule_trial_stop(trial)

    # Clean up cached actors now
    self._cleanup_cached_actors(force_all=True)

    # Give actors up to 5 seconds of event processing to shut down gracefully.
    start = time.monotonic()
    while time.monotonic() - start < 5 and self._actor_manager.num_total_actors:
        if _dedup_logs("actor_manager_cleanup", str(start)):
            logger.debug(
                "Waiting for actor manager to clean up final state [dedup]"
            )
        self._actor_manager.next(timeout=1)

    logger.debug("Force cleanup of remaining actors")
    self._cleanup_stopping_actors(force_all=True)

    self._actor_manager.cleanup()
|
| 809 |
+
|
| 810 |
+
def _remove_actor(self, tracked_actor: TrackedActor):
    """Schedule a graceful stop for an actor and track it for cleanup."""
    # Ask the actor to stop itself; keep the future so removal can await it.
    stop_future = self._actor_manager.schedule_actor_task(
        tracked_actor, "stop", _return_future=True
    )
    now = time.monotonic()

    if self._actor_manager.remove_actor(
        tracked_actor, kill=False, stop_future=stop_future
    ):
        # If the actor was previously alive, track
        self._stopping_actors[tracked_actor] = now
        self._earliest_stopping_actor = min(self._earliest_stopping_actor, now)
|
| 822 |
+
|
| 823 |
+
###
|
| 824 |
+
# ADD ACTORS
|
| 825 |
+
def _maybe_add_actors(self) -> None:
    """Add actors for pending and paused trials.

    For actors that have not been staged, yet, we request an actor.

    For actors that have been staged, already, we try to reuse a cached actor.

    First, we handle the trial that the scheduler chooses to run.

    Then, we handle all trials that are pending.

    Lastly, we see if we have cached actors that we can assign to a pending or
    paused trial. This can be the case when a trial has not been staged, yet,
    for instance because the number of staging trials was too large.
    """

    ###
    # 1: Start trial that the scheduler wants to run
    with warn_if_slow("choose_trial_to_run"):
        trial_to_run = self._scheduler_alg.choose_trial_to_run(self._wrapped())

    if trial_to_run:
        if _dedup_logs("trial_to_run_chosen", trial_to_run.trial_id):
            logger.debug(
                f"Chose trial to run from scheduler: {trial_to_run} [dedup]"
            )
        if (
            trial_to_run not in self._staged_trials
            and trial_to_run not in self._trial_to_actor
        ):
            logger.debug(f"Staging trial to run: {trial_to_run}")
            self._set_trial_status(trial_to_run, Trial.PENDING)
            self._staged_trials.add(trial_to_run)
            self._actor_cache.increase_max(trial_to_run.placement_group_factory)
            # schedule_trial_actor also potentially uses cached actors
            self._schedule_trial_actor(trial_to_run)
        else:
            # Otherwise, only try to use the cached actor
            if _dedup_logs("trial_to_run_reuse", trial_to_run.trial_id):
                logger.debug(
                    f"Trying to re-use actor for trial to run: {trial_to_run} "
                    f"[dedup]"
                )
            self._maybe_reuse_cached_actor(trial_to_run)

    ###
    # 2: Start trials that are PENDING
    # Local helper (intentionally shadows the method name); processes the
    # FIFO list of pending trials and returns the remaining candidates.
    def _maybe_add_actors(candidates: List[Trial]):
        new_candidates = []

        while candidates:
            if self._actor_manager.num_pending_actors >= self._max_pending_trials:
                break

            trial = candidates.pop(0)

            # If the trial is part of the list, but not of the set,
            # we just ignore it. Removing it from the list on status
            # change is too expensive.
            if trial not in self._pending_trials:
                continue

            if trial in self._trial_to_actor:
                new_candidates.append(trial)
                continue

            if trial in self._staged_trials:
                self._maybe_reuse_cached_actor(trial)
                continue

            logger.debug(f"Scheduling actor for enqueued trial: {trial}")
            self._staged_trials.add(trial)
            self._actor_cache.increase_max(trial.placement_group_factory)
            self._schedule_trial_actor(trial)

        return new_candidates + candidates

    self._pending_trials_list = _maybe_add_actors(self._pending_trials_list)

    ###
    # 3: Start any trial that can be started with a cached actor
    if self._actor_cache.num_cached_objects:
        for resource in self._resources_to_pending_trials:
            if not self._resources_to_pending_trials[resource]:
                continue

            if not self._actor_cache.has_cached_object(resource):
                continue

            start_trial = self._resources_to_pending_trials[resource].pop()
            logger.debug(
                f"Trying to re-use actor for enqueued trial: {start_trial}"
            )
            if not self._maybe_reuse_cached_actor(start_trial):
                # Reuse failed: put the trial back for a later attempt.
                self._resources_to_pending_trials[resource].add(start_trial)
            else:
                if start_trial not in self._staged_trials:
                    self._staged_trials.add(start_trial)
                    self._actor_cache.increase_max(
                        start_trial.placement_group_factory
                    )
|
| 926 |
+
|
| 927 |
+
def _maybe_reuse_cached_actor(self, trial: Trial) -> bool:
    """Maybe reuse a cached actor for a trial.

    If an actor has been scheduled for the trial already,
    this will remove the original actor.

    Returns:
        True if a cached actor was assigned (or a reset is already in
        flight for this trial), False if no suitable cached actor exists.
    """
    if trial in self._resetting_trials:
        # A reset to reuse an actor for this trial is already in progress.
        return True

    resource_request = trial.placement_group_factory

    if not self._actor_cache.has_cached_object(resource_request):
        return False

    cached_actor = self._actor_cache.pop_cached_object(resource_request)
    logger.debug(f"Reusing ACTOR for trial {trial}: {cached_actor}")

    if trial in self._trial_to_actor:
        original_actor = self._trial_to_actor.pop(trial)
        self._actor_to_trial.pop(original_actor)

        logger.debug(f"Removing ORIGINAL ACTOR for trial {trial}: {original_actor}")
        self._remove_actor(tracked_actor=original_actor)

    self._trial_to_actor[trial] = cached_actor
    self._actor_to_trial[cached_actor] = trial

    # Todo: get rid of Trial.runner
    ray_actor = self._actor_manager._live_actors_to_ray_actors_resources[
        cached_actor
    ][0]
    trial.set_ray_actor(ray_actor)

    # The reused actor must be reset to this trial's config before training.
    self._schedule_trial_reset(trial, trial.config, trial.experiment_tag)

    return True
|
| 963 |
+
|
| 964 |
+
def _schedule_trial_actor(self, trial: Trial):
    """Schedule an actor for a trial.

    If a cached actor is available, use it. Otherwise, request a
    new actor.

    Raises:
        RuntimeError: If an actor is already associated with the trial.
    """
    logger.debug(f"Trying to schedule new ACTOR for trial {trial}")

    assert trial.status == Trial.PENDING

    trial.init_local_path()
    # We checkpoint metadata here to try mitigating logdir duplication
    self._mark_trial_to_checkpoint(trial)

    if self._maybe_reuse_cached_actor(trial):
        return

    # Safeguard
    if trial in self._trial_to_actor:
        raise RuntimeError(
            f"Tried to request a new actor for trial {trial}, but an old "
            f"actor still exists. This can lead to leaked resources. The old "
            f"actor should be removed first. "
            f"This is an internal problem in Ray Tune. If you encounter this "
            f"error, please raise an issue on "
            f"https://github.com/ray-project/ray/issues"
        )

    trainable_cls = trial.get_trainable_cls()
    if not trainable_cls:
        # Unknown trainable: abort this trial but keep the experiment going.
        exception = _AbortTrialExecution(
            f"Invalid trainable: {trial.trainable_name}. If you passed "
            f"a string, make sure the trainable was registered before."
        )
        trial.handle_error(exception)
        self._schedule_trial_stop(trial, exception=exception)
        return

    _actor_cls = self._class_cache.get(trainable_cls)

    trial.set_location(_Location())
    trainable_kwargs = _get_trainable_kwargs(trial=trial)

    with _change_working_directory(trial):
        tracked_actor = self._actor_manager.add_actor(
            cls=_actor_cls,
            resource_request=trial.placement_group_factory,
            kwargs=trainable_kwargs,
            on_start=self._actor_started,
            on_stop=self._actor_stopped,
            on_error=self._actor_failed,
        )
        self._trial_to_actor[trial] = tracked_actor
        self._actor_to_trial[tracked_actor] = trial

    logger.debug(
        f"Scheduled new ACTOR for trial {trial}: {tracked_actor}. "
        f"Resources: {trial.placement_group_factory}"
    )
|
| 1023 |
+
|
| 1024 |
+
def _unstage_trial_with_resources(self, trial: Trial):
    """Unstage trial, or one with the same resources as ``trial``."""
    # Case 1: The trial we started was staged. Just remove it.
    if trial in self._staged_trials:
        self._staged_trials.discard(trial)
        self._actor_cache.decrease_max(trial.placement_group_factory)
        return

    # Case 2: We staged a trial "A" with the same resources, but our trial
    # "B" was selected by the scheduler to run. The resource manager only
    # tracks resources, not trials, so remove any staged trial with the
    # same resource requirements instead.
    wanted = trial.placement_group_factory
    substitute = next(
        (
            staged
            for staged in self._staged_trials
            if staged.placement_group_factory == wanted
        ),
        None,
    )
    if substitute is not None:
        self._staged_trials.discard(substitute)
        self._actor_cache.decrease_max(substitute.placement_group_factory)
        return

    raise RuntimeError(
        "Started a trial with resources requested by a different trial, but "
        "this trial was lost. This is an error in Ray Tune's execution "
        "logic. Please raise a GitHub issue at "
        "https://github.com/ray-project/ray/issues"
    )
|
| 1057 |
+
|
| 1058 |
+
def _maybe_cache_trial_actor(self, trial: Trial) -> bool:
    """Cache trial actor for reuse, if needed.

    We will only cache as many actors as are needed to fulfill any pending
    resource requests for actors with the same resource requirements.
    E.g. if we have 6 running trials and 4 additional staged actors, we will only
    cache up to 4 of the running trial actors when they finish.

    One exception is the case when we have no cached actors, yet. In that case,
    we will always cache the actor in this method.

    Later, in `_cleanup_cached_actors`, we will check again if we need this cached
    actor. That method will keep the actor if we don't have any staged trials,
    because we don't know at that point if the next trial might require the same
    resources. But because there is no staged trial, it is safe to keep the actor
    around, as it won't occupy resources needed by another trial until it's staged.

    Returns:
        True if the actor was cached, False otherwise.
    """
    if not self._reuse_actors:
        return False

    if self._search_alg.is_finished() and not self._staged_trials:
        logger.debug(
            f"Not caching actor of trial {trial} as the search is over "
            f"and no more trials are staged."
        )
        return False

    tracked_actor = self._trial_to_actor[trial]

    if (
        not self._actor_manager.is_actor_started(tracked_actor)
        or self._actor_manager.is_actor_failed(tracked_actor)
        or tracked_actor not in self._started_actors
    ):
        logger.debug(
            f"Not caching actor of trial {trial} as it has not been started, yet: "
            f"{tracked_actor}"
        )
        return False

    if not self._actor_cache.cache_object(
        trial.placement_group_factory, tracked_actor
    ):
        logger.debug(
            f"Could not cache actor of trial {trial} for "
            "reuse, as there are no pending trials "
            "requiring its resources."
        )
        return False

    logger.debug(f"Caching actor of trial {trial} for re-use: {tracked_actor}")

    # Detach the actor from the trial; it now belongs to the cache.
    tracked_actor = self._trial_to_actor.pop(trial)
    self._actor_to_trial.pop(tracked_actor)

    trial.set_ray_actor(None)

    return True
|
| 1116 |
+
|
| 1117 |
+
def _actor_started(self, tracked_actor: TrackedActor, log: str = "STARTED"):
    """Callback invoked when a trial's actor has started.

    Args:
        tracked_actor: The actor that came up.
        log: Label used in the debug log message.
    """
    self._started_actors.add(tracked_actor)

    trial = self._actor_to_trial[tracked_actor]

    logger.debug(f"Actor {log} for trial {trial}: {tracked_actor}")

    # The trial now holds real resources; it no longer counts as staged.
    self._unstage_trial_with_resources(trial)

    ray_actor = self._actor_manager._live_actors_to_ray_actors_resources[
        tracked_actor
    ][0]
    trial.set_ray_actor(ray_actor)

    self._callbacks.on_trial_start(
        iteration=self._iteration, trials=self._trials, trial=trial
    )

    self._set_trial_status(trial, Trial.RUNNING)

    self._mark_trial_to_checkpoint(trial)

    # Restore from a checkpoint if one exists; otherwise start training.
    if not self._schedule_trial_restore(trial):
        self._schedule_trial_train(trial)
|
| 1141 |
+
|
| 1142 |
+
def _actor_stopped(self, tracked_actor: TrackedActor):
    """Callback invoked when an actor has stopped: detach it from all state."""
    associated_trial = self._actor_to_trial.pop(tracked_actor, None)
    if associated_trial is not None:
        logger.debug(f"Actor STOPPED for trial {associated_trial}: {tracked_actor}")
        self._trial_to_actor.pop(associated_trial)
        associated_trial.set_ray_actor(None)

    logger.debug(f"Actor STOPPED: {tracked_actor}")

    self._stopping_actors.pop(tracked_actor, None)
    self._started_actors.discard(tracked_actor)
|
| 1153 |
+
|
| 1154 |
+
def _actor_failed(self, tracked_actor: TrackedActor, exception: Exception):
    """Callback invoked when an actor failed: record the failure and clean up."""
    trial = self._actor_to_trial[tracked_actor]

    logger.debug(
        f"Actor FAILED for trial {trial}: {tracked_actor}. "
        f"Exception: {exception}"
    )

    if trial in (self._pending_trials | self._paused_trials):
        # First, set to running (needed downstream in _process_trial_failure)
        self._set_trial_status(trial, Trial.RUNNING)

        logger.debug(
            f"Trial {trial} failed in its creation task. Unstaging "
            f"to allow it to be re-scheduled."
        )

        self._unstage_trial_with_resources(trial)
        self._trial_task_failure(trial, exception=exception)

    self._actor_manager.clear_actor_task_futures(tracked_actor)

    # Clean up actor
    tracked_actor.set_on_stop(None)
    tracked_actor.set_on_error(None)
    self._actor_manager.remove_actor(tracked_actor, kill=False)

    # Trigger actor stopped callback
    self._actor_stopped(tracked_actor)
|
| 1183 |
+
|
| 1184 |
+
def _schedule_trial_task(
    self,
    trial: Trial,
    method_name: str,
    args: Optional[Tuple] = None,
    kwargs: Optional[Dict] = None,
    on_result: Optional[Callable[[Trial, Any], None]] = None,
    on_error: Optional[Callable[[Trial, Exception], None]] = None,
    _return_future: bool = False,
) -> Optional[ray.ObjectRef]:
    """Schedule an actor task future for a trial.

    This is a wrapper around ``ActorManager.schedule_actor_task``. This method
    retrieves the tracked actor for a trial to kick off the task.

    It also wraps around the callbacks, retrieving the trial object given the
    tracked actor.

    Args:
        trial: Trial whose tracked actor should run the task.
        method_name: Name of the remote method to invoke.
        args: Positional arguments for the remote method.
        kwargs: Keyword arguments for the remote method.
        on_result: Called with ``(trial, *result)`` when the future resolves.
        on_error: Called with ``(trial, exception)`` when the future fails.
        _return_future: If True, return the underlying object reference.

    Returns:
        The scheduled future if ``_return_future`` is True, else None.
    """

    tracked_actor = self._trial_to_actor[trial]

    _on_result = None
    _on_error = None

    args = args or tuple()
    kwargs = kwargs or {}

    if on_result:

        def _on_result(tracked_actor: TrackedActor, *args, **kwargs):
            assert trial == self._actor_to_trial[tracked_actor]
            logger.debug(
                f"Future {method_name.upper()} RESOLVED for trial {trial}: "
                f"{args}, {kwargs}"
            )
            try:
                on_result(trial, *args, **kwargs)
            except Exception as e:
                logger.debug(
                    f"Error handling {method_name.upper()} result "
                    f"for trial {trial}: {e}"
                )
                # BUG FIX: was ``e is TuneError``, which compares the
                # exception *instance* to the *class* and is always False,
                # so TuneErrors were needlessly re-wrapped.
                if isinstance(e, TuneError) or self._fail_fast == self.RAISE:
                    raise e
                else:
                    raise TuneError(traceback.format_exc())

    if on_error:

        def _on_error(tracked_actor: TrackedActor, exception: Exception):
            # If the actor failed, it has already been cleaned up.
            if tracked_actor not in self._actor_to_trial:
                assert isinstance(exception, RayActorError), type(exception)
            else:
                assert trial == self._actor_to_trial[tracked_actor]

            logger.debug(
                f"Future {method_name.upper()} FAILED for trial {trial}: "
                f"{exception}"
            )
            try:
                on_error(trial, exception)
            except Exception as e:
                logger.debug(
                    f"Error handling {method_name.upper()} failure "
                    f"for trial {trial}: {e}"
                )
                # Same fix as in _on_result above.
                if isinstance(e, TuneError) or self._fail_fast == self.RAISE:
                    raise e
                else:
                    raise TuneError(traceback.format_exc())

    logger.debug(f"Future {method_name.upper()} SCHEDULED for trial {trial}")

    with _change_working_directory(trial):
        future = self._actor_manager.schedule_actor_task(
            tracked_actor=tracked_actor,
            method_name=method_name,
            args=args,
            kwargs=kwargs,
            on_result=_on_result,
            on_error=_on_error,
            _return_future=_return_future,
        )
        if _return_future:
            return future
|
| 1270 |
+
|
| 1271 |
+
def _queue_decision(self, trial, decision):
|
| 1272 |
+
# Get old decision, setting it to the current decision if it isn't set
|
| 1273 |
+
old_decision = self._queued_trial_decisions.setdefault(trial.trial_id, decision)
|
| 1274 |
+
|
| 1275 |
+
# Stopping always takes precedence. If we decided to stop, just quit
|
| 1276 |
+
if old_decision is TrialScheduler.STOP:
|
| 1277 |
+
return
|
| 1278 |
+
|
| 1279 |
+
# The old decision wasn't STOP. We update the decision only if it is
|
| 1280 |
+
# STOP or PAUSE. The action will only be CONTINUE if it was set by
|
| 1281 |
+
# the first received result and was never updated after that.
|
| 1282 |
+
if decision is TrialScheduler.STOP or decision is TrialScheduler.PAUSE:
|
| 1283 |
+
self._queued_trial_decisions[trial.trial_id] = decision
|
| 1284 |
+
|
| 1285 |
+
def _execute_action(self, trial: Trial, decision: str, after_save: bool = False):
|
| 1286 |
+
"""Executes action based on decision.
|
| 1287 |
+
|
| 1288 |
+
Args:
|
| 1289 |
+
trial: Trial to act on.
|
| 1290 |
+
decision: Scheduling decision to undertake.
|
| 1291 |
+
"""
|
| 1292 |
+
if decision == TrialScheduler.CONTINUE:
|
| 1293 |
+
self._schedule_trial_train(trial)
|
| 1294 |
+
elif decision == TrialScheduler.PAUSE:
|
| 1295 |
+
self.pause_trial(trial, should_checkpoint=not after_save)
|
| 1296 |
+
elif decision == TrialScheduler.STOP:
|
| 1297 |
+
self.stop_trial(trial)
|
| 1298 |
+
elif decision == TrialScheduler.NOOP:
|
| 1299 |
+
pass
|
| 1300 |
+
else:
|
| 1301 |
+
raise ValueError("Invalid decision: {}".format(decision))
|
| 1302 |
+
|
| 1303 |
+
def _maybe_execute_queued_decision(self, trial: Trial, after_save: bool = False):
|
| 1304 |
+
# `self._queued_trial_decisions` now contains a final decision
|
| 1305 |
+
# based on all results
|
| 1306 |
+
final_decision = self._queued_trial_decisions.pop(trial.trial_id, None)
|
| 1307 |
+
if final_decision:
|
| 1308 |
+
logger.debug(
|
| 1309 |
+
f"Executing final queued decision for {trial}: {final_decision}"
|
| 1310 |
+
)
|
| 1311 |
+
self._execute_action(trial, final_decision, after_save=after_save)
|
| 1312 |
+
|
| 1313 |
+
def _stop_experiment_if_needed(self):
|
| 1314 |
+
"""Stops all trials."""
|
| 1315 |
+
fail_fast = self._fail_fast and self._has_errored
|
| 1316 |
+
if self._stopper.stop_all() or fail_fast or self._should_stop_experiment:
|
| 1317 |
+
self._search_alg.set_finished()
|
| 1318 |
+
[
|
| 1319 |
+
self._schedule_trial_stop(t)
|
| 1320 |
+
for t in self._trials
|
| 1321 |
+
if t.status not in {Trial.ERROR, Trial.TERMINATED}
|
| 1322 |
+
]
|
| 1323 |
+
|
| 1324 |
+
###
|
| 1325 |
+
# Failure
|
| 1326 |
+
def _trial_task_failure(self, trial: Trial, exception: Exception):
|
| 1327 |
+
if self._fail_fast == self.RAISE:
|
| 1328 |
+
raise exception
|
| 1329 |
+
else:
|
| 1330 |
+
if self._print_trial_errors:
|
| 1331 |
+
logger.error(f"Trial task failed for trial {trial}", exc_info=exception)
|
| 1332 |
+
self._process_trial_failure(trial, exception=exception)
|
| 1333 |
+
|
| 1334 |
+
    def _process_trial_failure(
        self,
        trial: Trial,
        exception: Union[TuneError, RayTaskError, RayActorError],
    ):
        """Handle trial failure.

        Attempt trial recovery if possible, clean up state otherwise.

        Args:
            trial: Failed trial.
            exception: Exception prior to invoking this method.
        """
        self._has_errored = True
        trial.handle_error(exception)
        if trial.status == Trial.RUNNING and trial.should_recover():
            # The trial still has retries left: requeue it and notify callbacks.
            self._try_recover(trial, exc=exception)
            self._callbacks.on_trial_recover(
                iteration=self._iteration, trials=self._trials, trial=trial
            )
        elif trial.status in {Trial.RUNNING, Trial.PENDING}:
            # No recovery possible: notify scheduler/searcher, stop the actor,
            # and fire the error callbacks.
            self._scheduler_alg.on_trial_error(self, trial)
            self._search_alg.on_trial_complete(trial.trial_id, error=True)
            self._schedule_trial_stop(trial, exception=exception)
            self._callbacks.on_trial_error(
                iteration=self._iteration, trials=self._trials, trial=trial
            )
|
| 1361 |
+
|
| 1362 |
+
    def _schedule_trial_stop(self, trial: Trial, exception: Optional[Exception] = None):
        """Stop the trial's actor and update trial bookkeeping.

        Args:
            trial: Trial to stop.
            exception: If set, the trial is marked ERROR (instead of
                TERMINATED) and its actor is never cached for reuse.
        """
        if trial.status == Trial.ERROR:
            logger.debug(f"Not requesting trial STOP as it is ERROR already: {trial}")
            return

        logger.debug(f"Requesting to STOP actor for trial {trial}")

        if trial.is_saving:
            # Defer the actual stop until the in-flight save resolves.
            logger.debug(
                f"Trial {trial} is currently saving/pausing. Scheduling STOP after "
                f"save resolved."
            )
            self._cached_trial_decisions[trial.trial_id] = TrialScheduler.STOP

        # Clear transient save/restore state before changing status.
        trial.temporary_state.saving_to = None
        trial.temporary_state.restoring_from = None

        self._set_trial_status(trial, Trial.ERROR if exception else Trial.TERMINATED)
        trial.set_location(_Location())

        if trial not in self._trial_to_actor:
            logger.debug(f"Will not STOP trial actor as it is not live: {trial}")
            return

        tracked_actor = self._trial_to_actor[trial]

        # Drop in-flight futures so they cannot resolve after the stop.
        self._actor_manager.clear_actor_task_futures(tracked_actor=tracked_actor)

        self._mark_trial_to_checkpoint(trial)

        if not exception and self._maybe_cache_trial_actor(trial):
            # Trial runner has been cached
            return

        logger.debug(f"Terminating actor for trial {trial}: {tracked_actor}")

        # Remove the trial<->actor mapping in both directions.
        tracked_actor = self._trial_to_actor.pop(trial)
        self._actor_to_trial.pop(tracked_actor)

        trial.set_ray_actor(None)

        self._remove_actor(tracked_actor=tracked_actor)
|
| 1404 |
+
|
| 1405 |
+
    def stop_trial(self, trial):
        """The canonical implementation of stopping a trial.

        Trials may be in any external status when this function is called.
        If trial is in state PENDING or PAUSED, calls `on_trial_remove` for
        scheduler and `on_trial_complete()` for search_alg.
        If trial is in state RUNNING, calls `on_trial_complete` for scheduler
        and search_alg if RUNNING. Caller to ensure that there is no
        outstanding future to be handled for the trial. If there is, the future
        would be discarded.
        """
        try:
            if trial.status in [Trial.ERROR, Trial.TERMINATED]:
                # Already finished -- nothing to do.
                return
            elif trial.status in [Trial.PENDING, Trial.PAUSED]:
                self._scheduler_alg.on_trial_remove(self, trial)
                self._search_alg.on_trial_complete(trial.trial_id)
            elif trial.status is Trial.RUNNING:
                # By this time trial.last_result should have been
                # updated already.
                self._scheduler_alg.on_trial_complete(
                    self, trial, flatten_dict(trial.last_result)
                )
                self._search_alg.on_trial_complete(
                    trial.trial_id, result=flatten_dict(trial.last_result)
                )
            self._callbacks.on_trial_complete(
                iteration=self._iteration, trials=self._trials, trial=trial
            )
            self._schedule_graceful_trial_stop(trial)
            self._live_trials.discard(trial)
        except Exception as e:
            logger.exception("Trial %s: Error stopping trial.", trial)
            if self._fail_fast == self.RAISE:
                raise
            # Wrap non-Tune exceptions so downstream handling is uniform.
            if isinstance(e, TuneError):
                self._process_trial_failure(trial, exception=e)
            else:
                self._process_trial_failure(
                    trial, _TuneStopTrialError(traceback.format_exc())
                )
|
| 1446 |
+
|
| 1447 |
+
def _schedule_graceful_trial_stop(self, trial: Trial):
|
| 1448 |
+
self._schedule_trial_export(trial)
|
| 1449 |
+
if trial.status != "ERROR":
|
| 1450 |
+
self._schedule_trial_stop(trial)
|
| 1451 |
+
|
| 1452 |
+
    def _schedule_trial_pause(self, trial: Trial, should_checkpoint: bool = True):
        """Pause ``trial``, optionally creating a checkpoint first.

        With ``should_checkpoint``, a save is scheduled and the PAUSE is
        cached to execute once the save resolves; otherwise the actor is
        stopped immediately and the trial is set to PAUSED.
        """
        if trial not in self._trial_to_actor:
            logger.debug(
                f"Trial PAUSE requested for trial {trial} but trial is already "
                f"stopping. Ignoring."
            )
            return

        if should_checkpoint:
            # Defer the actual pause until the checkpoint is processed.
            self._cached_trial_decisions[trial.trial_id] = TrialScheduler.PAUSE
            self._schedule_trial_save(trial=trial)
        else:
            self._schedule_trial_stop(trial)
            self._set_trial_status(trial, Trial.PAUSED)
|
| 1466 |
+
|
| 1467 |
+
###
|
| 1468 |
+
# TRAIN
|
| 1469 |
+
|
| 1470 |
+
def _schedule_trial_train(self, trial: Trial):
|
| 1471 |
+
args = ()
|
| 1472 |
+
method_name = "train"
|
| 1473 |
+
|
| 1474 |
+
buffer_length, buffer_time_s = self._maybe_buffer_training(trial)
|
| 1475 |
+
|
| 1476 |
+
if buffer_length > 1:
|
| 1477 |
+
method_name = "train_buffered"
|
| 1478 |
+
args = (buffer_length, buffer_time_s)
|
| 1479 |
+
|
| 1480 |
+
logger.debug(f"Scheduling future {method_name.upper()} for trial {trial}")
|
| 1481 |
+
|
| 1482 |
+
self._schedule_trial_task(
|
| 1483 |
+
trial=trial,
|
| 1484 |
+
method_name=method_name,
|
| 1485 |
+
args=args,
|
| 1486 |
+
on_result=self._on_training_result,
|
| 1487 |
+
on_error=self._trial_task_failure,
|
| 1488 |
+
)
|
| 1489 |
+
|
| 1490 |
+
def _maybe_buffer_training(self, trial: Trial) -> Tuple[int, float]:
|
| 1491 |
+
buffer_time_s = max(
|
| 1492 |
+
self._buffer_min_time_s,
|
| 1493 |
+
min(self._buffer_max_time_s, self._actor_manager.num_actor_tasks // 10),
|
| 1494 |
+
)
|
| 1495 |
+
buffer_length = self._buffer_length
|
| 1496 |
+
|
| 1497 |
+
if buffer_length > 1 and trial.checkpoint_at_end:
|
| 1498 |
+
# If a trial checkpoint can be triggered externally,
|
| 1499 |
+
# it is not safe to buffer results.
|
| 1500 |
+
if log_once("trial_executor_buffer_checkpoint"):
|
| 1501 |
+
logger.warning(
|
| 1502 |
+
"Disabling buffered training as you passed "
|
| 1503 |
+
"`checkpoint_at_end` to `train.CheckpointConfig()`."
|
| 1504 |
+
)
|
| 1505 |
+
return 1, buffer_time_s
|
| 1506 |
+
|
| 1507 |
+
if buffer_length > 1 and trial.checkpoint_freq > 0:
|
| 1508 |
+
return min(buffer_length, trial.checkpoint_freq), buffer_time_s
|
| 1509 |
+
|
| 1510 |
+
return buffer_length, buffer_time_s
|
| 1511 |
+
|
| 1512 |
+
###
|
| 1513 |
+
# RESULT
|
| 1514 |
+
|
| 1515 |
+
def _on_training_result(self, trial, result):
|
| 1516 |
+
if not isinstance(result, list):
|
| 1517 |
+
result = [result]
|
| 1518 |
+
with warn_if_slow("process_trial_result"):
|
| 1519 |
+
self._process_trial_results(trial, result)
|
| 1520 |
+
self._maybe_execute_queued_decision(trial, after_save=False)
|
| 1521 |
+
|
| 1522 |
+
    def _process_trial_results(self, trial, results):
        """Process a batch of (possibly buffered) results for ``trial``.

        Stops early when a non-training future was scheduled (decision is
        None) or when the decision is STOP.
        """
        logger.debug(f"Processing trial results for trial {trial}: {results}")
        with warn_if_slow(
            "process_trial_results",
            message="Processing trial results took {duration:.3f} s, "
            "which may be a performance bottleneck. Please consider "
            "reporting results less frequently to Ray Tune.",
        ):
            for i, result in enumerate(results):
                with warn_if_slow("process_trial_result"):
                    decision = self._process_trial_result(trial, result)
                if decision is None:
                    # If we didn't get a decision, this means a
                    # non-training future (e.g. a save) was scheduled.
                    # We do not allow processing more results then.
                    if i < len(results) - 1:
                        if log_once("tune_controller_buffer_checkpoint"):
                            logger.warning(
                                f"Trial {trial} has a non-training future "
                                f"scheduled but {len(results) - i} results "
                                f"left to process. This means that a "
                                f"checkpoint was requested, but buffered "
                                f"training was continued before it was "
                                f"saved. Consider using non-buffered "
                                f"training by setting the env variable "
                                f"`TUNE_RESULT_BUFFER_LENGTH=1`."
                            )
                elif decision == TrialScheduler.STOP:
                    # If the decision is to stop the trial,
                    # ignore all results that came after that.
                    break
|
| 1553 |
+
|
| 1554 |
+
    def _process_trial_result(self, trial, result):
        """Process a single result for ``trial``.

        Consults the stopper, scheduler and searcher, notifies callbacks,
        triggers checkpointing, and either queues the resulting decision
        or (while a save is in flight) caches it.

        Returns:
            The scheduling decision, or None if the decision was cached
            because the trial is currently saving.
        """
        result.update(trial_id=trial.trial_id)
        is_duplicate = RESULT_DUPLICATE in result
        force_checkpoint = result.get(SHOULD_CHECKPOINT, False)
        # TrialScheduler and SearchAlgorithm still receive a
        # notification because there may be special handling for
        # the `on_trial_complete` hook.
        if is_duplicate:
            logger.debug("Trial finished without logging 'done'.")
            result = trial.last_result
            result.update(done=True)

        self._total_time += result.get(TIME_THIS_ITER_S, 0)

        flat_result = flatten_dict(result)
        self._validate_result_metrics(flat_result)

        if self._stopper(trial.trial_id, result) or trial.should_stop(flat_result):
            decision = TrialScheduler.STOP
        else:
            with warn_if_slow("scheduler.on_trial_result"):
                decision = self._scheduler_alg.on_trial_result(
                    self._wrapped(), trial, flat_result
                )
        if decision == TrialScheduler.STOP:
            result.update(done=True)
        else:
            # Only updating search alg if the trial is not to be stopped.
            with warn_if_slow("search_alg.on_trial_result"):
                self._search_alg.on_trial_result(trial.trial_id, flat_result)

        # If this is not a duplicate result, the callbacks should
        # be informed about the result.
        if not is_duplicate:
            with warn_if_slow("callbacks.on_trial_result"):
                self._callbacks.on_trial_result(
                    iteration=self._iteration,
                    trials=self._trials,
                    trial=trial,
                    result=result.copy(),
                )
            trial.update_last_result(result)
            # Include in next experiment checkpoint
            self._mark_trial_to_checkpoint(trial)

        # Checkpoints to disk. This should be checked even if
        # the scheduler decision is STOP or PAUSE. Note that
        # PAUSE only checkpoints to memory and does not update
        # the global checkpoint state.
        if decision != TrialScheduler.PAUSE:
            # TODO(justinvyu): This is a temporary hack to fix pausing trials.
            # We already schedule a save task in `pause_trial`, so no need
            # to do it again here.
            self._checkpoint_trial_if_needed(trial, force=force_checkpoint)

        if trial.is_saving:
            logger.debug(f"Caching trial decision for trial {trial}: {decision}")
            # Cache decision to execute on after the save is processed.
            # This prevents changing the trial's state or kicking off
            # another training step prematurely.
            if not self._cached_trial_decisions.get(trial.trial_id) or decision in {
                TrialScheduler.PAUSE,
                TrialScheduler.STOP,
            }:
                # If already set, only overwrite if it's a PAUSE or STOP. This is
                # to avoid that CONTINUE decisions from a training step that resolve
                # late overwrite PAUSE/STOP decision.
                self._cached_trial_decisions[trial.trial_id] = decision
            return None
        else:
            self._queue_decision(trial, decision)
            return decision
|
| 1626 |
+
|
| 1627 |
+
    def _validate_result_metrics(self, result):
        """
        Check if any of the required metrics was not reported
        in the last result. If the only items are ``done`` or any of
        DEBUG_METRICS, this means that no result was ever received and
        the trial just returned. This is also okay and will not raise
        an error.

        This will ignore checking for the DEFAULT_METRIC.

        Raises:
            ValueError: If a metric required by the tuner, scheduler, or
                searcher is missing from ``result`` and strict checking
                is not disabled via TUNE_DISABLE_STRICT_METRIC_CHECKING.
        """
        if int(os.environ.get("TUNE_DISABLE_STRICT_METRIC_CHECKING", 0)) != 1 and (
            len({k for k in result if k not in list(DEBUG_METRICS) + [DONE]}) > 1
        ):
            # DEFAULT_METRIC placeholders are excluded from validation.
            base_metric = self._metric if self._metric != DEFAULT_METRIC else None
            scheduler_metric = (
                self._scheduler_alg.metric
                if self._scheduler_alg.metric != DEFAULT_METRIC
                else None
            )
            search_metrics = (
                self._search_alg.metric
                if self._search_alg.metric != DEFAULT_METRIC
                else None
            )

            if isinstance(search_metrics, str):
                search_metrics = [search_metrics]

            # Determine the first missing metric and where it was required,
            # so the error message can point at the responsible component.
            if base_metric and base_metric not in result:
                report_metric = base_metric
                location = "tune.TuneConfig()"
            elif scheduler_metric and scheduler_metric not in result:
                report_metric = scheduler_metric
                location = type(self._scheduler_alg).__name__
            elif search_metrics and any(
                search_metric not in result for search_metric in search_metrics
            ):
                report_metric = list(
                    filter(
                        lambda search_metric: search_metric not in result,
                        search_metrics,
                    )
                )
                if len(report_metric) == 1:
                    report_metric = report_metric[0]
                location = type(self._search_alg).__name__
            else:
                report_metric = None
                location = None

            if report_metric:
                raise ValueError(
                    "Trial returned a result which did not include the "
                    "specified metric(s) `{}` that `{}` expects. "
                    "Make sure your calls to `tune.report()` include the "
                    "metric, or set the "
                    "TUNE_DISABLE_STRICT_METRIC_CHECKING "
                    "environment variable to 1. Result: {}".format(
                        report_metric, location, result
                    )
                )
|
| 1688 |
+
|
| 1689 |
+
###
|
| 1690 |
+
# SAVE
|
| 1691 |
+
    def _schedule_trial_save(
        self,
        trial: Trial,
        result: Optional[Dict] = None,
    ) -> Optional[_FutureTrainingResult]:
        """Schedule a checkpoint save on the trial's actor.

        Args:
            trial: Trial to save.
            result: Result to associate with the save; defaults to the
                trial's last result.

        Returns:
            A future training result wrapping the save, or None if the
            trial has no live actor.
        """
        if trial not in self._trial_to_actor:
            logger.debug(
                f"Trial SAVE requested for trial {trial} but trial is already "
                f"stopping. Ignoring."
            )
            return None

        result = result or trial.last_result

        future = self._schedule_trial_task(
            trial=trial,
            method_name="save",
            on_result=self._on_saving_result,
            on_error=self._trial_task_failure,
            _return_future=True,
        )
        # TODO(justinvyu): `trial.saving_to` (and trial.is_saving) is needed
        # in order to prevent a done=True result from executing a STOP decision
        # (which clears all futures) before the save gets processed.
        # Keep this in for now while `train` and `save` are 2 separate steps.
        trial.temporary_state.saving_to = _FutureTrainingResult(future)

        # `trial.saving_to` holds a future training result -- this is only used
        # in the case of PBT to block until the checkpoint is ready.
        # In all other situations, the checkpoint future is processed by the
        # actor event manager when it is ready.
        return trial.temporary_state.saving_to
|
| 1723 |
+
|
| 1724 |
+
    def _on_saving_result(self, trial, checkpoint_value: _TrainingResult):
        """Callback for a resolved save future.

        Processes the checkpoint, notifies callbacks, then executes any
        decision that was queued while the save was in flight.
        """
        with warn_if_slow("process_trial_save"):
            self._process_trial_save(trial, checkpoint_value)

        with warn_if_slow("callbacks.on_trial_save"):
            self._callbacks.on_trial_save(
                iteration=self._iteration, trials=self._trials, trial=trial
            )

        self._maybe_execute_queued_decision(trial, after_save=True)
|
| 1734 |
+
|
| 1735 |
+
    def _process_trial_save(self, trial: Trial, checkpoint_value: _TrainingResult):
        """Processes a trial save.

        Acts on the decision cached during the last `_process_trial` call.

        Args:
            trial: Trial being saved.
            checkpoint_value: Training result holding the checkpoint.
        """
        logger.debug("Trial %s: Processing trial save.", trial)

        try:
            if not checkpoint_value.checkpoint:
                logger.debug(f"Got empty checkpoint for trial {trial}")
            else:
                try:
                    self._callbacks.on_checkpoint(
                        iteration=self._iteration,
                        trials=self._trials,
                        trial=trial,
                        checkpoint=checkpoint_value.checkpoint,
                    )
                except Exception:
                    logger.warning(
                        "Error encountered during processing of callbacks. "
                        "Ray Train/Tune recently changed the checkpoint interface "
                        "that is passed to callbacks. If you implemented your own "
                        "callback with an `on_checkpoint` handler, please review "
                        "the checkpoint interface and adjust your code "
                        "accordingly."
                    )
                    raise

                trial.on_checkpoint(checkpoint_value)

                self._checkpoint_manager.on_trial_checkpoint(trial)

                # Include in next experiment checkpoint.
                self._mark_trial_to_checkpoint(trial)
        except Exception:
            # Checkpoint handling failures are logged but do not abort
            # the controller; the cached decision below is still applied.
            logger.exception(
                "Trial %s: Error handling checkpoint %s", trial, checkpoint_value
            )

        trial.temporary_state.saving_to = None
        decision = self._cached_trial_decisions.pop(trial.trial_id, None)
        if decision and checkpoint_value:
            self._queue_decision(trial, decision)
|
| 1781 |
+
|
| 1782 |
+
def _checkpoint_trial_if_needed(self, trial, force=False):
|
| 1783 |
+
"""Checkpoints trial based off trial.last_result."""
|
| 1784 |
+
if trial.should_checkpoint() or force:
|
| 1785 |
+
# Save trial runtime if possible.
|
| 1786 |
+
if trial.temporary_state.ray_actor:
|
| 1787 |
+
self._schedule_trial_save(trial)
|
| 1788 |
+
|
| 1789 |
+
###
|
| 1790 |
+
# RESTORE
|
| 1791 |
+
    def _schedule_trial_restore(self, trial: Trial) -> bool:
        """Schedule a restore from the trial's latest checkpoint.

        Returns:
            True if a restore task was scheduled, False if the trial
            has no checkpoint to restore from.
        """
        checkpoint_result = trial.latest_checkpoint_result

        if not checkpoint_result:
            logger.debug(f"Not restoring trial {trial}: No checkpoint found.")
            return False

        # TODO(justinvyu): Is this really needed?
        trial.temporary_state.restoring_from = checkpoint_result

        method_name = "restore"
        args = (checkpoint_result,)
        self._schedule_trial_task(
            trial=trial,
            method_name=method_name,
            args=args,
            kwargs={},
            on_result=self._on_restoring_result,
            on_error=self._trial_task_failure,
        )
        return True
|
| 1812 |
+
|
| 1813 |
+
    def _on_restoring_result(self, trial: Trial, result: Any):
        # Callback for a resolved restore future; the result payload itself
        # is not used.
        self._process_trial_restore(trial)
|
| 1815 |
+
|
| 1816 |
+
    def _process_trial_restore(self, trial: Trial):
        """Processes a trial restore.

        Marks the trial RUNNING again and immediately schedules the next
        training step.

        Args:
            trial: Trial being restored.
        """
        logger.debug("Trial %s: Processing trial restore.", trial)
        trial.on_restore()
        logger.debug("Trial %s: Restore processed successfully", trial)
        self._set_trial_status(trial, Trial.RUNNING)
        self._schedule_trial_train(trial)
        self._live_trials.add(trial)
|
| 1828 |
+
|
| 1829 |
+
    def _try_recover(
        self, trial: Trial, exc: Union[TuneError, RayTaskError, RayActorError]
    ):
        """Tries to recover trial.

        Notifies SearchAlgorithm and Scheduler if failure to recover.

        Args:
            trial: Trial to recover.
            exc: Exception prior to invoking this method.
        """
        # Any decision cached for this trial is stale now.
        self._cached_trial_decisions.pop(trial.trial_id, None)
        # Resetting this, in case that the trial is in saving status when it crashes.
        if trial.is_saving:
            trial.temporary_state.saving_to = None
        self._schedule_trial_stop(trial, exception=exc)

        logger.debug("Trial %s: Notifying Scheduler and requeueing.", trial)
        self._requeue_trial(trial)
|
| 1848 |
+
|
| 1849 |
+
def _requeue_trial(self, trial):
|
| 1850 |
+
"""Notification to TrialScheduler and requeue trial.
|
| 1851 |
+
|
| 1852 |
+
This does not notify the SearchAlgorithm because the function
|
| 1853 |
+
evaluation is still in progress.
|
| 1854 |
+
|
| 1855 |
+
"""
|
| 1856 |
+
self._scheduler_alg.on_trial_error(self, trial)
|
| 1857 |
+
self._set_trial_status(trial, status=Trial.PENDING)
|
| 1858 |
+
|
| 1859 |
+
# TODO(rliaw): Right now, this pushes the trial to the end of queue
|
| 1860 |
+
# because restoration can be expensive. However, this is not
|
| 1861 |
+
# ideal since it just hides the issue - a better fix would
|
| 1862 |
+
# be to use an actor table to detect the IP of the Trainable
|
| 1863 |
+
# and rsync the files there.
|
| 1864 |
+
# See https://github.com/ray-project/ray/issues/5168
|
| 1865 |
+
self._trials.pop(self._trials.index(trial))
|
| 1866 |
+
self._trials.append(trial)
|
| 1867 |
+
self._live_trials.add(trial)
|
| 1868 |
+
|
| 1869 |
+
with warn_if_slow("scheduler.on_trial_add"):
|
| 1870 |
+
self._scheduler_alg.on_trial_add(self._wrapped(), trial)
|
| 1871 |
+
|
| 1872 |
+
###
|
| 1873 |
+
# EXPORT
|
| 1874 |
+
    def _schedule_trial_export(self, trial: Trial):
        """Export the trial's model in its configured formats, if any.

        Blocks until the export task resolves (see TODO below).
        """
        if not trial.export_formats or len(trial.export_formats) <= 0:
            return

        # Todo: We are waiting here synchronously until the task resolved.
        # Instead, we should schedule the trial stop after the export resolved.
        # This requires changes in TrialRunner, which we can remove once the
        # legacy execution path has been removed.
        future = self._schedule_trial_task(
            trial=trial,
            method_name="export_model",
            args=(trial.export_formats,),
            on_result=None,
            on_error=self._trial_task_failure,
            _return_future=True,
        )
        # NOTE(review): reaches into the actor manager's private
        # `_actor_task_events` to block on the future -- confirm no public
        # API exists for this before refactoring.
        self._actor_manager._actor_task_events.resolve_future(future)
|
| 1891 |
+
|
| 1892 |
+
###
|
| 1893 |
+
# RESET
|
| 1894 |
+
    def _schedule_trial_reset(
        self,
        trial: Trial,
        new_config: Dict,
        new_experiment_tag: str,
    ):
        """Reset a trial's actor in place with a new config (actor reuse).

        Args:
            trial: Trial whose actor should be reset.
            new_config: Configuration the actor should adopt.
            new_experiment_tag: New experiment tag for the trial.
        """
        trial.set_experiment_tag(new_experiment_tag)
        trial.set_config(new_config)

        # Pass magic variables
        extra_config = copy.deepcopy(new_config)
        extra_config[TRIAL_INFO] = _TrialInfo(trial)

        stdout_file, stderr_file = trial.log_to_file
        extra_config[STDOUT_FILE] = stdout_file
        extra_config[STDERR_FILE] = stderr_file

        logger_creator = partial(
            _noop_logger_creator, logdir=trial.storage.trial_working_directory
        )

        # Track the in-flight reset so concurrent logic can account for it.
        self._resetting_trials.add(trial)
        self._schedule_trial_task(
            trial=trial,
            method_name="reset",
            args=(extra_config,),
            kwargs={
                "logger_creator": logger_creator,
                "storage": trial.storage,
            },
            on_result=self._on_trial_reset,
            on_error=self._trial_task_failure,
        )
|
| 1927 |
+
|
| 1928 |
+
    def _on_trial_reset(self, trial: Trial, success: bool):
        """Callback for a resolved actor reset.

        On failure the trial errors out and its actor is stopped; on
        success the (reused) actor is treated as freshly started.
        """
        self._resetting_trials.remove(trial)

        if not success:
            info = (
                "Trainable runner reuse requires reset_config() to be "
                "implemented and return True."
            )

            logger.error(f"Could not re-use actor for trial {trial}: {info}")

            exception = _AbortTrialExecution(info)

            trial.handle_error(exception)
            self._schedule_trial_stop(trial, exception=exception)
            return

        tracked_actor = self._trial_to_actor[trial]

        self._actor_started(tracked_actor, log="REUSED")
|
| 1948 |
+
|
| 1949 |
+
    def request_stop_trial(self, trial):
        """Queue ``trial`` to be stopped by the control loop."""
        self._stop_queue.append(trial)
|
| 1951 |
+
|
| 1952 |
+
    def request_stop_experiment(self):
        """Flag the whole experiment to be stopped by the control loop."""
        self._should_stop_experiment = True
|
| 1954 |
+
|
| 1955 |
+
def _process_stop_requests(self):
|
| 1956 |
+
while self._stop_queue:
|
| 1957 |
+
t = self._stop_queue.pop()
|
| 1958 |
+
self.stop_trial(t)
|
| 1959 |
+
|
| 1960 |
+
    def pause_trial(self, trial: Trial, should_checkpoint: bool = True):
        """Pause a trial and reset the necessary state variables for resuming later.

        Args:
            trial: Trial to pause.
            should_checkpoint: Whether or not an in-memory checkpoint should be created
                for this paused trial. Defaults to True.
        """
        # NOTE: The cached trial decision is not needed since we will overrule this
        # decision with PAUSE.
        self._cached_trial_decisions.pop(trial.trial_id, None)
        self._schedule_trial_pause(trial, should_checkpoint=should_checkpoint)
|
| 1972 |
+
|
| 1973 |
+
    def cleanup(self):
        """Cleanup trials and callbacks."""
        self._cleanup_trials()
        self.end_experiment_callbacks()
|
| 1977 |
+
|
| 1978 |
+
    def __getstate__(self):
        """Gets state for trial.

        Note that this is not used as a pickling override as
        does not have all fields.
        """
        state = self.__dict__.copy()
        # Strip all runtime-only state (live actors, queues, managers,
        # trial bookkeeping) -- only configuration-level state survives.
        for k in [
            "_trials",
            "_live_trials",
            "_stop_queue",
            "_search_alg",
            "_placeholder_resolvers",
            "_scheduler_alg",
            "_pending_trial_queue_times",
            "_callbacks",
            "_checkpoint_manager",
            "_storage",
            "_insufficient_resources_manager",
            "_actor_manager",
            "_class_cache",
            "_resource_updater",
            "_trials_to_cache",
            "_trial_metadata",
            "_actor_to_trial",
            "_trial_to_actor",
            "_resources_to_pending_trials",
            "_pending_trials",
            "_pending_trials_list",
            "_running_trials",
            "_paused_trials",
            "_stopped_trials",
            "_failed_trials",
            "_resetting_trials",
            "_started_actors",
            "_stopping_actors",
            "_staged_trials",
            "_actor_cache",
        ]:
            del state[k]
        return state
|
| 2019 |
+
|
| 2020 |
+
def __setstate__(self, state):
|
| 2021 |
+
# Use session_str from previous checkpoint if does not exist
|
| 2022 |
+
session_str = state.pop("_session_str")
|
| 2023 |
+
self.__dict__.setdefault("_session_str", session_str)
|
| 2024 |
+
# Use start_time from previous checkpoint if does not exist
|
| 2025 |
+
start_time = state.pop("_start_time")
|
| 2026 |
+
self.__dict__.setdefault("_start_time", start_time)
|
| 2027 |
+
|
| 2028 |
+
self.__dict__.update(state)
|
| 2029 |
+
self._checkpoint_manager = self._create_checkpoint_manager()
|
| 2030 |
+
|
| 2031 |
+
|
| 2032 |
+
class _TrialExecutorWrapper:
|
| 2033 |
+
"""Wraps around TrialExecutor class, intercepts API calls and warns users
|
| 2034 |
+
of restricted API access.
|
| 2035 |
+
|
| 2036 |
+
This is meant to facilitate restricting
|
| 2037 |
+
the current API exposure of TrialExecutor by TrialScheduler.
|
| 2038 |
+
"""
|
| 2039 |
+
|
| 2040 |
+
def __init__(
|
| 2041 |
+
self,
|
| 2042 |
+
trial_executor: "_FakeRayTrialExecutor",
|
| 2043 |
+
whitelist_attr: Optional[set] = None,
|
| 2044 |
+
):
|
| 2045 |
+
self._trial_executor = trial_executor
|
| 2046 |
+
self._whitelist_attr = whitelist_attr or set()
|
| 2047 |
+
|
| 2048 |
+
for attr in self._whitelist_attr:
|
| 2049 |
+
assert hasattr(self._trial_executor, attr)
|
| 2050 |
+
|
| 2051 |
+
def __getattr__(self, attr):
|
| 2052 |
+
if attr not in self._whitelist_attr:
|
| 2053 |
+
if log_once("restrict_accessing_trial_executor"):
|
| 2054 |
+
logger.warning(
|
| 2055 |
+
f"You are trying to access {attr} interface of "
|
| 2056 |
+
f"TrialExecutor in TrialScheduler, which is being "
|
| 2057 |
+
f"restricted. If you believe it is reasonable for "
|
| 2058 |
+
f"your scheduler to access this TrialExecutor API, "
|
| 2059 |
+
f"please reach out to Ray team on GitHub. A more "
|
| 2060 |
+
f"strict API access pattern would be enforced "
|
| 2061 |
+
f"starting 1.12.0"
|
| 2062 |
+
)
|
| 2063 |
+
return getattr(self._trial_executor, attr)
|
| 2064 |
+
|
| 2065 |
+
|
| 2066 |
+
@DeveloperAPI
|
| 2067 |
+
class TrialRunnerWrapper:
|
| 2068 |
+
"""Wraps around TrialRunner class, intercepts API calls and warns users
|
| 2069 |
+
of restricted API access.
|
| 2070 |
+
|
| 2071 |
+
This is meant to facilitate restricting
|
| 2072 |
+
the current API exposure of TrialRunner by TrialScheduler.
|
| 2073 |
+
"""
|
| 2074 |
+
|
| 2075 |
+
_EXECUTOR_ATTR = "trial_executor"
|
| 2076 |
+
|
| 2077 |
+
def __init__(
|
| 2078 |
+
self,
|
| 2079 |
+
tune_controller: TuneController,
|
| 2080 |
+
trial_executor: Any,
|
| 2081 |
+
runner_whitelist_attr: Optional[set] = None,
|
| 2082 |
+
executor_whitelist_attr: Optional[set] = None,
|
| 2083 |
+
):
|
| 2084 |
+
self._tune_controller = tune_controller
|
| 2085 |
+
self._trial_executor = _TrialExecutorWrapper(
|
| 2086 |
+
trial_executor, executor_whitelist_attr
|
| 2087 |
+
)
|
| 2088 |
+
self._runner_whitelist_attr = runner_whitelist_attr or set()
|
| 2089 |
+
|
| 2090 |
+
for attr in self._runner_whitelist_attr:
|
| 2091 |
+
assert hasattr(self, attr)
|
| 2092 |
+
|
| 2093 |
+
def __getattr__(self, attr):
|
| 2094 |
+
if attr == self._EXECUTOR_ATTR:
|
| 2095 |
+
return self._trial_executor
|
| 2096 |
+
if attr not in self._runner_whitelist_attr:
|
| 2097 |
+
if log_once("restrict_accessing_tune_controller"):
|
| 2098 |
+
logger.warning(
|
| 2099 |
+
f"You are trying to access {attr} interface of "
|
| 2100 |
+
f"TrialRunner in TrialScheduler, which is being "
|
| 2101 |
+
f"restricted. If you believe it is reasonable for "
|
| 2102 |
+
f"your scheduler to access this TrialRunner API, "
|
| 2103 |
+
f"please reach out to Ray team on GitHub. A more "
|
| 2104 |
+
f"strict API access pattern would be enforced "
|
| 2105 |
+
f"starting 1.12s.0"
|
| 2106 |
+
)
|
| 2107 |
+
return getattr(self._tune_controller, attr)
|
| 2108 |
+
|
| 2109 |
+
|
| 2110 |
+
def _get_max_pending_trials(search_alg: SearchAlgorithm) -> int:
|
| 2111 |
+
max_pending_trials = os.getenv("TUNE_MAX_PENDING_TRIALS_PG", "auto")
|
| 2112 |
+
|
| 2113 |
+
if max_pending_trials != "auto":
|
| 2114 |
+
return int(max_pending_trials)
|
| 2115 |
+
|
| 2116 |
+
# Else, auto detect.
|
| 2117 |
+
|
| 2118 |
+
# Only BasicVariantGenerator supports > 1 pending trials.
|
| 2119 |
+
# This is because we don't want to generate too many trials
|
| 2120 |
+
# before we fit the searcher model.
|
| 2121 |
+
if not isinstance(search_alg, BasicVariantGenerator):
|
| 2122 |
+
return 1
|
| 2123 |
+
|
| 2124 |
+
# Allow up to at least 200 pending trials to trigger fast autoscaling
|
| 2125 |
+
min_autoscaling_rate = 200
|
| 2126 |
+
|
| 2127 |
+
# Allow more pending trials for larger clusters (based on number of CPUs)
|
| 2128 |
+
cluster_cpus = ray.cluster_resources().get("CPU", 1.0)
|
| 2129 |
+
max_pending_trials = max(min_autoscaling_rate, int(cluster_cpus * 1.1))
|
| 2130 |
+
|
| 2131 |
+
if max_pending_trials > min_autoscaling_rate:
|
| 2132 |
+
logger.warning(
|
| 2133 |
+
f"The maximum number of pending trials has been "
|
| 2134 |
+
f"automatically set to the number of available "
|
| 2135 |
+
f"cluster CPUs, which is high "
|
| 2136 |
+
f"({max_pending_trials} CPUs/pending trials). "
|
| 2137 |
+
f"If you're running an experiment with a large number "
|
| 2138 |
+
f"of trials, this could lead to scheduling overhead. "
|
| 2139 |
+
f"In this case, consider setting the "
|
| 2140 |
+
f"`TUNE_MAX_PENDING_TRIALS_PG` environment variable "
|
| 2141 |
+
f"to the desired maximum number of concurrent pending trials."
|
| 2142 |
+
)
|
| 2143 |
+
|
| 2144 |
+
return max_pending_trials
|
| 2145 |
+
|
| 2146 |
+
|
| 2147 |
+
class _FakeRayTrialExecutor:
|
| 2148 |
+
"""The TuneController does not use a RayTrialExecutor anymore.
|
| 2149 |
+
|
| 2150 |
+
Instead, we pass this fake executor for searchers/schedulers to use
|
| 2151 |
+
as an interface.
|
| 2152 |
+
|
| 2153 |
+
In the future, we should have the searchers/schedulers either interact with
|
| 2154 |
+
the tune controller, or define a different API for more fine-grained scheduler
|
| 2155 |
+
control.
|
| 2156 |
+
"""
|
| 2157 |
+
|
| 2158 |
+
def __init__(self, tune_controller: TuneController):
|
| 2159 |
+
self._tune_controller = tune_controller
|
| 2160 |
+
|
| 2161 |
+
def pause_trial(self, trial: Trial, should_checkpoint: bool = True):
|
| 2162 |
+
return self._tune_controller._schedule_trial_pause(
|
| 2163 |
+
trial, should_checkpoint=should_checkpoint
|
| 2164 |
+
)
|
| 2165 |
+
|
| 2166 |
+
def save(
|
| 2167 |
+
self,
|
| 2168 |
+
trial: Trial,
|
| 2169 |
+
result: Optional[Dict] = None,
|
| 2170 |
+
) -> Optional[_FutureTrainingResult]:
|
| 2171 |
+
return self._tune_controller._schedule_trial_save(trial=trial, result=result)
|
| 2172 |
+
|
| 2173 |
+
def has_resources_for_trial(self, trial: Trial):
|
| 2174 |
+
return True
|
| 2175 |
+
|
| 2176 |
+
@property
|
| 2177 |
+
def _resource_updater(self):
|
| 2178 |
+
return self._tune_controller._resource_updater
|
| 2179 |
+
|
| 2180 |
+
def force_reconcilation_on_next_step_end(self):
|
| 2181 |
+
pass
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.tune.experiment.experiment import Experiment, _convert_to_experiment_list
|
| 2 |
+
from ray.tune.experiment.trial import Trial
|
| 3 |
+
|
| 4 |
+
__all__ = ["Experiment", "_convert_to_experiment_list", "Trial"]
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (429 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/config_parser.cpython-311.pyc
ADDED
|
Binary file (8.23 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/experiment.cpython-311.pyc
ADDED
|
Binary file (20.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/__pycache__/trial.cpython-311.pyc
ADDED
|
Binary file (51 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/config_parser.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
from ray.train import CheckpointConfig
|
| 5 |
+
from ray.tune.error import TuneError
|
| 6 |
+
from ray.tune.experiment import Trial
|
| 7 |
+
from ray.tune.resources import json_to_resources
|
| 8 |
+
|
| 9 |
+
# For compatibility under py2 to consider unicode as str
|
| 10 |
+
from ray.tune.utils.serialization import TuneFunctionEncoder
|
| 11 |
+
from ray.tune.utils.util import SafeFallbackEncoder
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _make_parser(parser_creator=None, **kwargs):
|
| 15 |
+
"""Returns a base argument parser for the ray.tune tool.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
parser_creator: A constructor for the parser class.
|
| 19 |
+
kwargs: Non-positional args to be passed into the
|
| 20 |
+
parser class constructor.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
if parser_creator:
|
| 24 |
+
parser = parser_creator(**kwargs)
|
| 25 |
+
else:
|
| 26 |
+
parser = argparse.ArgumentParser(**kwargs)
|
| 27 |
+
|
| 28 |
+
# Note: keep this in sync with rllib/train.py
|
| 29 |
+
parser.add_argument(
|
| 30 |
+
"--run",
|
| 31 |
+
default=None,
|
| 32 |
+
type=str,
|
| 33 |
+
help="The algorithm or model to train. This may refer to the name "
|
| 34 |
+
"of a built-on algorithm (e.g. RLlib's DQN or PPO), or a "
|
| 35 |
+
"user-defined trainable function or class registered in the "
|
| 36 |
+
"tune registry.",
|
| 37 |
+
)
|
| 38 |
+
parser.add_argument(
|
| 39 |
+
"--stop",
|
| 40 |
+
default="{}",
|
| 41 |
+
type=json.loads,
|
| 42 |
+
help="The stopping criteria, specified in JSON. The keys may be any "
|
| 43 |
+
"field returned by 'train()' e.g. "
|
| 44 |
+
'\'{"time_total_s": 600, "training_iteration": 100000}\' to stop '
|
| 45 |
+
"after 600 seconds or 100k iterations, whichever is reached first.",
|
| 46 |
+
)
|
| 47 |
+
parser.add_argument(
|
| 48 |
+
"--config",
|
| 49 |
+
default="{}",
|
| 50 |
+
type=json.loads,
|
| 51 |
+
help="Algorithm-specific configuration (e.g. env, hyperparams), "
|
| 52 |
+
"specified in JSON.",
|
| 53 |
+
)
|
| 54 |
+
parser.add_argument(
|
| 55 |
+
"--resources-per-trial",
|
| 56 |
+
default=None,
|
| 57 |
+
type=json_to_resources,
|
| 58 |
+
help="Override the machine resources to allocate per trial, e.g. "
|
| 59 |
+
'\'{"cpu": 64, "gpu": 8}\'. Note that GPUs will not be assigned '
|
| 60 |
+
"unless you specify them here. For RLlib, you probably want to "
|
| 61 |
+
"leave this alone and use RLlib configs to control parallelism.",
|
| 62 |
+
)
|
| 63 |
+
parser.add_argument(
|
| 64 |
+
"--num-samples",
|
| 65 |
+
default=1,
|
| 66 |
+
type=int,
|
| 67 |
+
help="Number of times to repeat each trial.",
|
| 68 |
+
)
|
| 69 |
+
parser.add_argument(
|
| 70 |
+
"--checkpoint-freq",
|
| 71 |
+
default=0,
|
| 72 |
+
type=int,
|
| 73 |
+
help="How many training iterations between checkpoints. "
|
| 74 |
+
"A value of 0 (default) disables checkpointing.",
|
| 75 |
+
)
|
| 76 |
+
parser.add_argument(
|
| 77 |
+
"--checkpoint-at-end",
|
| 78 |
+
action="store_true",
|
| 79 |
+
help="Whether to checkpoint at the end of the experiment. Default is False.",
|
| 80 |
+
)
|
| 81 |
+
parser.add_argument(
|
| 82 |
+
"--keep-checkpoints-num",
|
| 83 |
+
default=None,
|
| 84 |
+
type=int,
|
| 85 |
+
help="Number of best checkpoints to keep. Others get "
|
| 86 |
+
"deleted. Default (None) keeps all checkpoints.",
|
| 87 |
+
)
|
| 88 |
+
parser.add_argument(
|
| 89 |
+
"--checkpoint-score-attr",
|
| 90 |
+
default="training_iteration",
|
| 91 |
+
type=str,
|
| 92 |
+
help="Specifies by which attribute to rank the best checkpoint. "
|
| 93 |
+
"Default is increasing order. If attribute starts with min- it "
|
| 94 |
+
"will rank attribute in decreasing order. Example: "
|
| 95 |
+
"min-validation_loss",
|
| 96 |
+
)
|
| 97 |
+
parser.add_argument(
|
| 98 |
+
"--export-formats",
|
| 99 |
+
default=None,
|
| 100 |
+
help="List of formats that exported at the end of the experiment. "
|
| 101 |
+
"Default is None. For RLlib, 'checkpoint' and 'model' are "
|
| 102 |
+
"supported for TensorFlow policy graphs.",
|
| 103 |
+
)
|
| 104 |
+
parser.add_argument(
|
| 105 |
+
"--max-failures",
|
| 106 |
+
default=3,
|
| 107 |
+
type=int,
|
| 108 |
+
help="Try to recover a trial from its last checkpoint at least this "
|
| 109 |
+
"many times. Only applies if checkpointing is enabled.",
|
| 110 |
+
)
|
| 111 |
+
parser.add_argument(
|
| 112 |
+
"--scheduler",
|
| 113 |
+
default="FIFO",
|
| 114 |
+
type=str,
|
| 115 |
+
help="FIFO (default), MedianStopping, AsyncHyperBand, "
|
| 116 |
+
"HyperBand, or HyperOpt.",
|
| 117 |
+
)
|
| 118 |
+
parser.add_argument(
|
| 119 |
+
"--scheduler-config",
|
| 120 |
+
default="{}",
|
| 121 |
+
type=json.loads,
|
| 122 |
+
help="Config options to pass to the scheduler.",
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
# Note: this currently only makes sense when running a single trial
|
| 126 |
+
parser.add_argument(
|
| 127 |
+
"--restore",
|
| 128 |
+
default=None,
|
| 129 |
+
type=str,
|
| 130 |
+
help="If specified, restore from this checkpoint.",
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
return parser
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _to_argv(config):
|
| 137 |
+
"""Converts configuration to a command line argument format."""
|
| 138 |
+
argv = []
|
| 139 |
+
for k, v in config.items():
|
| 140 |
+
if "-" in k:
|
| 141 |
+
raise ValueError("Use '_' instead of '-' in `{}`".format(k))
|
| 142 |
+
if v is None:
|
| 143 |
+
continue
|
| 144 |
+
if not isinstance(v, bool) or v: # for argparse flags
|
| 145 |
+
argv.append("--{}".format(k.replace("_", "-")))
|
| 146 |
+
if isinstance(v, str):
|
| 147 |
+
argv.append(v)
|
| 148 |
+
elif isinstance(v, bool):
|
| 149 |
+
pass
|
| 150 |
+
elif callable(v):
|
| 151 |
+
argv.append(json.dumps(v, cls=TuneFunctionEncoder))
|
| 152 |
+
else:
|
| 153 |
+
argv.append(json.dumps(v, cls=SafeFallbackEncoder))
|
| 154 |
+
return argv
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
_cached_pgf = {}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def _create_trial_from_spec(
|
| 161 |
+
spec: dict, parser: argparse.ArgumentParser, **trial_kwargs
|
| 162 |
+
):
|
| 163 |
+
"""Creates a Trial object from parsing the spec.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
spec: A resolved experiment specification. Arguments should
|
| 167 |
+
The args here should correspond to the command line flags
|
| 168 |
+
in ray.tune.experiment.config_parser.
|
| 169 |
+
parser: An argument parser object from
|
| 170 |
+
make_parser.
|
| 171 |
+
trial_kwargs: Extra keyword arguments used in instantiating the Trial.
|
| 172 |
+
|
| 173 |
+
Returns:
|
| 174 |
+
A trial object with corresponding parameters to the specification.
|
| 175 |
+
"""
|
| 176 |
+
global _cached_pgf
|
| 177 |
+
|
| 178 |
+
spec = spec.copy()
|
| 179 |
+
resources = spec.pop("resources_per_trial", None)
|
| 180 |
+
|
| 181 |
+
try:
|
| 182 |
+
args, _ = parser.parse_known_args(_to_argv(spec))
|
| 183 |
+
except SystemExit:
|
| 184 |
+
raise TuneError("Error parsing args, see above message", spec)
|
| 185 |
+
|
| 186 |
+
if resources:
|
| 187 |
+
trial_kwargs["placement_group_factory"] = resources
|
| 188 |
+
|
| 189 |
+
checkpoint_config = spec.get("checkpoint_config", CheckpointConfig())
|
| 190 |
+
|
| 191 |
+
return Trial(
|
| 192 |
+
# Submitting trial via server in py2.7 creates Unicode, which does not
|
| 193 |
+
# convert to string in a straightforward manner.
|
| 194 |
+
trainable_name=spec["run"],
|
| 195 |
+
# json.load leads to str -> unicode in py2.7
|
| 196 |
+
config=spec.get("config", {}),
|
| 197 |
+
# json.load leads to str -> unicode in py2.7
|
| 198 |
+
stopping_criterion=spec.get("stop", {}),
|
| 199 |
+
checkpoint_config=checkpoint_config,
|
| 200 |
+
export_formats=spec.get("export_formats", []),
|
| 201 |
+
# str(None) doesn't create None
|
| 202 |
+
restore_path=spec.get("restore"),
|
| 203 |
+
trial_name_creator=spec.get("trial_name_creator"),
|
| 204 |
+
trial_dirname_creator=spec.get("trial_dirname_creator"),
|
| 205 |
+
log_to_file=spec.get("log_to_file"),
|
| 206 |
+
# str(None) doesn't create None
|
| 207 |
+
max_failures=args.max_failures,
|
| 208 |
+
storage=spec.get("storage"),
|
| 209 |
+
**trial_kwargs,
|
| 210 |
+
)
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/experiment.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import datetime
|
| 3 |
+
import logging
|
| 4 |
+
import pprint as pp
|
| 5 |
+
import traceback
|
| 6 |
+
from functools import partial
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from pickle import PicklingError
|
| 9 |
+
from typing import (
|
| 10 |
+
TYPE_CHECKING,
|
| 11 |
+
Any,
|
| 12 |
+
Callable,
|
| 13 |
+
Dict,
|
| 14 |
+
List,
|
| 15 |
+
Mapping,
|
| 16 |
+
Optional,
|
| 17 |
+
Sequence,
|
| 18 |
+
Type,
|
| 19 |
+
Union,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
import ray
|
| 23 |
+
from ray.exceptions import RpcError
|
| 24 |
+
from ray.train import CheckpointConfig, SyncConfig
|
| 25 |
+
from ray.train._internal.storage import StorageContext
|
| 26 |
+
from ray.train.constants import DEFAULT_STORAGE_PATH
|
| 27 |
+
from ray.tune.error import TuneError
|
| 28 |
+
from ray.tune.registry import is_function_trainable, register_trainable
|
| 29 |
+
from ray.tune.stopper import CombinedStopper, FunctionStopper, Stopper, TimeoutStopper
|
| 30 |
+
from ray.util.annotations import Deprecated, DeveloperAPI
|
| 31 |
+
|
| 32 |
+
if TYPE_CHECKING:
|
| 33 |
+
import pyarrow.fs
|
| 34 |
+
|
| 35 |
+
from ray.tune import PlacementGroupFactory
|
| 36 |
+
from ray.tune.experiment import Trial
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _validate_log_to_file(log_to_file):
|
| 43 |
+
"""Validate ``train.RunConfig``'s ``log_to_file`` parameter. Return
|
| 44 |
+
validated relative stdout and stderr filenames."""
|
| 45 |
+
if not log_to_file:
|
| 46 |
+
stdout_file = stderr_file = None
|
| 47 |
+
elif isinstance(log_to_file, bool) and log_to_file:
|
| 48 |
+
stdout_file = "stdout"
|
| 49 |
+
stderr_file = "stderr"
|
| 50 |
+
elif isinstance(log_to_file, str):
|
| 51 |
+
stdout_file = stderr_file = log_to_file
|
| 52 |
+
elif isinstance(log_to_file, Sequence):
|
| 53 |
+
if len(log_to_file) != 2:
|
| 54 |
+
raise ValueError(
|
| 55 |
+
"If you pass a Sequence to `log_to_file` it has to have "
|
| 56 |
+
"a length of 2 (for stdout and stderr, respectively). The "
|
| 57 |
+
"Sequence you passed has length {}.".format(len(log_to_file))
|
| 58 |
+
)
|
| 59 |
+
stdout_file, stderr_file = log_to_file
|
| 60 |
+
else:
|
| 61 |
+
raise ValueError(
|
| 62 |
+
"You can pass a boolean, a string, or a Sequence of length 2 to "
|
| 63 |
+
"`log_to_file`, but you passed something else ({}).".format(
|
| 64 |
+
type(log_to_file)
|
| 65 |
+
)
|
| 66 |
+
)
|
| 67 |
+
return stdout_file, stderr_file
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@DeveloperAPI
|
| 71 |
+
class Experiment:
|
| 72 |
+
"""Tracks experiment specifications.
|
| 73 |
+
|
| 74 |
+
Implicitly registers the Trainable if needed. The args here take
|
| 75 |
+
the same meaning as the arguments defined `tune.py:run`.
|
| 76 |
+
|
| 77 |
+
.. code-block:: python
|
| 78 |
+
|
| 79 |
+
experiment_spec = Experiment(
|
| 80 |
+
"my_experiment_name",
|
| 81 |
+
my_func,
|
| 82 |
+
stop={"mean_accuracy": 100},
|
| 83 |
+
config={
|
| 84 |
+
"alpha": tune.grid_search([0.2, 0.4, 0.6]),
|
| 85 |
+
"beta": tune.grid_search([1, 2]),
|
| 86 |
+
},
|
| 87 |
+
resources_per_trial={
|
| 88 |
+
"cpu": 1,
|
| 89 |
+
"gpu": 0
|
| 90 |
+
},
|
| 91 |
+
num_samples=10,
|
| 92 |
+
local_dir="~/ray_results",
|
| 93 |
+
checkpoint_freq=10,
|
| 94 |
+
max_failures=2)
|
| 95 |
+
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
# Keys that will be present in `public_spec` dict.
|
| 99 |
+
PUBLIC_KEYS = {"stop", "num_samples", "time_budget_s"}
|
| 100 |
+
_storage_context_cls = StorageContext
|
| 101 |
+
|
| 102 |
+
def __init__(
|
| 103 |
+
self,
|
| 104 |
+
name: str,
|
| 105 |
+
run: Union[str, Callable, Type],
|
| 106 |
+
*,
|
| 107 |
+
stop: Optional[Union[Mapping, Stopper, Callable[[str, Mapping], bool]]] = None,
|
| 108 |
+
time_budget_s: Optional[Union[int, float, datetime.timedelta]] = None,
|
| 109 |
+
config: Optional[Dict[str, Any]] = None,
|
| 110 |
+
resources_per_trial: Union[
|
| 111 |
+
None, Mapping[str, Union[float, int, Mapping]], "PlacementGroupFactory"
|
| 112 |
+
] = None,
|
| 113 |
+
num_samples: int = 1,
|
| 114 |
+
storage_path: Optional[str] = None,
|
| 115 |
+
storage_filesystem: Optional["pyarrow.fs.FileSystem"] = None,
|
| 116 |
+
sync_config: Optional[Union[SyncConfig, dict]] = None,
|
| 117 |
+
checkpoint_config: Optional[Union[CheckpointConfig, dict]] = None,
|
| 118 |
+
trial_name_creator: Optional[Callable[["Trial"], str]] = None,
|
| 119 |
+
trial_dirname_creator: Optional[Callable[["Trial"], str]] = None,
|
| 120 |
+
log_to_file: bool = False,
|
| 121 |
+
export_formats: Optional[Sequence] = None,
|
| 122 |
+
max_failures: int = 0,
|
| 123 |
+
restore: Optional[str] = None,
|
| 124 |
+
# Deprecated
|
| 125 |
+
local_dir: Optional[str] = None,
|
| 126 |
+
):
|
| 127 |
+
if isinstance(checkpoint_config, dict):
|
| 128 |
+
checkpoint_config = CheckpointConfig(**checkpoint_config)
|
| 129 |
+
else:
|
| 130 |
+
checkpoint_config = checkpoint_config or CheckpointConfig()
|
| 131 |
+
|
| 132 |
+
if is_function_trainable(run):
|
| 133 |
+
if checkpoint_config.checkpoint_at_end:
|
| 134 |
+
raise ValueError(
|
| 135 |
+
"'checkpoint_at_end' cannot be used with a function trainable. "
|
| 136 |
+
"You should include one last call to "
|
| 137 |
+
"`ray.train.report(metrics=..., checkpoint=...)` "
|
| 138 |
+
"at the end of your training loop to get this behavior."
|
| 139 |
+
)
|
| 140 |
+
if checkpoint_config.checkpoint_frequency:
|
| 141 |
+
raise ValueError(
|
| 142 |
+
"'checkpoint_frequency' cannot be set for a function trainable. "
|
| 143 |
+
"You will need to report a checkpoint every "
|
| 144 |
+
"`checkpoint_frequency` iterations within your training loop using "
|
| 145 |
+
"`ray.train.report(metrics=..., checkpoint=...)` "
|
| 146 |
+
"to get this behavior."
|
| 147 |
+
)
|
| 148 |
+
try:
|
| 149 |
+
self._run_identifier = Experiment.register_if_needed(run)
|
| 150 |
+
except RpcError as e:
|
| 151 |
+
if e.rpc_code == ray._raylet.GRPC_STATUS_CODE_RESOURCE_EXHAUSTED:
|
| 152 |
+
raise TuneError(
|
| 153 |
+
f"The Trainable/training function is too large for grpc resource "
|
| 154 |
+
f"limit. Check that its definition is not implicitly capturing a "
|
| 155 |
+
f"large array or other object in scope. "
|
| 156 |
+
f"Tip: use tune.with_parameters() to put large objects "
|
| 157 |
+
f"in the Ray object store. \n"
|
| 158 |
+
f"Original exception: {traceback.format_exc()}"
|
| 159 |
+
)
|
| 160 |
+
else:
|
| 161 |
+
raise e
|
| 162 |
+
|
| 163 |
+
if not name:
|
| 164 |
+
name = StorageContext.get_experiment_dir_name(run)
|
| 165 |
+
|
| 166 |
+
storage_path = storage_path or DEFAULT_STORAGE_PATH
|
| 167 |
+
self.storage = self._storage_context_cls(
|
| 168 |
+
storage_path=storage_path,
|
| 169 |
+
storage_filesystem=storage_filesystem,
|
| 170 |
+
sync_config=sync_config,
|
| 171 |
+
experiment_dir_name=name,
|
| 172 |
+
)
|
| 173 |
+
logger.debug(f"StorageContext on the DRIVER:\n{self.storage}")
|
| 174 |
+
|
| 175 |
+
config = config or {}
|
| 176 |
+
if not isinstance(config, dict):
|
| 177 |
+
raise ValueError(
|
| 178 |
+
f"`Experiment(config)` must be a dict, got: {type(config)}. "
|
| 179 |
+
"Please convert your search space to a dict before passing it in."
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
self._stopper = None
|
| 183 |
+
stopping_criteria = {}
|
| 184 |
+
if not stop:
|
| 185 |
+
pass
|
| 186 |
+
elif isinstance(stop, list):
|
| 187 |
+
bad_stoppers = [s for s in stop if not isinstance(s, Stopper)]
|
| 188 |
+
if bad_stoppers:
|
| 189 |
+
stopper_types = [type(s) for s in stop]
|
| 190 |
+
raise ValueError(
|
| 191 |
+
"If you pass a list as the `stop` argument to "
|
| 192 |
+
"`train.RunConfig()`, each element must be an instance of "
|
| 193 |
+
f"`tune.stopper.Stopper`. Got {stopper_types}."
|
| 194 |
+
)
|
| 195 |
+
self._stopper = CombinedStopper(*stop)
|
| 196 |
+
elif isinstance(stop, dict):
|
| 197 |
+
stopping_criteria = stop
|
| 198 |
+
elif callable(stop):
|
| 199 |
+
if FunctionStopper.is_valid_function(stop):
|
| 200 |
+
self._stopper = FunctionStopper(stop)
|
| 201 |
+
elif isinstance(stop, Stopper):
|
| 202 |
+
self._stopper = stop
|
| 203 |
+
else:
|
| 204 |
+
raise ValueError(
|
| 205 |
+
"Provided stop object must be either a dict, "
|
| 206 |
+
"a function, or a subclass of "
|
| 207 |
+
f"`ray.tune.Stopper`. Got {type(stop)}."
|
| 208 |
+
)
|
| 209 |
+
else:
|
| 210 |
+
raise ValueError(
|
| 211 |
+
f"Invalid stop criteria: {stop}. Must be a "
|
| 212 |
+
f"callable or dict. Got {type(stop)}."
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
if time_budget_s:
|
| 216 |
+
if self._stopper:
|
| 217 |
+
self._stopper = CombinedStopper(
|
| 218 |
+
self._stopper, TimeoutStopper(time_budget_s)
|
| 219 |
+
)
|
| 220 |
+
else:
|
| 221 |
+
self._stopper = TimeoutStopper(time_budget_s)
|
| 222 |
+
|
| 223 |
+
stdout_file, stderr_file = _validate_log_to_file(log_to_file)
|
| 224 |
+
|
| 225 |
+
spec = {
|
| 226 |
+
"run": self._run_identifier,
|
| 227 |
+
"stop": stopping_criteria,
|
| 228 |
+
"time_budget_s": time_budget_s,
|
| 229 |
+
"config": config,
|
| 230 |
+
"resources_per_trial": resources_per_trial,
|
| 231 |
+
"num_samples": num_samples,
|
| 232 |
+
"checkpoint_config": checkpoint_config,
|
| 233 |
+
"trial_name_creator": trial_name_creator,
|
| 234 |
+
"trial_dirname_creator": trial_dirname_creator,
|
| 235 |
+
"log_to_file": (stdout_file, stderr_file),
|
| 236 |
+
"export_formats": export_formats or [],
|
| 237 |
+
"max_failures": max_failures,
|
| 238 |
+
"restore": (
|
| 239 |
+
Path(restore).expanduser().absolute().as_posix() if restore else None
|
| 240 |
+
),
|
| 241 |
+
"storage": self.storage,
|
| 242 |
+
}
|
| 243 |
+
self.spec = spec
|
| 244 |
+
|
| 245 |
+
@classmethod
|
| 246 |
+
def from_json(cls, name: str, spec: dict):
|
| 247 |
+
"""Generates an Experiment object from JSON.
|
| 248 |
+
|
| 249 |
+
Args:
|
| 250 |
+
name: Name of Experiment.
|
| 251 |
+
spec: JSON configuration of experiment.
|
| 252 |
+
"""
|
| 253 |
+
if "run" not in spec:
|
| 254 |
+
raise TuneError("No trainable specified!")
|
| 255 |
+
|
| 256 |
+
# Special case the `env` param for RLlib by automatically
|
| 257 |
+
# moving it into the `config` section.
|
| 258 |
+
if "env" in spec:
|
| 259 |
+
spec["config"] = spec.get("config", {})
|
| 260 |
+
spec["config"]["env"] = spec["env"]
|
| 261 |
+
del spec["env"]
|
| 262 |
+
|
| 263 |
+
if "sync_config" in spec and isinstance(spec["sync_config"], dict):
|
| 264 |
+
spec["sync_config"] = SyncConfig(**spec["sync_config"])
|
| 265 |
+
|
| 266 |
+
if "checkpoint_config" in spec and isinstance(spec["checkpoint_config"], dict):
|
| 267 |
+
spec["checkpoint_config"] = CheckpointConfig(**spec["checkpoint_config"])
|
| 268 |
+
|
| 269 |
+
spec = copy.deepcopy(spec)
|
| 270 |
+
|
| 271 |
+
run_value = spec.pop("run")
|
| 272 |
+
try:
|
| 273 |
+
exp = cls(name, run_value, **spec)
|
| 274 |
+
except TypeError as e:
|
| 275 |
+
raise TuneError(
|
| 276 |
+
f"Failed to load the following Tune experiment "
|
| 277 |
+
f"specification:\n\n {pp.pformat(spec)}.\n\n"
|
| 278 |
+
f"Please check that the arguments are valid. "
|
| 279 |
+
f"Experiment creation failed with the following "
|
| 280 |
+
f"error:\n {e}"
|
| 281 |
+
)
|
| 282 |
+
return exp
|
| 283 |
+
|
| 284 |
+
@classmethod
def get_trainable_name(cls, run_object: Union[str, Callable, Type]):
    """Get Trainable name.

    Args:
        run_object: Trainable to run. If string,
            assumes it is an ID and does not modify it. Otherwise,
            returns a string corresponding to the run_object name.

    Returns:
        A string representing the trainable identifier.

    Raises:
        TuneError: if ``run_object`` passed in is invalid.
    """
    from ray.tune.search.sample import Domain

    # Strings (registered IDs) and search-space domains pass through as-is.
    if isinstance(run_object, (str, Domain)):
        return run_object

    # Anything else must be a class or a callable to be runnable.
    if not (isinstance(run_object, type) or callable(run_object)):
        raise TuneError("Improper 'run' - not string nor trainable.")

    fallback = "DEFAULT"
    if hasattr(run_object, "_name"):
        # Trainable classes may carry an explicit name.
        return run_object._name
    if hasattr(run_object, "__name__"):
        dunder_name = run_object.__name__
        if dunder_name == "<lambda>":
            return "lambda"
        if dunder_name.startswith("<"):
            # Other synthetic names (e.g. "<locals>") are not usable.
            return fallback
        return dunder_name
    if (
        isinstance(run_object, partial)
        and hasattr(run_object, "func")
        and hasattr(run_object.func, "__name__")
    ):
        # functools.partial has no __name__ itself; use the wrapped function's.
        return run_object.func.__name__
    logger.warning("No name detected on trainable. Using {}.".format(fallback))
    return fallback
|
| 326 |
+
|
| 327 |
+
@classmethod
def register_if_needed(cls, run_object: Union[str, Callable, Type]):
    """Registers Trainable or Function at runtime.

    Assumes already registered if run_object is a string.
    Also, does not inspect interface of given run_object.

    Args:
        run_object: Trainable to run. If string,
            assumes it is an ID and does not modify it. Otherwise,
            returns a string corresponding to the run_object name.

    Returns:
        A string representing the trainable identifier.
    """
    from ray.tune.search.sample import Domain

    # A string is assumed to already be a registered trainable ID.
    if isinstance(run_object, str):
        return run_object
    # Search-space domains are resolved as variants later, not registered.
    if isinstance(run_object, Domain):
        logger.warning("Not registering trainable. Resolving as variant.")
        return run_object

    trainable_id = cls.get_trainable_name(run_object)
    try:
        register_trainable(trainable_id, run_object)
    except (TypeError, PicklingError) as e:
        # Registration serializes the trainable; surface pickling hints.
        extra_msg = (
            "Other options: "
            "\n-Try reproducing the issue by calling "
            "`pickle.dumps(trainable)`. "
            "\n-If the error is typing-related, try removing "
            "the type annotations and try again."
        )
        raise type(e)(str(e) + " " + extra_msg) from None
    return trainable_id
|
| 362 |
+
|
| 363 |
+
@property
def stopper(self):
    """The configured stopper object (``self._stopper``)."""
    return self._stopper
|
| 366 |
+
|
| 367 |
+
@property
def local_path(self) -> Optional[str]:
    """Local driver staging path for this experiment (from the storage context)."""
    return self.storage.experiment_driver_staging_path
|
| 370 |
+
|
| 371 |
+
@property
@Deprecated("Replaced by `local_path`")
def local_dir(self):
    """Deprecated accessor; use ``local_path`` instead."""
    # TODO(justinvyu): [Deprecated] Remove in 2.11.
    # NOTE: deliberately raises (hard removal) rather than emitting a warning.
    raise DeprecationWarning("Use `local_path` instead of `local_dir`.")
|
| 376 |
+
|
| 377 |
+
@property
def remote_path(self) -> Optional[str]:
    """Experiment path on the storage filesystem (may be remote, e.g. cloud)."""
    return self.storage.experiment_fs_path
|
| 380 |
+
|
| 381 |
+
@property
def path(self) -> Optional[str]:
    """Canonical experiment path: the storage path when set, else the local one."""
    remote = self.remote_path
    if remote:
        return remote
    return self.local_path
|
| 384 |
+
|
| 385 |
+
@property
def checkpoint_config(self):
    """Checkpoint configuration stored in the experiment spec (None if absent)."""
    return self.spec.get("checkpoint_config")
|
| 388 |
+
|
| 389 |
+
@property
@Deprecated("Replaced by `local_path`")
def checkpoint_dir(self):
    """Deprecated accessor; use ``local_path`` instead."""
    # TODO(justinvyu): [Deprecated] Remove in 2.11.
    # NOTE: deliberately raises (hard removal) rather than emitting a warning.
    raise DeprecationWarning("Use `local_path` instead of `checkpoint_dir`.")
|
| 394 |
+
|
| 395 |
+
@property
def run_identifier(self):
    """Returns a string representing the trainable identifier."""
    # Backed by self._run_identifier — presumably set at construction; confirm
    # in __init__ (not visible in this chunk).
    return self._run_identifier
|
| 399 |
+
|
| 400 |
+
@property
def public_spec(self) -> Dict[str, Any]:
    """Returns the spec dict with only the public-facing keys.

    Intended to be used for passing information to callbacks,
    Searchers and Schedulers.
    """
    allowed = self.PUBLIC_KEYS
    return {key: value for key, value in self.spec.items() if key in allowed}
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def _convert_to_experiment_list(experiments: Union[Experiment, List[Experiment], Dict]):
    """Produces a list of Experiment objects.

    Converts input from dict, single experiment, or list of
    experiments to list of experiments. If input is None,
    will return an empty list.

    Arguments:
        experiments: Experiments to run.

    Returns:
        List of experiments.

    Raises:
        TuneError: if the converted value is not a list of ``Experiment``
            objects.
    """
    exp_list = experiments

    # Transform list if necessary
    if experiments is None:
        exp_list = []
    elif isinstance(experiments, Experiment):
        exp_list = [experiments]
    elif isinstance(experiments, dict):
        # isinstance (rather than an exact `type(...) is dict` check) so dict
        # subclasses such as OrderedDict are converted instead of rejected.
        exp_list = [
            Experiment.from_json(name, spec) for name, spec in experiments.items()
        ]

    # Validate exp_list
    if isinstance(exp_list, list) and all(
        isinstance(exp, Experiment) for exp in exp_list
    ):
        if len(exp_list) > 1:
            logger.info(
                "Running with multiple concurrent experiments. "
                "All experiments will be using the same SearchAlgorithm."
            )
    else:
        raise TuneError("Invalid argument: {}".format(experiments))

    return exp_list
|
.venv/lib/python3.11/site-packages/ray/tune/experiment/trial.py
ADDED
|
@@ -0,0 +1,1073 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import re
|
| 7 |
+
import time
|
| 8 |
+
import uuid
|
| 9 |
+
from contextlib import contextmanager
|
| 10 |
+
from functools import partial
|
| 11 |
+
from numbers import Number
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
|
| 14 |
+
|
| 15 |
+
import ray
|
| 16 |
+
import ray.cloudpickle as cloudpickle
|
| 17 |
+
from ray._private.utils import binary_to_hex, hex_to_binary
|
| 18 |
+
from ray.air.constants import (
|
| 19 |
+
EXPR_ERROR_FILE,
|
| 20 |
+
EXPR_ERROR_PICKLE_FILE,
|
| 21 |
+
TRAINING_ITERATION,
|
| 22 |
+
)
|
| 23 |
+
from ray.exceptions import RayActorError, RayTaskError
|
| 24 |
+
from ray.train import Checkpoint, CheckpointConfig
|
| 25 |
+
from ray.train._internal.checkpoint_manager import _CheckpointManager
|
| 26 |
+
from ray.train._internal.session import _FutureTrainingResult, _TrainingResult
|
| 27 |
+
from ray.train._internal.storage import StorageContext, _exists_at_fs_path
|
| 28 |
+
from ray.train.constants import (
|
| 29 |
+
RAY_CHDIR_TO_TRIAL_DIR,
|
| 30 |
+
RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE,
|
| 31 |
+
)
|
| 32 |
+
from ray.tune.error import TuneError
|
| 33 |
+
from ray.tune.execution.placement_groups import (
|
| 34 |
+
PlacementGroupFactory,
|
| 35 |
+
resource_dict_to_pg_factory,
|
| 36 |
+
)
|
| 37 |
+
from ray.tune.logger import NoopLogger
|
| 38 |
+
|
| 39 |
+
# NOTE(rkn): We import ray.tune.registry here instead of importing the names we
|
| 40 |
+
# need because there are cyclic imports that may cause specific names to not
|
| 41 |
+
# have been defined yet. See https://github.com/ray-project/ray/issues/1716.
|
| 42 |
+
from ray.tune.registry import get_trainable_cls, validate_trainable
|
| 43 |
+
from ray.tune.result import (
|
| 44 |
+
DEBUG_METRICS,
|
| 45 |
+
DONE,
|
| 46 |
+
NODE_IP,
|
| 47 |
+
PID,
|
| 48 |
+
STDERR_FILE,
|
| 49 |
+
STDOUT_FILE,
|
| 50 |
+
TRIAL_ID,
|
| 51 |
+
TRIAL_INFO,
|
| 52 |
+
)
|
| 53 |
+
from ray.tune.trainable.metadata import _TrainingRunMetadata
|
| 54 |
+
from ray.tune.utils import date_str, flatten_dict
|
| 55 |
+
from ray.tune.utils.serialization import TuneFunctionDecoder, TuneFunctionEncoder
|
| 56 |
+
from ray.util import log_once
|
| 57 |
+
from ray.util.annotations import Deprecated, DeveloperAPI
|
| 58 |
+
|
| 59 |
+
DEBUG_PRINT_INTERVAL = 5
|
| 60 |
+
_DEFAULT_WIN_MAX_PATH_LENGTH = 260
|
| 61 |
+
TRIAL_STATE_FILENAME = "trial_metadata.json"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
logger = logging.getLogger(__name__)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class _Location:
    """Describes the location at which Trial is placed to run."""

    def __init__(self, hostname=None, pid=None):
        self.hostname = hostname
        self.pid = pid

    def __str__(self):
        # No pid recorded yet: the trial has not been placed anywhere.
        if not self.pid:
            return ""
        # On the local machine the hostname is redundant; show only the pid.
        if self.hostname == platform.node():
            return f"pid={self.pid}"
        return f"{self.hostname}:{self.pid}"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@DeveloperAPI
|
| 84 |
+
class ExportFormat:
    """Describes the format to import/export the trial Trainable.

    This may correspond to different file formats based on the
    Trainable implementation.
    """

    CHECKPOINT = "checkpoint"
    MODEL = "model"
    ONNX = "onnx"
    H5 = "h5"

    @staticmethod
    def validate(formats):
        """Normalizes ``formats`` in place and validates each entry.

        Each element is stripped and lower-cased before being checked.

        Raises:
            TuneError: if the format is unknown.
        """
        known = (
            ExportFormat.CHECKPOINT,
            ExportFormat.MODEL,
            ExportFormat.ONNX,
            ExportFormat.H5,
        )
        for index, raw in enumerate(formats):
            normalized = raw.strip().lower()
            formats[index] = normalized
            if normalized not in known:
                raise TuneError("Unsupported import/export format: " + normalized)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class _TrialInfo:
    """Serializable struct for holding information for a Trial.

    Attributes:
        trial_name: String name of the current trial.
        trial_id: trial_id of the trial
        trial_resources: resources used by trial.
    """

    def __init__(self, trial: "Trial"):
        # Snapshot plain values off the Trial so this struct can be
        # serialized without keeping a reference to the Trial itself.
        self._trial_name = str(trial)
        self._trial_id = trial.trial_id
        self._trial_resources = trial.placement_group_factory
        self._experiment_name = trial.experiment_dir_name

    @property
    def experiment_name(self):
        return self._experiment_name

    @property
    def trial_name(self):
        return self._trial_name

    @property
    def trial_id(self):
        return self._trial_id

    @property
    def trial_resources(self) -> PlacementGroupFactory:
        return self._trial_resources

    @trial_resources.setter
    def trial_resources(self, new_resources: PlacementGroupFactory):
        # The only mutable field: resources can be replaced after creation
        # (presumably by resource-updating schedulers — confirm with callers).
        self._trial_resources = new_resources
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class _TemporaryTrialState:
    """Temporary trial state.

    Values saved here should not be restored on resume.
    """

    def __init__(self):
        # Current placement (hostname/pid); empty until the trial is placed.
        self.location = _Location()

        # Handle to the remote trainable actor, if one is alive.
        self.ray_actor: Optional[ray.actor.ActorHandle] = None

        # In-flight save / restore operations, if any.
        self.saving_to: Optional[_FutureTrainingResult] = None
        self.restoring_from: Optional[_TrainingResult] = None

        self.num_restore_failures: int = 0

    def __getstate__(self):
        # Intentionally drop everything on pickling: this state is transient
        # and must not survive checkpoint/restore.
        return {}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _get_max_path_length() -> int:
    """Return the maximum filesystem path length for this platform."""
    if not hasattr(os, "pathconf"):
        # Windows: os.pathconf is unavailable; fall back to classic MAX_PATH.
        return _DEFAULT_WIN_MAX_PATH_LENGTH
    return os.pathconf("/", "PC_PATH_MAX")
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _create_unique_logdir_name(root: str, relative_logdir: str) -> str:
    """Return ``relative_logdir``, suffixed with a short random hex string if
    that directory already exists under ``root``."""
    target = Path(root).expanduser() / relative_logdir
    if not target.exists():
        return relative_logdir
    unique = relative_logdir + "_" + uuid.uuid4().hex[:4]
    logger.info(
        f"Creating a new dirname {unique} because "
        f"trial dirname '{relative_logdir}' already exists."
    )
    return unique
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _noop_logger_creator(config: Dict[str, Any], logdir: str):
    """Create a ``NoopLogger`` for a remote trainable, preparing its logdir.

    Side effects: records the actor's original working directory in
    ``TUNE_ORIG_WORKING_DIR``, creates ``logdir``, and (unless disabled via
    the env var, or running in local mode) chdirs into it.
    """
    # Upon remote process setup, record the actor's original working dir before
    # changing to the Tune logdir
    os.environ.setdefault("TUNE_ORIG_WORKING_DIR", os.getcwd())

    os.makedirs(logdir, exist_ok=True)

    should_chdir = bool(int(os.environ.get(RAY_CHDIR_TO_TRIAL_DIR, "1")))
    # Set the working dir to the trial directory in the remote process,
    # for user file writes (skipped in Ray local mode).
    if should_chdir and ray._private.worker._mode() != ray._private.worker.LOCAL_MODE:
        os.chdir(logdir)

    return NoopLogger(config, logdir)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def _get_trainable_kwargs(trial: "Trial") -> Dict[str, Any]:
    """Build the constructor kwargs for a trial's Trainable."""
    trial.init_local_path()

    logger_creator = partial(
        _noop_logger_creator, logdir=trial.storage.trial_working_directory
    )

    # Deep-copy so the trainable's view of the config cannot mutate the trial's.
    run_config = copy.deepcopy(trial.config)
    run_config[TRIAL_INFO] = _TrialInfo(trial)

    stdout_file, stderr_file = trial.log_to_file
    run_config[STDOUT_FILE] = stdout_file
    run_config[STDERR_FILE] = stderr_file

    assert trial.storage.trial_dir_name

    return {
        "config": run_config,
        "logger_creator": logger_creator,
        "storage": trial.storage,
    }
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
@contextmanager
def _change_working_directory(trial):
    """Context manager changing working directory to trial logdir.
    Used in local mode.

    For non-local mode it is no-op.
    """
    if ray._private.worker._mode() != ray._private.worker.LOCAL_MODE:
        yield
        return
    previous_dir = os.getcwd()
    os.chdir(trial.local_path)
    try:
        yield
    finally:
        # Always restore the original working directory.
        os.chdir(previous_dir)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
@DeveloperAPI
|
| 248 |
+
class Trial:
|
| 249 |
+
"""A trial object holds the state for one model training run.
|
| 250 |
+
|
| 251 |
+
Trials are themselves managed by the TrialRunner class, which implements
|
| 252 |
+
the event loop for submitting trial runs to a Ray cluster.
|
| 253 |
+
|
| 254 |
+
Trials start in the PENDING state, and transition to RUNNING once started.
|
| 255 |
+
On error, it transitions to ERROR, otherwise TERMINATED on success.
|
| 256 |
+
|
| 257 |
+
There are resources allocated to each trial. These should be specified
|
| 258 |
+
using ``PlacementGroupFactory``.
|
| 259 |
+
|
| 260 |
+
Attributes:
|
| 261 |
+
trainable_name: Name of the trainable object to be executed.
|
| 262 |
+
config: Provided configuration dictionary with evaluated params.
|
| 263 |
+
trial_id: Unique identifier for the trial.
|
| 264 |
+
path: Path where results for this trial are stored. Can be on
|
| 265 |
+
the local node or on cloud storage.
|
| 266 |
+
local_path: Path on the local disk where results are stored.
|
| 267 |
+
remote_path: Path on cloud storage where results are stored,
|
| 268 |
+
or None if not set.
|
| 269 |
+
relative_logdir: Directory of the trial relative to its
|
| 270 |
+
experiment directory.
|
| 271 |
+
evaluated_params: Evaluated parameters by search algorithm,
|
| 272 |
+
experiment_tag: Identifying trial name to show in the console
|
| 273 |
+
status: One of PENDING, RUNNING, PAUSED, TERMINATED, ERROR/
|
| 274 |
+
error_file: Path to the errors that this trial has raised.
|
| 275 |
+
|
| 276 |
+
"""
|
| 277 |
+
|
| 278 |
+
_nonjson_fields = [
|
| 279 |
+
"results",
|
| 280 |
+
"extra_arg",
|
| 281 |
+
"placement_group_factory",
|
| 282 |
+
"_resources",
|
| 283 |
+
"_default_placement_group_factory",
|
| 284 |
+
]
|
| 285 |
+
|
| 286 |
+
PENDING = "PENDING"
|
| 287 |
+
RUNNING = "RUNNING"
|
| 288 |
+
PAUSED = "PAUSED"
|
| 289 |
+
TERMINATED = "TERMINATED"
|
| 290 |
+
ERROR = "ERROR"
|
| 291 |
+
|
| 292 |
+
def __init__(
    self,
    trainable_name: str,
    *,
    config: Optional[Dict] = None,
    trial_id: Optional[str] = None,
    storage: Optional[StorageContext] = None,
    evaluated_params: Optional[Dict] = None,
    experiment_tag: str = "",
    placement_group_factory: Optional[PlacementGroupFactory] = None,
    stopping_criterion: Optional[Dict[str, float]] = None,
    checkpoint_config: Optional[CheckpointConfig] = None,
    export_formats: Optional[List[str]] = None,
    restore_path: Optional[str] = None,
    trial_name_creator: Optional[Callable[["Trial"], str]] = None,
    trial_dirname_creator: Optional[Callable[["Trial"], str]] = None,
    log_to_file: Union[Optional[str], Tuple[Optional[str], Optional[str]]] = None,
    max_failures: int = 0,
    stub: bool = False,
    _setup_default_resource: bool = True,
):
    """Initialize a new trial.

    The args here take the same meaning as the command line flags defined
    in ray.tune.experiment.config_parser.

    Args:
        trainable_name: Name of the registered trainable to run.
        config: Configuration dictionary for this trial.
        trial_id: Unique ID; generated via ``Trial.generate_id()`` if None.
        storage: Storage context (shallow-copied; trial dirname is written
            back into the copy by ``init_local_path``).
        checkpoint_config: Checkpointing options; a default
            ``CheckpointConfig`` is used if None.
        restore_path: If given, the trial starts by restoring from this
            checkpoint directory.
        max_failures: Number of times to retry the trial after failure.
        stub: If True, skip trainable validation (see comment below).
        _setup_default_resource: Whether to set up default resources.
            When initializing trials from checkpoints, this field is set to false,
            so that setting up default resources can be delayed till after
            ``trial.config`` is loaded from checkpoints.
    """
    # If this is set, trainables are not validated or looked up.
    # This can be used e.g. to initialize Trial objects from checkpoints
    # without loading the trainable first.
    self.stub = stub

    if not self.stub:
        validate_trainable(trainable_name)
    # Trial config
    self.trainable_name = trainable_name
    self.trial_id = Trial.generate_id() if trial_id is None else trial_id

    # Transient (non-restorable) state and persistent run metadata.
    self.temporary_state = _TemporaryTrialState()
    self.run_metadata = _TrainingRunMetadata()

    # Create a copy, since `init_local_path` updates the context with the
    # generated trial dirname.
    self.storage = copy.copy(storage)

    self.config = config or {}
    # Save a copy of the original unresolved config so that we can swap
    # out and update any reference config values after restoration.
    self.__unresolved_config = self.config

    # Parameters that Tune varies across searches.
    self.evaluated_params = evaluated_params or {}
    self.experiment_tag = experiment_tag
    self.stopping_criterion = stopping_criterion or {}

    self._setup_default_resource = _setup_default_resource

    # Normalize plain resource dicts into a PlacementGroupFactory.
    if placement_group_factory and not isinstance(
        placement_group_factory, PlacementGroupFactory
    ):
        placement_group_factory = resource_dict_to_pg_factory(
            placement_group_factory
        )

    self._default_placement_group_factory = placement_group_factory
    # Will be created in create_placement_group_factory().
    self.placement_group_factory = None

    self.log_to_file = log_to_file
    # Make sure `stdout_file, stderr_file = Trial.log_to_file` works
    if (
        not self.log_to_file
        or not isinstance(self.log_to_file, Sequence)
        or not len(self.log_to_file) == 2
    ):
        self.log_to_file = (None, None)

    self.max_failures = max_failures

    # Local trial state that is updated during the run
    self._default_result_or_future: Union[ray.ObjectRef, dict, None] = None

    self.export_formats = export_formats
    self.status = Trial.PENDING
    self.relative_logdir = None

    self.trial_name_creator = trial_name_creator
    self.trial_dirname_creator = trial_dirname_creator
    self.custom_trial_name = None
    self.custom_dirname = None

    # Checkpoint config
    checkpoint_config = checkpoint_config or CheckpointConfig()

    self.run_metadata.checkpoint_manager = _CheckpointManager(
        checkpoint_config=checkpoint_config
    )

    # Restoration fields
    self.restore_path = restore_path
    self._restore_checkpoint_result: Optional[_TrainingResult] = None
    if restore_path:
        # tune.run(restore) passes in a path without metrics.
        self._restore_checkpoint_result = _TrainingResult(
            checkpoint=Checkpoint.from_directory(restore_path), metrics={}
        )

    # The creators receive the (mostly-initialized) trial itself, so they
    # must run after the fields above are set.
    if trial_name_creator:
        self.custom_trial_name = trial_name_creator(self)

    if trial_dirname_creator:
        self.custom_dirname = trial_dirname_creator(self)
        if os.path.sep in self.custom_dirname:
            raise ValueError(
                f"Trial dirname must not contain '/'. Got {self.custom_dirname}"
            )

    self._state_json = None
|
| 415 |
+
|
| 416 |
+
def create_placement_group_factory(self):
    """Compute placement group factory if needed.

    Note: this must be called after all the placeholders in
    self.config are resolved.
    """
    trainable_cls = self.get_trainable_cls()
    if not trainable_cls or not self._setup_default_resource:
        # No trainable to ask, or default setup is deliberately delayed:
        # fall back to the user-provided factory, or cpu=1.
        self.placement_group_factory = (
            self._default_placement_group_factory or resource_dict_to_pg_factory()
        )
        return

    default_resources = trainable_cls.default_resource_request(self.config)

    if default_resources:
        # If Trainable returns resources, do not allow manual override via
        # `resources_per_trial` by the user.
        if self._default_placement_group_factory:
            raise TuneError(
                "Resources for {} have been automatically set to {} "
                "by its `default_resource_request()` method. Please "
                "clear the `resources_per_trial` option.".format(
                    trainable_cls, default_resources
                )
            )
        if not isinstance(default_resources, PlacementGroupFactory):
            default_resources = resource_dict_to_pg_factory(default_resources)

    # Priority: trainable's default_resource_request, then
    # resources_per_trial, then cpu=1.
    self.placement_group_factory = (
        default_resources
        or self._default_placement_group_factory
        or resource_dict_to_pg_factory()
    )
|
| 456 |
+
|
| 457 |
+
def _get_default_result_or_future(self) -> Optional[dict]:
    """Calls ray.get on self._default_result_or_future and assigns back.

    Returns None in case of exceptions.
    Will also set the trial location if runner is set.
    """
    # Resolve the future (if still one) into a plain dict, caching it.
    if self._default_result_or_future and isinstance(
        self._default_result_or_future, ray.ObjectRef
    ):
        try:
            self._default_result_or_future = ray.get(self._default_result_or_future)
        except RayActorError:  # error during initialization
            self._default_result_or_future = None
    # The default result carries the actor's node IP / PID; record them as
    # the trial's current location while the actor is alive.
    if self._default_result_or_future and self.temporary_state.ray_actor:
        self.set_location(
            _Location(
                self._default_result_or_future.get(NODE_IP),
                self._default_result_or_future.get(PID),
            )
        )
    return self._default_result_or_future
|
| 478 |
+
|
| 479 |
+
def resolve_config_placeholders(self, placeholder_resolvers: Dict[Tuple, Any]):
    """Re-resolve placeholder values in the trial config.

    Starts from the pristine unresolved config so that resolution is
    repeatable (e.g. after restoration).
    """
    from ray.tune.impl.placeholder import resolve_placeholders

    # Make a copy of the unresolved config before resolve it.
    self.config = copy.deepcopy(self.__unresolved_config)
    resolve_placeholders(self.config, placeholder_resolvers)
|
| 485 |
+
|
| 486 |
+
@property
def last_result(self) -> dict:
    """The most recent result dict for this trial.

    Resolution order:
    1. If the trial has reported at least once, return that result.
    2. Otherwise, fall back to the default results dict (obtained through
       Trainable.get_auto_filled_metrics), resolving its future if needed.
    3. In the worst case, return a dict containing only the trial ID.
    """
    result = self.run_metadata.last_result
    # "Empty" means nothing besides the trial ID has been reported yet.
    has_reported_metrics = any(key != TRIAL_ID for key in result)
    if not has_reported_metrics:
        self._get_default_result_or_future()
        result = self._default_result_or_future or result
    result.setdefault(TRIAL_ID, self.trial_id)
    return result
|
| 503 |
+
|
| 504 |
+
@property
|
| 505 |
+
def metric_analysis(self):
|
| 506 |
+
return self.run_metadata.metric_analysis
|
| 507 |
+
|
| 508 |
+
@property
|
| 509 |
+
def metric_n_steps(self):
|
| 510 |
+
return self.run_metadata.metric_n_steps
|
| 511 |
+
|
| 512 |
+
def get_ray_actor_ip(self) -> Optional[str]:
|
| 513 |
+
if self.temporary_state.location.hostname:
|
| 514 |
+
return self.temporary_state.location.hostname
|
| 515 |
+
|
| 516 |
+
if not self.temporary_state.ray_actor:
|
| 517 |
+
return None
|
| 518 |
+
|
| 519 |
+
hostname, pid = ray.get(
|
| 520 |
+
self.temporary_state.ray_actor.get_current_ip_pid.remote()
|
| 521 |
+
)
|
| 522 |
+
self.temporary_state.location = _Location(hostname, pid)
|
| 523 |
+
return self.temporary_state.location.hostname
|
| 524 |
+
|
| 525 |
+
@property
|
| 526 |
+
@Deprecated("Replaced by `local_experiment_path`")
|
| 527 |
+
def local_dir(self):
|
| 528 |
+
return self.local_experiment_path
|
| 529 |
+
|
| 530 |
+
@property
|
| 531 |
+
def experiment_dir_name(self):
|
| 532 |
+
return self.storage.experiment_dir_name
|
| 533 |
+
|
| 534 |
+
@property
|
| 535 |
+
def remote_experiment_path(self) -> str:
|
| 536 |
+
return self.storage.experiment_fs_path
|
| 537 |
+
|
| 538 |
+
@property
|
| 539 |
+
def local_experiment_path(self) -> str:
|
| 540 |
+
return self.storage.experiment_driver_staging_path
|
| 541 |
+
|
| 542 |
+
@property
|
| 543 |
+
@Deprecated("Replaced by `local_path`")
|
| 544 |
+
def logdir(self) -> Optional[str]:
|
| 545 |
+
# TODO(justinvyu): [Deprecated] Remove in 2.11.
|
| 546 |
+
raise DeprecationWarning("Use `local_path` instead of `logdir`.")
|
| 547 |
+
|
| 548 |
+
@property
|
| 549 |
+
def local_path(self) -> Optional[str]:
|
| 550 |
+
return self.storage.trial_driver_staging_path
|
| 551 |
+
|
| 552 |
+
@property
|
| 553 |
+
def path(self) -> Optional[str]:
|
| 554 |
+
return self.storage.trial_fs_path
|
| 555 |
+
|
| 556 |
+
@property
|
| 557 |
+
def has_reported_at_least_once(self) -> bool:
|
| 558 |
+
return bool(self.run_metadata.last_result)
|
| 559 |
+
|
| 560 |
+
@property
|
| 561 |
+
def node_ip(self):
|
| 562 |
+
return self.temporary_state.location.hostname
|
| 563 |
+
|
| 564 |
+
@property
|
| 565 |
+
def checkpoint_at_end(self):
|
| 566 |
+
config = self.run_metadata.checkpoint_manager.checkpoint_config
|
| 567 |
+
return config.checkpoint_at_end
|
| 568 |
+
|
| 569 |
+
@property
|
| 570 |
+
def checkpoint_freq(self):
|
| 571 |
+
config = self.run_metadata.checkpoint_manager.checkpoint_config
|
| 572 |
+
return config.checkpoint_frequency
|
| 573 |
+
|
| 574 |
+
@property
|
| 575 |
+
def latest_checkpoint_result(self) -> Optional[_TrainingResult]:
|
| 576 |
+
# NOTE: Fallback to the checkpoint passed in from `tune.run(restore)`
|
| 577 |
+
# if the trial hasn't saved any checkpoints itself yet.
|
| 578 |
+
return (
|
| 579 |
+
self.run_metadata.checkpoint_manager.latest_checkpoint_result
|
| 580 |
+
or self._restore_checkpoint_result
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
@property
|
| 584 |
+
def checkpoint(self) -> Optional[Checkpoint]:
|
| 585 |
+
"""Returns the most recent checkpoint if one has been saved."""
|
| 586 |
+
return (
|
| 587 |
+
self.latest_checkpoint_result.checkpoint
|
| 588 |
+
if self.latest_checkpoint_result
|
| 589 |
+
else None
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
@classmethod
|
| 593 |
+
def generate_id(cls):
|
| 594 |
+
return str(uuid.uuid4().hex)[:8]
|
| 595 |
+
|
| 596 |
+
def reset(self) -> "Trial":
|
| 597 |
+
# If there is `default_resource_request` associated with the trainable,
|
| 598 |
+
# clear `resources` and `placement_group_factory`.
|
| 599 |
+
# This is mainly relevant for RLlib tuning jobs, where we save users
|
| 600 |
+
# of the trouble to specify the resources themselves by having some
|
| 601 |
+
# default resources for popular RLlib algorithms.
|
| 602 |
+
trainable_cls = self.get_trainable_cls()
|
| 603 |
+
clear_resources = trainable_cls and trainable_cls.default_resource_request(
|
| 604 |
+
self.config
|
| 605 |
+
)
|
| 606 |
+
placement_group_factory = (
|
| 607 |
+
self.placement_group_factory if not clear_resources else None
|
| 608 |
+
)
|
| 609 |
+
|
| 610 |
+
checkpoint_config = self.run_metadata.checkpoint_manager.checkpoint_config
|
| 611 |
+
return Trial(
|
| 612 |
+
self.trainable_name,
|
| 613 |
+
config=self.config,
|
| 614 |
+
trial_id=None,
|
| 615 |
+
evaluated_params=self.evaluated_params,
|
| 616 |
+
experiment_tag=self.experiment_tag,
|
| 617 |
+
placement_group_factory=placement_group_factory,
|
| 618 |
+
stopping_criterion=self.stopping_criterion,
|
| 619 |
+
checkpoint_config=checkpoint_config,
|
| 620 |
+
export_formats=self.export_formats,
|
| 621 |
+
restore_path=self.restore_path,
|
| 622 |
+
trial_name_creator=self.trial_name_creator,
|
| 623 |
+
trial_dirname_creator=self.trial_dirname_creator,
|
| 624 |
+
log_to_file=self.log_to_file,
|
| 625 |
+
max_failures=self.max_failures,
|
| 626 |
+
storage=self.storage,
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
@Deprecated("Replaced by `init_local_path()`")
|
| 630 |
+
def init_logdir(self):
|
| 631 |
+
# TODO(justinvyu): [Deprecated] Remove in 2.11.
|
| 632 |
+
raise DeprecationWarning("Use `init_local_path` instead of `init_logdir`.")
|
| 633 |
+
|
| 634 |
+
def init_local_path(self):
|
| 635 |
+
"""Init logdir."""
|
| 636 |
+
if not self.relative_logdir:
|
| 637 |
+
self.relative_logdir = _create_unique_logdir_name(
|
| 638 |
+
str(self.local_experiment_path), self._generate_dirname()
|
| 639 |
+
)
|
| 640 |
+
# Populate the storage context with the trial dir name we just generated.
|
| 641 |
+
self.storage.trial_dir_name = self.relative_logdir
|
| 642 |
+
|
| 643 |
+
assert self.local_path
|
| 644 |
+
logdir_path = Path(self.local_path)
|
| 645 |
+
max_path_length = _get_max_path_length()
|
| 646 |
+
if len(str(logdir_path)) >= max_path_length:
|
| 647 |
+
logger.warning(
|
| 648 |
+
f"The path to the trial log directory is too long "
|
| 649 |
+
f"(max length: {max_path_length}. "
|
| 650 |
+
f"Consider using `trial_dirname_creator` to shorten the path. "
|
| 651 |
+
f"Path: {logdir_path}"
|
| 652 |
+
)
|
| 653 |
+
logdir_path.mkdir(parents=True, exist_ok=True)
|
| 654 |
+
|
| 655 |
+
self.invalidate_json_state()
|
| 656 |
+
|
| 657 |
+
def update_resources(self, resources: Union[dict, PlacementGroupFactory]):
|
| 658 |
+
"""EXPERIMENTAL: Updates the resource requirements.
|
| 659 |
+
|
| 660 |
+
Should only be called when the trial is not running.
|
| 661 |
+
|
| 662 |
+
Raises:
|
| 663 |
+
ValueError: if trial status is running.
|
| 664 |
+
"""
|
| 665 |
+
if self.status is Trial.RUNNING:
|
| 666 |
+
raise ValueError("Cannot update resources while Trial is running.")
|
| 667 |
+
|
| 668 |
+
placement_group_factory = resources
|
| 669 |
+
if isinstance(resources, dict):
|
| 670 |
+
placement_group_factory = resource_dict_to_pg_factory(resources)
|
| 671 |
+
|
| 672 |
+
self.placement_group_factory = placement_group_factory
|
| 673 |
+
|
| 674 |
+
self.invalidate_json_state()
|
| 675 |
+
|
| 676 |
+
def set_ray_actor(self, ray_actor):
|
| 677 |
+
self.temporary_state.ray_actor = ray_actor
|
| 678 |
+
if ray_actor:
|
| 679 |
+
# Do not block here, the result will be gotten when last_result
|
| 680 |
+
# property is accessed
|
| 681 |
+
self._default_result_or_future = ray_actor.get_auto_filled_metrics.remote(
|
| 682 |
+
debug_metrics_only=True
|
| 683 |
+
)
|
| 684 |
+
|
| 685 |
+
def set_location(self, location):
|
| 686 |
+
"""Sets the location of the trial."""
|
| 687 |
+
self.temporary_state.location = location
|
| 688 |
+
|
| 689 |
+
def set_status(self, status):
|
| 690 |
+
"""Sets the status of the trial."""
|
| 691 |
+
self.status = status
|
| 692 |
+
if status == Trial.RUNNING:
|
| 693 |
+
if self.run_metadata.start_time is None:
|
| 694 |
+
self.run_metadata.start_time = time.time()
|
| 695 |
+
self.invalidate_json_state()
|
| 696 |
+
|
| 697 |
+
def set_config(self, config):
|
| 698 |
+
self.config = config
|
| 699 |
+
self.invalidate_json_state()
|
| 700 |
+
|
| 701 |
+
def set_experiment_tag(self, experiment_tag):
|
| 702 |
+
self.experiment_tag = experiment_tag
|
| 703 |
+
self.invalidate_json_state()
|
| 704 |
+
|
| 705 |
+
def set_storage(self, new_storage: StorageContext):
|
| 706 |
+
"""Updates the storage context of the trial.
|
| 707 |
+
|
| 708 |
+
If the `storage_path` or `experiment_dir_name` has changed, then this setter
|
| 709 |
+
also updates the paths of all checkpoints tracked by the checkpoint manager.
|
| 710 |
+
This enables restoration from a checkpoint if the user moves the directory.
|
| 711 |
+
"""
|
| 712 |
+
original_storage = self.storage
|
| 713 |
+
|
| 714 |
+
checkpoint_manager = self.run_metadata.checkpoint_manager
|
| 715 |
+
|
| 716 |
+
for checkpoint_result in checkpoint_manager.best_checkpoint_results:
|
| 717 |
+
checkpoint_result.checkpoint = Checkpoint(
|
| 718 |
+
path=checkpoint_result.checkpoint.path.replace(
|
| 719 |
+
original_storage.trial_fs_path, new_storage.trial_fs_path, 1
|
| 720 |
+
),
|
| 721 |
+
filesystem=new_storage.storage_filesystem,
|
| 722 |
+
)
|
| 723 |
+
latest_checkpoint_result = checkpoint_manager.latest_checkpoint_result
|
| 724 |
+
if latest_checkpoint_result:
|
| 725 |
+
latest_checkpoint_result.checkpoint = Checkpoint(
|
| 726 |
+
path=latest_checkpoint_result.checkpoint.path.replace(
|
| 727 |
+
original_storage.trial_fs_path, new_storage.trial_fs_path, 1
|
| 728 |
+
),
|
| 729 |
+
filesystem=new_storage.storage_filesystem,
|
| 730 |
+
)
|
| 731 |
+
|
| 732 |
+
self.storage = new_storage
|
| 733 |
+
self.invalidate_json_state()
|
| 734 |
+
|
| 735 |
+
@property
|
| 736 |
+
def num_failures(self):
|
| 737 |
+
return self.run_metadata.num_failures
|
| 738 |
+
|
| 739 |
+
@property
|
| 740 |
+
def num_failures_after_restore(self):
|
| 741 |
+
return self.run_metadata.num_failures_after_restore
|
| 742 |
+
|
| 743 |
+
@property
|
| 744 |
+
def error_file(self):
|
| 745 |
+
if not self.local_path or not self.run_metadata.error_filename:
|
| 746 |
+
return None
|
| 747 |
+
return Path(self.local_path, self.run_metadata.error_filename).as_posix()
|
| 748 |
+
|
| 749 |
+
@property
|
| 750 |
+
def pickled_error_file(self):
|
| 751 |
+
if not self.local_path or not self.run_metadata.pickled_error_filename:
|
| 752 |
+
return None
|
| 753 |
+
return Path(
|
| 754 |
+
self.local_path, self.run_metadata.pickled_error_filename
|
| 755 |
+
).as_posix()
|
| 756 |
+
|
| 757 |
+
def get_pickled_error(self) -> Optional[Exception]:
|
| 758 |
+
"""Returns the pickled error object if it exists in storage.
|
| 759 |
+
|
| 760 |
+
This is a pickled version of the latest error that the trial encountered.
|
| 761 |
+
"""
|
| 762 |
+
error_filename = self.run_metadata.pickled_error_filename
|
| 763 |
+
if error_filename is None:
|
| 764 |
+
return None
|
| 765 |
+
|
| 766 |
+
fs = self.storage.storage_filesystem
|
| 767 |
+
pickled_error_fs_path = Path(
|
| 768 |
+
self.storage.trial_fs_path, error_filename
|
| 769 |
+
).as_posix()
|
| 770 |
+
|
| 771 |
+
if _exists_at_fs_path(fs=fs, fs_path=pickled_error_fs_path):
|
| 772 |
+
with fs.open_input_stream(pickled_error_fs_path) as f:
|
| 773 |
+
return cloudpickle.loads(f.readall())
|
| 774 |
+
return None
|
| 775 |
+
|
| 776 |
+
def get_error(self) -> Optional[TuneError]:
|
| 777 |
+
"""Returns the error text file trace as a TuneError object
|
| 778 |
+
if it exists in storage.
|
| 779 |
+
|
| 780 |
+
This is a text trace of the latest error that the trial encountered,
|
| 781 |
+
which is used in the case that the error is not picklable.
|
| 782 |
+
"""
|
| 783 |
+
error_filename = self.run_metadata.error_filename
|
| 784 |
+
if error_filename is None:
|
| 785 |
+
return None
|
| 786 |
+
|
| 787 |
+
fs = self.storage.storage_filesystem
|
| 788 |
+
txt_error_fs_path = Path(self.storage.trial_fs_path, error_filename).as_posix()
|
| 789 |
+
|
| 790 |
+
if _exists_at_fs_path(fs=fs, fs_path=txt_error_fs_path):
|
| 791 |
+
with fs.open_input_stream(txt_error_fs_path) as f:
|
| 792 |
+
return f.readall().decode()
|
| 793 |
+
return None
|
| 794 |
+
|
| 795 |
+
def _handle_restore_error(self, exc: Exception):
|
| 796 |
+
# For Restoration errors, we only increment the restore failure count
|
| 797 |
+
# if the number of failures exceeds the restore retry limit.
|
| 798 |
+
if self.temporary_state.num_restore_failures >= int(
|
| 799 |
+
os.environ.get("TUNE_RESTORE_RETRY_NUM", 0)
|
| 800 |
+
):
|
| 801 |
+
self.run_metadata.num_failures += 1
|
| 802 |
+
else:
|
| 803 |
+
self.temporary_state.num_restore_failures += 1
|
| 804 |
+
|
| 805 |
+
def _handle_ray_actor_error(self, exc: RayActorError):
|
| 806 |
+
count_preemption_errors = bool(
|
| 807 |
+
int(os.environ.get(RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE, "0"))
|
| 808 |
+
)
|
| 809 |
+
if not exc.preempted or count_preemption_errors:
|
| 810 |
+
# Only count non-preempted actor errors as failures.
|
| 811 |
+
self.run_metadata.num_failures += 1
|
| 812 |
+
|
| 813 |
+
def _handle_ray_task_error(self, exc: RayTaskError):
|
| 814 |
+
cause = exc.as_instanceof_cause()
|
| 815 |
+
if isinstance(cause, RayActorError):
|
| 816 |
+
# Handle the RayActorError directly (ex: Ray Train worker actor errors)
|
| 817 |
+
return self._handle_ray_actor_error(cause)
|
| 818 |
+
|
| 819 |
+
# Increment failures for all user errors (which get raised as RayTaskError)
|
| 820 |
+
self.run_metadata.num_failures += 1
|
| 821 |
+
|
| 822 |
+
def handle_error(
|
| 823 |
+
self, exc: Optional[Union[TuneError, RayTaskError, RayActorError]] = None
|
| 824 |
+
):
|
| 825 |
+
if self.is_restoring:
|
| 826 |
+
self._handle_restore_error(exc)
|
| 827 |
+
elif isinstance(exc, RayActorError):
|
| 828 |
+
self._handle_ray_actor_error(exc)
|
| 829 |
+
elif isinstance(exc, RayTaskError):
|
| 830 |
+
self._handle_ray_task_error(exc)
|
| 831 |
+
else:
|
| 832 |
+
self.run_metadata.num_failures += 1
|
| 833 |
+
|
| 834 |
+
if self.local_path:
|
| 835 |
+
self.run_metadata.error_filename = EXPR_ERROR_FILE
|
| 836 |
+
if isinstance(exc, (RayTaskError, RayActorError)):
|
| 837 |
+
# Piping through the actual error to result grid.
|
| 838 |
+
self.run_metadata.pickled_error_filename = EXPR_ERROR_PICKLE_FILE
|
| 839 |
+
with open(self.pickled_error_file, "wb") as f:
|
| 840 |
+
cloudpickle.dump(exc, f)
|
| 841 |
+
with open(self.error_file, "a+") as f:
|
| 842 |
+
f.write(
|
| 843 |
+
"Failure # {} (occurred at {})\n".format(
|
| 844 |
+
self.run_metadata.num_failures, date_str()
|
| 845 |
+
)
|
| 846 |
+
)
|
| 847 |
+
f.write(str(exc) + "\n")
|
| 848 |
+
self.run_metadata.invalidate_cache()
|
| 849 |
+
|
| 850 |
+
def should_stop(self, result):
|
| 851 |
+
"""Whether the given result meets this trial's stopping criteria."""
|
| 852 |
+
if result.get(DONE):
|
| 853 |
+
return True
|
| 854 |
+
|
| 855 |
+
for criterion, stop_value in self.stopping_criterion.items():
|
| 856 |
+
if isinstance(criterion, dict):
|
| 857 |
+
raise ValueError(
|
| 858 |
+
"Stopping criteria is now flattened by default. "
|
| 859 |
+
"Use forward slashes to nest values `key1/key2/key3`."
|
| 860 |
+
)
|
| 861 |
+
elif criterion not in result:
|
| 862 |
+
if log_once("tune_trial_stop_criterion_not_found"):
|
| 863 |
+
logger.warning(
|
| 864 |
+
f"Stopping criterion '{criterion}' not found in result dict! "
|
| 865 |
+
f"Available keys are {list(result.keys())}. If '{criterion}' is"
|
| 866 |
+
" never reported, the run will continue until training is "
|
| 867 |
+
"finished."
|
| 868 |
+
)
|
| 869 |
+
elif result[criterion] >= stop_value:
|
| 870 |
+
return True
|
| 871 |
+
return False
|
| 872 |
+
|
| 873 |
+
def should_checkpoint(self):
|
| 874 |
+
"""Whether this trial is due for checkpointing."""
|
| 875 |
+
result = self.last_result or {}
|
| 876 |
+
if result.get(DONE) and self.checkpoint_at_end:
|
| 877 |
+
return True
|
| 878 |
+
return (
|
| 879 |
+
self.checkpoint_freq
|
| 880 |
+
and result.get(TRAINING_ITERATION, 0) % self.checkpoint_freq == 0
|
| 881 |
+
)
|
| 882 |
+
|
| 883 |
+
def has_checkpoint(self) -> bool:
|
| 884 |
+
return self.checkpoint is not None
|
| 885 |
+
|
| 886 |
+
def on_checkpoint(self, checkpoint_result: _TrainingResult):
|
| 887 |
+
"""Hook for handling checkpoints taken by the Trainable.
|
| 888 |
+
|
| 889 |
+
Args:
|
| 890 |
+
checkpoint: Checkpoint taken.
|
| 891 |
+
"""
|
| 892 |
+
self.run_metadata.checkpoint_manager.register_checkpoint(checkpoint_result)
|
| 893 |
+
# Update the checkpoint index to keep the checkpoint index in sync.
|
| 894 |
+
# This index will get restored when the trial is restored and will
|
| 895 |
+
# be passed to the Trainable as the starting checkpoint index.
|
| 896 |
+
self.storage._update_checkpoint_index(checkpoint_result.metrics)
|
| 897 |
+
|
| 898 |
+
self.invalidate_json_state()
|
| 899 |
+
self.run_metadata.invalidate_cache()
|
| 900 |
+
|
| 901 |
+
def on_restore(self):
|
| 902 |
+
"""Handles restoration completion."""
|
| 903 |
+
assert self.is_restoring
|
| 904 |
+
self.run_metadata.last_result = self.temporary_state.restoring_from.metrics
|
| 905 |
+
self.run_metadata.last_result.setdefault("config", self.config)
|
| 906 |
+
self.temporary_state.restoring_from = None
|
| 907 |
+
self.temporary_state.num_restore_failures = 0
|
| 908 |
+
|
| 909 |
+
def should_recover(self):
|
| 910 |
+
"""Returns whether the trial qualifies for retrying.
|
| 911 |
+
|
| 912 |
+
`num_failures` should represent the number of times the trial has
|
| 913 |
+
failed *up to the moment this method is called.* If we've failed
|
| 914 |
+
5 times and `max_failures=5`, then we should recover, since
|
| 915 |
+
we only pass the limit on the 6th failure.
|
| 916 |
+
|
| 917 |
+
Note this may return true even when there is no checkpoint, either because
|
| 918 |
+
`self.checkpoint_freq` is `0` or because the trial failed before
|
| 919 |
+
a checkpoint has been made.
|
| 920 |
+
"""
|
| 921 |
+
return (
|
| 922 |
+
self.run_metadata.num_failures <= self.max_failures or self.max_failures < 0
|
| 923 |
+
)
|
| 924 |
+
|
| 925 |
+
def update_last_result(self, result):
|
| 926 |
+
if self.experiment_tag:
|
| 927 |
+
result.update(experiment_tag=self.experiment_tag)
|
| 928 |
+
|
| 929 |
+
self.set_location(_Location(result.get(NODE_IP), result.get(PID)))
|
| 930 |
+
self.run_metadata.last_result = result
|
| 931 |
+
self.run_metadata.last_result_time = time.time()
|
| 932 |
+
|
| 933 |
+
metric_result = self.last_result.copy()
|
| 934 |
+
for remove_metric in DEBUG_METRICS:
|
| 935 |
+
metric_result.pop(remove_metric, None)
|
| 936 |
+
|
| 937 |
+
for metric, value in flatten_dict(metric_result).items():
|
| 938 |
+
if isinstance(value, Number):
|
| 939 |
+
self.run_metadata.update_metric(
|
| 940 |
+
metric, value, step=result.get("training_iteration")
|
| 941 |
+
)
|
| 942 |
+
|
| 943 |
+
def get_trainable_cls(self):
|
| 944 |
+
if self.stub:
|
| 945 |
+
return None
|
| 946 |
+
return get_trainable_cls(self.trainable_name)
|
| 947 |
+
|
| 948 |
+
def is_finished(self):
|
| 949 |
+
return self.status in [Trial.ERROR, Trial.TERMINATED]
|
| 950 |
+
|
| 951 |
+
@property
|
| 952 |
+
def is_restoring(self):
|
| 953 |
+
return self.temporary_state.restoring_from is not None
|
| 954 |
+
|
| 955 |
+
@property
|
| 956 |
+
def is_saving(self):
|
| 957 |
+
return self.temporary_state.saving_to is not None
|
| 958 |
+
|
| 959 |
+
def __repr__(self):
|
| 960 |
+
return self._trainable_name(include_trial_id=True)
|
| 961 |
+
|
| 962 |
+
def __str__(self):
|
| 963 |
+
return self._trainable_name(include_trial_id=True)
|
| 964 |
+
|
| 965 |
+
def _trainable_name(self, include_trial_id=False):
|
| 966 |
+
"""Combines ``env`` with ``trainable_name`` and ``trial_id``.
|
| 967 |
+
|
| 968 |
+
Can be overridden with a custom string creator.
|
| 969 |
+
"""
|
| 970 |
+
if self.custom_trial_name:
|
| 971 |
+
return self.custom_trial_name
|
| 972 |
+
|
| 973 |
+
if "env" in self.config:
|
| 974 |
+
env = self.config["env"]
|
| 975 |
+
if isinstance(env, type):
|
| 976 |
+
env = env.__name__
|
| 977 |
+
identifier = "{}_{}".format(self.trainable_name, env)
|
| 978 |
+
else:
|
| 979 |
+
identifier = self.trainable_name
|
| 980 |
+
if include_trial_id:
|
| 981 |
+
identifier += "_" + self.trial_id
|
| 982 |
+
return identifier.replace("/", "_")
|
| 983 |
+
|
| 984 |
+
def _generate_dirname(self):
|
| 985 |
+
if self.custom_dirname:
|
| 986 |
+
generated_dirname = self.custom_dirname
|
| 987 |
+
else:
|
| 988 |
+
MAX_LEN_IDENTIFIER = int(os.environ.get("TUNE_MAX_LEN_IDENTIFIER", "130"))
|
| 989 |
+
generated_dirname = f"{str(self)}_{self.experiment_tag}"
|
| 990 |
+
generated_dirname = generated_dirname[:MAX_LEN_IDENTIFIER]
|
| 991 |
+
generated_dirname += f"_{date_str()}"
|
| 992 |
+
# This is the file path used by rsync. ['/', '(', ')'] are not allowed.
|
| 993 |
+
return re.sub("[/()]", "_", generated_dirname)
|
| 994 |
+
|
| 995 |
+
def invalidate_json_state(self):
|
| 996 |
+
self._state_json = None
|
| 997 |
+
|
| 998 |
+
def get_json_state(self) -> Tuple[str, str]:
|
| 999 |
+
if self._state_json is None:
|
| 1000 |
+
state = self.__getstate__()
|
| 1001 |
+
state.pop("run_metadata", None)
|
| 1002 |
+
self._state_json = json.dumps(state, indent=2, cls=TuneFunctionEncoder)
|
| 1003 |
+
|
| 1004 |
+
runtime_metadata_json = self.run_metadata.get_json_state()
|
| 1005 |
+
|
| 1006 |
+
return self._state_json, runtime_metadata_json
|
| 1007 |
+
|
| 1008 |
+
@classmethod
|
| 1009 |
+
def from_json_state(cls, json_state: str, stub: bool = False) -> "Trial":
|
| 1010 |
+
state = json.loads(json_state, cls=TuneFunctionDecoder)
|
| 1011 |
+
|
| 1012 |
+
new_trial = Trial(
|
| 1013 |
+
state["trainable_name"],
|
| 1014 |
+
stub=stub,
|
| 1015 |
+
_setup_default_resource=False,
|
| 1016 |
+
)
|
| 1017 |
+
|
| 1018 |
+
new_trial.__setstate__(state)
|
| 1019 |
+
|
| 1020 |
+
return new_trial
|
| 1021 |
+
|
| 1022 |
+
def restore_run_metadata(self, run_metadata: str):
|
| 1023 |
+
self.run_metadata = _TrainingRunMetadata.from_json_state(run_metadata)
|
| 1024 |
+
|
| 1025 |
+
@classmethod
|
| 1026 |
+
def from_directory(
|
| 1027 |
+
cls, path: Union[str, os.PathLike], stub: bool = False
|
| 1028 |
+
) -> "Trial":
|
| 1029 |
+
metadata_path = Path(path, TRIAL_STATE_FILENAME)
|
| 1030 |
+
if not metadata_path.exists():
|
| 1031 |
+
raise FileNotFoundError(
|
| 1032 |
+
f"Can't restore trial from path: File `{metadata_path}` not found."
|
| 1033 |
+
)
|
| 1034 |
+
|
| 1035 |
+
json_state = metadata_path.read_text()
|
| 1036 |
+
return cls.from_json_state(json_state, stub=stub)
|
| 1037 |
+
|
| 1038 |
+
def __getstate__(self):
|
| 1039 |
+
"""Memento generator for Trial.
|
| 1040 |
+
|
| 1041 |
+
Sets RUNNING trials to PENDING.
|
| 1042 |
+
Note this can only occur if the trial holds a PERSISTENT checkpoint.
|
| 1043 |
+
"""
|
| 1044 |
+
state = self.__dict__.copy()
|
| 1045 |
+
|
| 1046 |
+
for key in self._nonjson_fields:
|
| 1047 |
+
state[key] = binary_to_hex(cloudpickle.dumps(state.get(key)))
|
| 1048 |
+
|
| 1049 |
+
state.pop("temporary_state", None)
|
| 1050 |
+
|
| 1051 |
+
state["_state_json"] = None
|
| 1052 |
+
state["_default_result_or_future"] = None
|
| 1053 |
+
|
| 1054 |
+
return state
|
| 1055 |
+
|
| 1056 |
+
def __setstate__(self, state):
|
| 1057 |
+
if state["status"] == Trial.RUNNING:
|
| 1058 |
+
state["status"] = Trial.PENDING
|
| 1059 |
+
for key in self._nonjson_fields:
|
| 1060 |
+
if key in state:
|
| 1061 |
+
state[key] = cloudpickle.loads(hex_to_binary(state[key]))
|
| 1062 |
+
|
| 1063 |
+
# Ensure that stub doesn't get overriden
|
| 1064 |
+
stub = state.pop("stub", True)
|
| 1065 |
+
self.__dict__.update(state)
|
| 1066 |
+
self.stub = stub or getattr(self, "stub", False)
|
| 1067 |
+
|
| 1068 |
+
if not self.stub:
|
| 1069 |
+
validate_trainable(self.trainable_name)
|
| 1070 |
+
|
| 1071 |
+
self.temporary_state = _TemporaryTrialState()
|
| 1072 |
+
|
| 1073 |
+
assert self.placement_group_factory
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/keras.cpython-311.pyc
ADDED
|
Binary file (1.64 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/lightgbm.cpython-311.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/pytorch_lightning.cpython-311.pyc
ADDED
|
Binary file (11.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/ray_train.cpython-311.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/integration/__pycache__/xgboost.cpython-311.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/tune/integration/keras.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_DEPRECATION_MESSAGE = (
|
| 2 |
+
"The `ray.tune.integration.keras` module is deprecated in favor of "
|
| 3 |
+
"`ray.train.tensorflow.keras.ReportCheckpointCallback`."
|
| 4 |
+
)
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class TuneReportCallback:
|
| 8 |
+
"""Deprecated.
|
| 9 |
+
Use :class:`ray.train.tensorflow.keras.ReportCheckpointCallback` instead."""
|
| 10 |
+
|
| 11 |
+
def __new__(cls, *args, **kwargs):
|
| 12 |
+
raise DeprecationWarning(_DEPRECATION_MESSAGE)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class _TuneCheckpointCallback:
|
| 16 |
+
"""Deprecated.
|
| 17 |
+
Use :class:`ray.train.tensorflow.keras.ReportCheckpointCallback` instead."""
|
| 18 |
+
|
| 19 |
+
def __new__(cls, *args, **kwargs):
|
| 20 |
+
raise DeprecationWarning(_DEPRECATION_MESSAGE)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TuneReportCheckpointCallback:
|
| 24 |
+
"""Deprecated.
|
| 25 |
+
Use :class:`ray.train.tensorflow.keras.ReportCheckpointCallback` instead."""
|
| 26 |
+
|
| 27 |
+
def __new__(cls, *args, **kwargs):
|
| 28 |
+
raise DeprecationWarning(_DEPRECATION_MESSAGE)
|