diff --git a/.venv/lib/python3.11/site-packages/ray/train/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1f8c1343d68b47aa75279c0c7047314cbf1df389 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/__init__.py @@ -0,0 +1,90 @@ +# Try import ray[train] core requirements (defined in setup.py) +# isort: off +try: + import fsspec # noqa: F401 + import pandas # noqa: F401 + import pyarrow # noqa: F401 + import requests # noqa: F401 +except ImportError as exc: + raise ImportError( + "Can't import ray.train as some dependencies are missing. " + 'Run `pip install "ray[train]"` to fix.' + ) from exc +# isort: on + + +from ray._private.usage import usage_lib +from ray.air.config import CheckpointConfig, FailureConfig, RunConfig, ScalingConfig +from ray.air.result import Result + +# Import this first so it can be used in other modules +from ray.train._checkpoint import Checkpoint +from ray.train._internal.data_config import DataConfig +from ray.train._internal.session import get_checkpoint, get_dataset_shard, report +from ray.train._internal.syncer import SyncConfig +from ray.train.backend import BackendConfig +from ray.train.constants import TRAIN_DATASET_KEY +from ray.train.context import get_context +from ray.train.trainer import TrainingIterator +from ray.train.v2._internal.constants import is_v2_enabled + +if is_v2_enabled(): + from ray.train.v2.api.callback import UserCallback # noqa: F811 + from ray.train.v2.api.config import ( # noqa: F811 + FailureConfig, + RunConfig, + ScalingConfig, + ) + from ray.train.v2.api.result import Result # noqa: F811 + from ray.train.v2.api.train_fn_utils import ( # noqa: F811 + get_checkpoint, + get_context, + get_dataset_shard, + report, + ) + + +usage_lib.record_library_usage("train") + +Checkpoint.__module__ = "ray.train" + +__all__ = [ + "get_checkpoint", + "get_context", + "get_dataset_shard", + "report", + "BackendConfig", + 
"Checkpoint", + "CheckpointConfig", + "DataConfig", + "FailureConfig", + "Result", + "RunConfig", + "ScalingConfig", + "SyncConfig", + "TrainingIterator", + "TRAIN_DATASET_KEY", +] + +get_checkpoint.__module__ = "ray.train" +get_context.__module__ = "ray.train" +get_dataset_shard.__module__ = "ray.train" +report.__module__ = "ray.train" +BackendConfig.__module__ = "ray.train" +Checkpoint.__module__ = "ray.train" +CheckpointConfig.__module__ = "ray.train" +DataConfig.__module__ = "ray.train" +FailureConfig.__module__ = "ray.train" +Result.__module__ = "ray.train" +RunConfig.__module__ = "ray.train" +ScalingConfig.__module__ = "ray.train" +SyncConfig.__module__ = "ray.train" +TrainingIterator.__module__ = "ray.train" + + +if is_v2_enabled(): + __all__.append("UserCallback") + UserCallback.__module__ = "ray.train" + + +# DO NOT ADD ANYTHING AFTER THIS LINE. diff --git a/.venv/lib/python3.11/site-packages/ray/train/_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5ee65be4f20fe17126b315726bcfb825e9c89e45 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/_checkpoint.py @@ -0,0 +1,424 @@ +import contextlib +import glob +import json +import logging +import os +import platform +import shutil +import tempfile +import traceback +import uuid +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Union + +import pyarrow.fs + +from ray.air._internal.filelock import TempFileLock +from ray.train._internal.storage import _download_from_fs_path, _exists_at_fs_path +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + +# The filename of the file that stores user metadata set on the checkpoint. +_METADATA_FILE_NAME = ".metadata.json" + +# The prefix of the temp checkpoint directory that `to_directory` downloads to +# on the local filesystem. 
+_CHECKPOINT_TEMP_DIR_PREFIX = "checkpoint_tmp_" + + +class _CheckpointMetaClass(type): + def __getattr__(self, item): + try: + return super().__getattribute__(item) + except AttributeError as exc: + if item in { + "from_dict", + "to_dict", + "from_bytes", + "to_bytes", + "get_internal_representation", + }: + raise _get_migration_error(item) from exc + elif item in { + "from_uri", + "to_uri", + "uri", + }: + raise _get_uri_error(item) from exc + elif item in {"get_preprocessor", "set_preprocessor"}: + raise _get_preprocessor_error(item) from exc + + raise exc + + +@PublicAPI(stability="beta") +class Checkpoint(metaclass=_CheckpointMetaClass): + """A reference to data persisted as a directory in local or remote storage. + + Access the checkpoint contents locally using ``checkpoint.to_directory()`` + or ``checkpoint.as_directory``. + + Attributes + ---------- + path: A path on the filesystem containing the checkpoint contents. + filesystem: PyArrow FileSystem that can be used to access data at the `path`. + + See Also + -------- + ray.train.report : Report a checkpoint during training (with Ray Train/Tune). + ray.train.get_checkpoint : Get the latest checkpoint during training + (for restoration). + + :ref:`train-checkpointing` + :ref:`persistent-storage-guide` + + Examples + -------- + + Creating a checkpoint using ``Checkpoint.from_directory``: + + >>> from ray.train import Checkpoint + >>> checkpoint = Checkpoint.from_directory("/tmp/example_checkpoint_dir") + >>> checkpoint.filesystem # doctest: +ELLIPSIS + >> checkpoint.path + '/tmp/example_checkpoint_dir' + + Creating a checkpoint from a remote URI: + + >>> checkpoint = Checkpoint("s3://bucket/path/to/checkpoint") + >>> checkpoint.filesystem # doctest: +ELLIPSIS + >> checkpoint.path + 'bucket/path/to/checkpoint' + + Creating a checkpoint with a custom filesystem: + + >>> checkpoint = Checkpoint( + ... path="bucket/path/to/checkpoint", + ... filesystem=pyarrow.fs.S3FileSystem(), + ... 
) + >>> checkpoint.filesystem # doctest: +ELLIPSIS + >> checkpoint.path + 'bucket/path/to/checkpoint' + + Accessing a checkpoint's contents: + + >>> import os # doctest: +SKIP + >>> with checkpoint.as_directory() as local_checkpoint_dir: # doctest: +SKIP + ... print(os.listdir(local_checkpoint_dir)) # doctest: +SKIP + ['model.pt', 'optimizer.pt', 'misc.pt'] + """ + + def __init__( + self, + path: Union[str, os.PathLike], + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + ): + """Construct a Checkpoint. + + Args: + path: A local path or remote URI containing the checkpoint data. + If a filesystem is provided, then this path must NOT be a URI. + It should be a path on the filesystem with the prefix already stripped. + filesystem: PyArrow FileSystem to use to access data at the path. + If not specified, this is inferred from the URI scheme. + """ + self.path = str(path) + self.filesystem = filesystem + + if path and not filesystem: + self.filesystem, self.path = pyarrow.fs.FileSystem.from_uri(path) + + # This random UUID is used to create a temporary directory name on the + # local filesystem, which will be used for downloading checkpoint data. + # This ensures that if multiple processes download the same checkpoint object + # only one process performs the actual download while the others wait. + # This prevents duplicated download efforts and data. + # NOTE: Calling `to_directory` from multiple `Checkpoint` objects + # that point to the same (fs, path) will still download the data multiple times. + # This only ensures a canonical temp directory name for a single `Checkpoint`. + self._uuid = uuid.uuid4() + + def __repr__(self): + return f"Checkpoint(filesystem={self.filesystem.type_name}, path={self.path})" + + def get_metadata(self) -> Dict[str, Any]: + """Return the metadata dict stored with the checkpoint. + + If no metadata is stored, an empty dict is returned. 
+ """ + metadata_path = Path(self.path, _METADATA_FILE_NAME).as_posix() + if not _exists_at_fs_path(self.filesystem, metadata_path): + return {} + + with self.filesystem.open_input_file(metadata_path) as f: + return json.loads(f.readall().decode("utf-8")) + + def set_metadata(self, metadata: Dict[str, Any]) -> None: + """Set the metadata stored with this checkpoint. + + This will overwrite any existing metadata stored with this checkpoint. + """ + metadata_path = Path(self.path, _METADATA_FILE_NAME).as_posix() + with self.filesystem.open_output_stream(metadata_path) as f: + f.write(json.dumps(metadata).encode("utf-8")) + + def update_metadata(self, metadata: Dict[str, Any]) -> None: + """Update the metadata stored with this checkpoint. + + This will update any existing metadata stored with this checkpoint. + """ + existing_metadata = self.get_metadata() + existing_metadata.update(metadata) + self.set_metadata(existing_metadata) + + @classmethod + def from_directory(cls, path: Union[str, os.PathLike]) -> "Checkpoint": + """Create checkpoint object from a local directory. + + Args: + path: Local directory containing checkpoint data. + + Returns: + A ray.train.Checkpoint object. + """ + return cls(path, filesystem=pyarrow.fs.LocalFileSystem()) + + def to_directory(self, path: Optional[Union[str, os.PathLike]] = None) -> str: + """Write checkpoint data to a local directory. + + *If multiple processes on the same node call this method simultaneously,* + only a single process will perform the download, while the others + wait for the download to finish. Once the download finishes, all processes + receive the same local directory to read from. + + Args: + path: Target directory to download data to. If not specified, + this method will use a temporary directory. + + Returns: + str: Directory containing checkpoint data. 
+ """ + user_provided_path = path is not None + local_path = ( + path if user_provided_path else self._get_temporary_checkpoint_dir() + ) + local_path = os.path.normpath(os.path.expanduser(str(local_path))) + os.makedirs(local_path, exist_ok=True) + + try: + # Timeout 0 means there will be only one attempt to acquire + # the file lock. If it cannot be acquired, throw a TimeoutError + with TempFileLock(local_path, timeout=0): + _download_from_fs_path( + fs=self.filesystem, fs_path=self.path, local_path=local_path + ) + except TimeoutError: + # if the directory is already locked, then wait but do not do anything. + with TempFileLock(local_path, timeout=-1): + pass + if not os.path.exists(local_path): + raise RuntimeError( + f"Checkpoint directory {local_path} does not exist, " + "even though it should have been created by " + "another process. Please raise an issue on GitHub: " + "https://github.com/ray-project/ray/issues" + ) + + return local_path + + @contextlib.contextmanager + def as_directory(self) -> Iterator[str]: + """Returns checkpoint contents in a local directory as a context. + + This function makes checkpoint data available as a directory while avoiding + unnecessary copies and left-over temporary data. + + *If the checkpoint points to a local directory*, this method just returns the + local directory path without making a copy, and nothing will be cleaned up + after exiting the context. + + *If the checkpoint points to a remote directory*, this method will download the + checkpoint to a local temporary directory and return the path + to the temporary directory. + + *If multiple processes on the same node call this method simultaneously,* + only a single process will perform the download, while the others + wait for the download to finish. Once the download finishes, all processes + receive the same local (temporary) directory to read from. + + Once all processes have finished working with the checkpoint, + the temporary directory is cleaned up. 
+ + Users should treat the returned checkpoint directory as read-only and avoid + changing any data within it, as it may be deleted when exiting the context. + + Example: + + .. testcode:: + :hide: + + from pathlib import Path + import tempfile + + from ray.train import Checkpoint + + temp_dir = tempfile.mkdtemp() + (Path(temp_dir) / "example.txt").write_text("example checkpoint data") + checkpoint = Checkpoint.from_directory(temp_dir) + + .. testcode:: + + with checkpoint.as_directory() as checkpoint_dir: + # Do some read-only processing of files within checkpoint_dir + pass + + # At this point, if a temporary directory was created, it will have + # been deleted. + + """ + if isinstance(self.filesystem, pyarrow.fs.LocalFileSystem): + yield self.path + else: + del_lock_path = _get_del_lock_path(self._get_temporary_checkpoint_dir()) + open(del_lock_path, "a").close() + + temp_dir = self.to_directory() + try: + yield temp_dir + finally: + # Always cleanup the del lock after we're done with the directory. + # This avoids leaving a lock file behind in the case of an exception + # in the user code. + try: + os.remove(del_lock_path) + except Exception: + logger.warning( + f"Could not remove {del_lock_path} deletion file lock. " + f"Traceback:\n{traceback.format_exc()}" + ) + + # If there are no more lock files, that means there are no more + # readers of this directory, and we can safely delete it. + # In the edge case (process crash before del lock file is removed), + # we do not remove the directory at all. + # Since it's in /tmp, this is not that big of a deal. + # check if any lock files are remaining + remaining_locks = _list_existing_del_locks(temp_dir) + if not remaining_locks: + try: + # Timeout 0 means there will be only one attempt to acquire + # the file lock. If it cannot be acquired, a TimeoutError + # will be thrown. 
+ with TempFileLock(temp_dir, timeout=0): + shutil.rmtree(temp_dir, ignore_errors=True) + except TimeoutError: + pass + + def _get_temporary_checkpoint_dir(self) -> str: + """Return the name for the temporary checkpoint dir that this checkpoint + will get downloaded to, if accessing via `to_directory` or `as_directory`. + """ + tmp_dir_path = tempfile.gettempdir() + checkpoint_dir_name = _CHECKPOINT_TEMP_DIR_PREFIX + self._uuid.hex + if platform.system() == "Windows": + # Max path on Windows is 260 chars, -1 for joining \ + # Also leave a little for the del lock + del_lock_name = _get_del_lock_path("") + checkpoint_dir_name = ( + _CHECKPOINT_TEMP_DIR_PREFIX + + self._uuid.hex[ + -259 + + len(_CHECKPOINT_TEMP_DIR_PREFIX) + + len(tmp_dir_path) + + len(del_lock_name) : + ] + ) + if not checkpoint_dir_name.startswith(_CHECKPOINT_TEMP_DIR_PREFIX): + raise RuntimeError( + "Couldn't create checkpoint directory due to length " + "constraints. Try specifying a shorter checkpoint path." + ) + return Path(tmp_dir_path, checkpoint_dir_name).as_posix() + + def __fspath__(self): + raise TypeError( + "You cannot use `Checkpoint` objects directly as paths. " + "Use `Checkpoint.to_directory()` or `Checkpoint.as_directory()` instead." + ) + + +def _get_del_lock_path(path: str, suffix: str = None) -> str: + """Get the path to the deletion lock file for a file/directory at `path`. + + Example: + + >>> _get_del_lock_path("/tmp/checkpoint_tmp") # doctest: +ELLIPSIS + '/tmp/checkpoint_tmp.del_lock_... + >>> _get_del_lock_path("/tmp/checkpoint_tmp/") # doctest: +ELLIPSIS + '/tmp/checkpoint_tmp.del_lock_... + >>> _get_del_lock_path("/tmp/checkpoint_tmp.txt") # doctest: +ELLIPSIS + '/tmp/checkpoint_tmp.txt.del_lock_... + + """ + suffix = suffix if suffix is not None else str(os.getpid()) + return f"{path.rstrip('/')}.del_lock_{suffix}" + + +def _list_existing_del_locks(path: str) -> List[str]: + """List all the deletion lock files for a file/directory at `path`. 
+ + For example, if 2 checkpoints are being read via `as_directory`, + then this should return a list of 2 deletion lock files. + """ + return list(glob.glob(f"{_get_del_lock_path(path, suffix='*')}")) + + +def _get_migration_error(name: str): + return AttributeError( + f"The new `ray.train.Checkpoint` class does not support `{name}()`. " + f"Instead, only directories are supported.\n\n" + f"Example to store a dictionary in a checkpoint:\n\n" + f"import os, tempfile\n" + f"import ray.cloudpickle as pickle\n" + f"from ray import train\n" + f"from ray.train import Checkpoint\n\n" + f"with tempfile.TemporaryDirectory() as checkpoint_dir:\n" + f" with open(os.path.join(checkpoint_dir, 'data.pkl'), 'wb') as fp:\n" + f" pickle.dump({{'data': 'value'}}, fp)\n\n" + f" checkpoint = Checkpoint.from_directory(checkpoint_dir)\n" + f" train.report(..., checkpoint=checkpoint)\n\n" + f"Example to load a dictionary from a checkpoint:\n\n" + f"if train.get_checkpoint():\n" + f" with train.get_checkpoint().as_directory() as checkpoint_dir:\n" + f" with open(os.path.join(checkpoint_dir, 'data.pkl'), 'rb') as fp:\n" + f" data = pickle.load(fp)" + ) + + +def _get_uri_error(name: str): + return AttributeError( + f"The new `ray.train.Checkpoint` class does not support `{name}()`. " + f"To create a checkpoint from remote storage, create a `Checkpoint` using its " + f"constructor instead of `from_directory`.\n" + f'Example: `Checkpoint(path="s3://a/b/c")`.\n' + f"Then, access the contents of the checkpoint with " + f"`checkpoint.as_directory()` / `checkpoint.to_directory()`.\n" + f"To upload data to remote storage, use e.g. `pyarrow.fs.FileSystem` " + f"or your client of choice." + ) + + +def _get_preprocessor_error(name: str): + return AttributeError( + f"The new `ray.train.Checkpoint` class does not support `{name}()`. 
" + f"To include preprocessor information in checkpoints, " + f"pass it as metadata in the Trainer constructor.\n" + f"Example: `TorchTrainer(..., metadata={{...}})`.\n" + f"After training, access it in the checkpoint via `checkpoint.get_metadata()`. " + f"See here: https://docs.ray.io/en/master/train/user-guides/" + f"data-loading-preprocessing.html#preprocessing-structured-data" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/backend.py b/.venv/lib/python3.11/site-packages/ray/train/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..b50f5867e7a75f47b36a0778463d7a113be1585d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/backend.py @@ -0,0 +1,59 @@ +import logging +from contextlib import nullcontext +from typing import TypeVar + +from ray.train._internal.utils import Singleton +from ray.train._internal.worker_group import WorkerGroup +from ray.util.annotations import DeveloperAPI +from ray.widgets import make_table_html_repr + +EncodedData = TypeVar("EncodedData") + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class BackendConfig: + """Parent class for configurations of training backend.""" + + @property + def backend_cls(self): + return Backend + + @property + def train_func_context(self): + return nullcontext + + def _repr_html_(self) -> str: + return make_table_html_repr(obj=self, title=type(self).__name__) + + +@DeveloperAPI +class Backend(metaclass=Singleton): + """Singleton for distributed communication backend. + + Attributes: + share_cuda_visible_devices: If True, each worker + process will have CUDA_VISIBLE_DEVICES set as the visible device + IDs of all workers on the same node for this training instance. + If False, each worker will have CUDA_VISIBLE_DEVICES set to the + device IDs allocated by Ray for that worker. 
+ """ + + share_cuda_visible_devices: bool = False + + def on_start(self, worker_group: WorkerGroup, backend_config: BackendConfig): + """Logic for starting this backend.""" + pass + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: BackendConfig): + """Logic for shutting down the backend.""" + pass + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: BackendConfig + ): + """Logic ran right before training is started. + + Session API is available at this point.""" + pass diff --git a/.venv/lib/python3.11/site-packages/ray/train/base_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/base_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..04cac51c1ee0633faf58f01a6c61adabbb25234c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/base_trainer.py @@ -0,0 +1,827 @@ +import abc +import copy +import inspect +import json +import logging +import os +import warnings +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union + +import pyarrow.fs + +import ray +import ray.cloudpickle as pickle +from ray._private.dict import deep_update +from ray.air._internal import usage as air_usage +from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated +from ray.air._internal.usage import AirEntrypoint +from ray.air.config import RunConfig, ScalingConfig +from ray.air.result import Result +from ray.train import Checkpoint +from ray.train._internal.session import get_session +from ray.train._internal.storage import ( + StorageContext, + _exists_at_fs_path, + get_fs_and_path, +) +from ray.util import PublicAPI +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + from ray.data import Dataset + from ray.tune import Trainable + +_TRAINER_PKL = "trainer.pkl" + +# A type representing either a ray.data.Dataset or a function that returns a +# ray.data.Dataset and accepts no arguments. 
+GenDataset = Union["Dataset", Callable[[], "Dataset"]] + + +logger = logging.getLogger(__name__) + +PREPROCESSOR_DEPRECATION_MESSAGE = ( + "The `preprocessor` argument to Trainers is deprecated as of Ray 2.7. " + "Instead, use the Preprocessor `fit` and `transform` APIs directly on the Ray " + "Dataset. For any state that needs to be saved to the trained checkpoint, pass it " + "in using the `metadata` argument of the `Trainer`. " + "For a full example, see " + "https://docs.ray.io/en/master/train/user-guides/data-loading-preprocessing.html#preprocessing-structured-data " # noqa:E501 +) + + +@PublicAPI(stability="beta") +class TrainingFailedError(RuntimeError): + """An error indicating that training has failed.""" + + _RESTORE_MSG = ( + "The Ray Train run failed. Please inspect the previous error messages for a " + "cause. After fixing the issue (assuming that the error is not caused by " + "your own application logic, but rather an error such as OOM), you can restart " + "the run from scratch or continue this run.\n" + "To continue this run, you can use: " + '`trainer = {trainer_cls_name}.restore("{path}")`.' + ) + + _FAILURE_CONFIG_MSG = ( + "To start a new run that will retry on training failures, set " + "`train.RunConfig(failure_config=train.FailureConfig(max_failures))` " + "in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` " + "for unlimited retries." + ) + + +def _train_coordinator_fn( + config: dict, trainer_cls: Type["BaseTrainer"], metadata: dict +): + """This is the function that defines the logic of the Ray Train coordinator. + This is responsible for setting up a remote instance of the `trainer_cls` + (a different instance than the one calling `trainer.fit` on the driver!) + and running the training loop. + """ + assert metadata is not None, metadata + # Propagate user metadata from the Trainer constructor. + get_session().metadata = metadata + + # config already contains merged values. 
+ # Instantiate new Trainer in Trainable. + trainer = trainer_cls(**config) + + # Get the checkpoint from Tune and pass it to workers later on. + checkpoint = ray.train.get_checkpoint() + if checkpoint: + # Set `starting_checkpoint` for auto-recovery fault-tolerance + # as well as manual restoration. + trainer.starting_checkpoint = checkpoint + # else: Train will restore from the user-provided + # `resume_from_checkpoint` == `starting_checkpoint`. + + # Evaluate datasets if they are wrapped in a factory. + trainer.datasets = { + k: d() if callable(d) else d for k, d in trainer.datasets.items() + } + + trainer.setup() + trainer.training_loop() + + +@DeveloperAPI +class BaseTrainer(abc.ABC): + """Defines interface for distributed training on Ray. + + Note: The base ``BaseTrainer`` class cannot be instantiated directly. Only + one of its subclasses can be used. + + Note to developers: If a new trainer is added, please update + `air/_internal/usage.py`. + + **How does a trainer work?** + + - First, initialize the Trainer. The initialization runs locally, + so heavyweight setup should not be done in ``__init__``. + - Then, when you call ``trainer.fit()``, the Trainer is serialized + and copied to a remote Ray actor. The following methods are then + called in sequence on the remote actor. + - ``trainer.setup()``: Any heavyweight Trainer setup should be + specified here. + - ``trainer.training_loop()``: Executes the main training logic. + - Calling ``trainer.fit()`` will return a ``ray.result.Result`` + object where you can access metrics from your training run, as well + as any checkpoints that may have been saved. + + **How do I create a new Trainer?** + + Subclass ``ray.train.trainer.BaseTrainer``, and override the ``training_loop`` + method, and optionally ``setup``. + + .. 
testcode:: + + import torch + + from ray.train.trainer import BaseTrainer + from ray import train, tune + + + class MyPytorchTrainer(BaseTrainer): + def setup(self): + self.model = torch.nn.Linear(1, 1) + self.optimizer = torch.optim.SGD( + self.model.parameters(), lr=0.1) + + def training_loop(self): + # You can access any Trainer attributes directly in this method. + # self.datasets["train"] has already been + dataset = self.datasets["train"] + + torch_ds = dataset.iter_torch_batches(dtypes=torch.float) + loss_fn = torch.nn.MSELoss() + + for epoch_idx in range(10): + loss = 0 + num_batches = 0 + torch_ds = dataset.iter_torch_batches( + dtypes=torch.float, batch_size=2 + ) + for batch in torch_ds: + X = torch.unsqueeze(batch["x"], 1) + y = torch.unsqueeze(batch["y"], 1) + # Compute prediction error + pred = self.model(X) + batch_loss = loss_fn(pred, y) + + # Backpropagation + self.optimizer.zero_grad() + batch_loss.backward() + self.optimizer.step() + + loss += batch_loss.item() + num_batches += 1 + loss /= num_batches + + # Use Tune functions to report intermediate + # results. + train.report({"loss": loss, "epoch": epoch_idx}) + + + # Initialize the Trainer, and call Trainer.fit() + import ray + train_dataset = ray.data.from_items( + [{"x": i, "y": i} for i in range(10)]) + my_trainer = MyPytorchTrainer(datasets={"train": train_dataset}) + result = my_trainer.fit() + + .. testoutput:: + :hide: + + ... + + Args: + scaling_config: Configuration for how to scale training. + run_config: Configuration for the execution of the training run. + datasets: Any Datasets to use for training. Use the key "train" + to denote which dataset is the training dataset. + metadata: Dict that should be made available via + `train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + resume_from_checkpoint: A checkpoint to resume training from. 
+ """ + + _scaling_config_allowed_keys: List[str] = [ + "trainer_resources", + ] + _handles_checkpoint_freq: bool = False + _handles_checkpoint_at_end: bool = False + + # fields to propagate to Tuner param_space. + # See `BaseTrainer._extract_fields_for_tuner_param_space` for more details. + _fields_for_tuner_param_space = [] + + def __init__( + self, + *, + scaling_config: Optional[ScalingConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + self.scaling_config = ( + scaling_config if scaling_config is not None else ScalingConfig() + ) + self.run_config = ( + copy.copy(run_config) if run_config is not None else RunConfig() + ) + self.metadata = metadata + self.datasets = datasets if datasets is not None else {} + self.starting_checkpoint = resume_from_checkpoint + + # These attributes should only be set through `BaseTrainer.restore` + self._restore_path = None + self._restore_storage_filesystem = None + + self._validate_attributes() + + air_usage.tag_air_trainer(self) + + @PublicAPI(stability="alpha") + @classmethod + def restore( + cls: Type["BaseTrainer"], + path: Union[str, os.PathLike], + storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + scaling_config: Optional[ScalingConfig] = None, + **kwargs, + ) -> "BaseTrainer": + """Restores a Train experiment from a previously interrupted/failed run. + + Restore should be used for experiment-level fault tolerance in the event + that the head node crashes (e.g., OOM or some other runtime error) or the + entire cluster goes down (e.g., network error affecting all nodes). + + A run that has already completed successfully will not be resumed from this API. 
+ To continue training from a successful run, launch a new run with the + ``Trainer(resume_from_checkpoint)`` API instead, passing in a + checkpoint from the previous run to start with. + + .. note:: + + Restoring an experiment from a path that's pointing to a *different* + location than the original experiment path is supported. However, Ray Train + assumes that the full experiment directory is available + (including checkpoints) so that it's possible to resume trials from their + latest state. + + For example, if the original experiment path was run locally, then the + results are uploaded to cloud storage, Ray Train expects the full contents + to be available in cloud storage if attempting to resume + via ``Trainer.restore("s3://...")``. The restored run will + continue writing results to the same cloud storage location. + + The following example can be paired with implementing job retry using + :ref:`Ray Jobs ` to produce a Train experiment that will + attempt to resume on both experiment-level and trial-level failures: + + .. testcode:: + + import os + import ray + from ray import train + from ray.train.trainer import BaseTrainer + + experiment_name = "unique_experiment_name" + storage_path = os.path.expanduser("~/ray_results") + experiment_dir = os.path.join(storage_path, experiment_name) + + # Define some dummy inputs for demonstration purposes + datasets = {"train": ray.data.from_items([{"a": i} for i in range(10)])} + + class CustomTrainer(BaseTrainer): + def training_loop(self): + pass + + if CustomTrainer.can_restore(experiment_dir): + trainer = CustomTrainer.restore( + experiment_dir, datasets=datasets + ) + else: + trainer = CustomTrainer( + datasets=datasets, + run_config=train.RunConfig( + name=experiment_name, + storage_path=storage_path, + # Tip: You can also enable retries on failure for + # worker-level fault tolerance + failure_config=train.FailureConfig(max_failures=3), + ), + ) + + result = trainer.fit() + + .. testoutput:: + :hide: + + ... 
+ + Args: + path: The path to the experiment directory of the training run to restore. + This can be a local path or a remote URI if the experiment was + uploaded to the cloud. + storage_filesystem: Custom ``pyarrow.fs.FileSystem`` + corresponding to the ``path``. This may be necessary if the original + experiment passed in a custom filesystem. + datasets: Re-specified datasets used in the original training run. + This must include all the datasets that were passed in the + original trainer constructor. + scaling_config: Optionally re-specified scaling config. This can be + modified to be different from the original spec. + **kwargs: Other optionally re-specified arguments, passed in by subclasses. + + Raises: + ValueError: If all datasets were not re-supplied on restore. + + Returns: + BaseTrainer: A restored instance of the class that is calling this method. + """ + if not cls.can_restore(path, storage_filesystem): + raise ValueError( + f"Invalid restore path: {path}. Make sure that this path exists and " + "is the experiment directory that results from a call to " + "`trainer.fit()`." + ) + fs, fs_path = get_fs_and_path(path, storage_filesystem) + trainer_pkl_path = Path(fs_path, _TRAINER_PKL).as_posix() + with fs.open_input_file(trainer_pkl_path) as f: + trainer_cls, param_dict = pickle.loads(f.readall()) + + if trainer_cls is not cls: + warnings.warn( + f"Invalid trainer type. You are attempting to restore a trainer of type" + f" {trainer_cls} with `{cls.__name__}.restore`, " + "which will most likely fail. " + f"Use `{trainer_cls.__name__}.restore` instead." + ) + + original_datasets = param_dict.pop("datasets", {}) + if original_datasets and not datasets: + raise ValueError( + "The following datasets need to be provided again on restore: " + f"{list(original_datasets.keys())}\n" + f"Use {cls.__name__}.restore(..., datasets=datasets) " + "with the datasets that were provided to the original trainer." 
+ ) + datasets = datasets or {} + if set(original_datasets) != set(datasets): + raise ValueError( + "The provided datasets don't match the original dataset keys.\n" + f" Expected datasets for the keys: {list(original_datasets.keys())}\n" + f" Actual datasets provided: {list(datasets.keys())}" + ) + param_dict["datasets"] = datasets + + if scaling_config: + param_dict["scaling_config"] = scaling_config + + for param_name, val in kwargs.items(): + # Overwrite the old value if something is passed into restore + if val is not None: + param_dict[param_name] = val + + try: + trainer = cls(**param_dict) + except Exception as e: + raise ValueError( + "Trainer restoration failed (see above for the stack trace). " + "Make sure that you use the right trainer class to restore: " + f"`{cls.__name__}.restore`\n" + ) from e + trainer._restore_path = path + trainer._restore_storage_filesystem = storage_filesystem + return trainer + + @PublicAPI(stability="alpha") + @classmethod + def can_restore( + cls: Type["BaseTrainer"], + path: Union[str, os.PathLike], + storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, + ) -> bool: + """Checks whether a given directory contains a restorable Train experiment. + + Args: + path: The path to the experiment directory of the Train experiment. + This can be either a local directory (e.g., ~/ray_results/exp_name) + or a remote URI (e.g., s3://bucket/exp_name). + + Returns: + bool: Whether this path exists and contains the trainer state to resume from + """ + fs, fs_path = get_fs_and_path(path, storage_filesystem) + trainer_pkl_path = Path(fs_path, _TRAINER_PKL).as_posix() + return _exists_at_fs_path(fs, trainer_pkl_path) + + def __repr__(self): + # A dictionary that maps parameters to their default values. 
+ default_values: Dict[str, Any] = { + "scaling_config": ScalingConfig(), + "run_config": RunConfig(), + "datasets": {}, + "starting_checkpoint": None, + } + + non_default_arguments = [] + for parameter, default_value in default_values.items(): + value = getattr(self, parameter) + if value != default_value: + non_default_arguments.append(f"{parameter}={value!r}") + + if non_default_arguments: + return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>" + + return f"<{self.__class__.__name__}>" + + def __new__(cls, *args, **kwargs): + # Store the init args as attributes so this can be merged with Tune hparams. + trainer = super(BaseTrainer, cls).__new__(cls) + parameters = inspect.signature(cls.__init__).parameters + parameters = list(parameters.keys()) + # Remove self. + parameters = parameters[1:] + arg_dict = dict(zip(parameters, args)) + trainer._param_dict = {**arg_dict, **kwargs} + return trainer + + def _validate_attributes(self): + """Called on __init()__ to validate trainer attributes.""" + # Run config + if not isinstance(self.run_config, RunConfig): + raise ValueError( + f"`run_config` should be an instance of `ray.train.RunConfig`, " + f"found {type(self.run_config)} with value `{self.run_config}`." + ) + # Scaling config + if not isinstance(self.scaling_config, ScalingConfig): + raise ValueError( + "`scaling_config` should be an instance of `ScalingConfig`, " + f"found {type(self.scaling_config)} with value `{self.scaling_config}`." + ) + # Datasets + if not isinstance(self.datasets, dict): + raise ValueError( + f"`datasets` should be a dict mapping from a string to " + f"`ray.data.Dataset` objects, " + f"found {type(self.datasets)} with value `{self.datasets}`." + ) + else: + for key, dataset in self.datasets.items(): + if not isinstance(dataset, ray.data.Dataset) and not callable(dataset): + raise ValueError( + f"The Dataset under '{key}' key is not a " + "`ray.data.Dataset`. " + f"Received {dataset} instead." + ) + # Metadata. 
+ self.metadata = self.metadata or {} + if not isinstance(self.metadata, dict): + raise TypeError( + f"The provided metadata must be a dict, was {type(self.metadata)}." + ) + try: + self.metadata = json.loads(json.dumps(self.metadata)) + except Exception as e: + raise ValueError( + "The provided metadata must be JSON-serializable: " + f"{self.metadata}: {e}" + ) + + if self.starting_checkpoint is not None and not isinstance( + self.starting_checkpoint, Checkpoint + ): + raise ValueError( + f"`resume_from_checkpoint` should be an instance of " + f"`ray.train.Checkpoint`, found {type(self.starting_checkpoint)} " + f"with value `{self.starting_checkpoint}`." + ) + + @classmethod + def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig: + """Returns scaling config dataclass after validating updated keys.""" + ensure_only_allowed_dataclass_keys_updated( + dataclass=scaling_config, + allowed_keys=cls._scaling_config_allowed_keys, + ) + return scaling_config + + def setup(self) -> None: + """Called during fit() to perform initial setup on the Trainer. + + .. note:: This method is run on a remote process. + + This method will not be called on the driver, so any expensive setup + operations should be placed here and not in ``__init__``. + + This method is called prior to ``preprocess_datasets`` and + ``training_loop``. + """ + pass + + def preprocess_datasets(self) -> None: + """Deprecated.""" + raise DeprecationWarning( + "`preprocess_datasets` is no longer used, since preprocessors " + f"are no longer accepted by Trainers.\n{PREPROCESSOR_DEPRECATION_MESSAGE}" + ) + + @abc.abstractmethod + def training_loop(self) -> None: + """Loop called by fit() to run training and report results to Tune. + + .. note:: This method runs on a remote process. + + ``self.datasets`` have already been evaluated if they were wrapped in a factory. 
+ + You can use the :ref:`Ray Train utilities ` + (:func:`train.report() ` and + :func:`train.get_checkpoint() `) inside + this training loop. + + Example: + + .. testcode:: + + from ray.train.trainer import BaseTrainer + from ray import train + + class MyTrainer(BaseTrainer): + def training_loop(self): + for epoch_idx in range(5): + ... + train.report({"epoch": epoch_idx}) + + """ + raise NotImplementedError + + @PublicAPI(stability="beta") + def fit(self) -> Result: + """Runs training. + + Returns: + A Result object containing the training result. + + Raises: + TrainingFailedError: If any failures during the execution + of ``self.as_trainable()``, or during the Tune execution loop. + """ + from ray.tune import ResumeConfig, TuneError + from ray.tune.tuner import Tuner + + trainable = self.as_trainable() + param_space = self._extract_fields_for_tuner_param_space() + + self.run_config.name = ( + self.run_config.name or StorageContext.get_experiment_dir_name(trainable) + ) + # The storage context here is only used to access the resolved + # storage fs and experiment path, in order to avoid duplicating that logic. + # This is NOT the storage context object that gets passed to remote workers. 
+ storage = StorageContext( + storage_path=self.run_config.storage_path, + experiment_dir_name=self.run_config.name, + storage_filesystem=self.run_config.storage_filesystem, + ) + + if self._restore_path: + tuner = Tuner.restore( + path=self._restore_path, + trainable=trainable, + param_space=param_space, + _resume_config=ResumeConfig( + finished=ResumeConfig.ResumeType.RESUME, + unfinished=ResumeConfig.ResumeType.RESUME, + errored=ResumeConfig.ResumeType.RESUME, + ), + storage_filesystem=self._restore_storage_filesystem, + ) + else: + tuner = Tuner( + trainable=trainable, + param_space=param_space, + run_config=self.run_config, + _entrypoint=AirEntrypoint.TRAINER, + ) + + self._save(storage.storage_filesystem, storage.experiment_fs_path) + + restore_msg = TrainingFailedError._RESTORE_MSG.format( + trainer_cls_name=self.__class__.__name__, + path=str(storage.experiment_fs_path), + ) + + try: + result_grid = tuner.fit() + except TuneError as e: + # Catch any `TuneError`s raised by the `Tuner.fit` call. + # Unwrap the `TuneError` if needed. + parent_error = e.__cause__ or e + + # Raise it to the user as a `TrainingFailedError` with a message to restore. + raise TrainingFailedError(restore_msg) from parent_error + # Other exceptions get passed through directly (ex: on `fail_fast='raise'`) + + assert len(result_grid) == 1 + result = result_grid[0] + if result.error: + # Raise trainable errors to the user with a message to restore + # or configure `FailureConfig` in a new run. + raise TrainingFailedError( + "\n".join([restore_msg, TrainingFailedError._FAILURE_CONFIG_MSG]) + ) from result.error + return result + + def _save(self, fs: pyarrow.fs.FileSystem, experiment_path: str): + """Saves the current trainer's class along with the `param_dict` of + parameters passed to this trainer's constructor. + + This is used to recreate the trainer on restore. 
+ Unless a parameter is re-specified during restoration (only a subset + of parameters can be passed in again), that parameter will be loaded + from the saved copy. + + Datasets should not be saved as part of the state. Instead, we save the + keys and replace the dataset values with dummy functions that will + raise an error if invoked. The error only serves as a guardrail for + misuse (e.g., manually unpickling and constructing the Trainer again) + and is not typically surfaced, since datasets must be re-specified + upon restoration. + """ + param_dict = self._param_dict.copy() + datasets = param_dict.pop("datasets", {}) + + def raise_fn(): + raise RuntimeError + + if datasets: + param_dict["datasets"] = { + dataset_name: raise_fn for dataset_name in datasets + } + + cls_and_param_dict = (self.__class__, param_dict) + + fs.create_dir(experiment_path) + with fs.open_output_stream(Path(experiment_path, _TRAINER_PKL).as_posix()) as f: + f.write(pickle.dumps(cls_and_param_dict)) + + def _extract_fields_for_tuner_param_space(self) -> Dict: + """Extracts fields to be included in `Tuner.param_space`. + + This is needed to leverage the full logging/integration offerings from Tune. + For example, `param_space` is logged automatically to wandb integration. + + Currently only done for `train_loop_config`. + + Returns: + A dictionary that should be passed to Tuner.param_space. + """ + result = {} + for key in self._fields_for_tuner_param_space: + if key in self._param_dict.keys(): + result[key] = copy.deepcopy(self._param_dict[key]) + return result + + def _generate_trainable_cls(self) -> Type["Trainable"]: + """Generates the base Trainable class. + + Returns: + A Trainable class to use for training. 
+ """ + + from ray.tune.execution.placement_groups import PlacementGroupFactory + from ray.tune.trainable import wrap_function + + trainer_cls = self.__class__ + scaling_config = self.scaling_config + metadata = self.metadata + + train_coordinator_fn = partial( + _train_coordinator_fn, trainer_cls=trainer_cls, metadata=metadata + ) + # Change the name of the training function to match the name of the Trainer + # class. This will mean the Tune trial name will match the name of Trainer on + # stdout messages and the results directory. + train_coordinator_fn.__name__ = trainer_cls.__name__ + + trainable_cls = wrap_function(train_coordinator_fn) + has_base_dataset = bool(self.datasets) + if has_base_dataset: + from ray.data.context import DataContext + + dataset_context = DataContext.get_current() + else: + dataset_context = None + + class TrainTrainable(trainable_cls): + """Adds default resources to the Trainable.""" + + _handles_checkpoint_freq = trainer_cls._handles_checkpoint_freq + _handles_checkpoint_at_end = trainer_cls._handles_checkpoint_at_end + + @classmethod + def has_base_dataset(cls) -> bool: + """Whether a dataset is provided through the Trainer.""" + return has_base_dataset + + @classmethod + def base_scaling_config(cls) -> ScalingConfig: + """Returns the unchanged scaling config provided through the Trainer.""" + return scaling_config + + def setup(self, config, **kwargs): + base_config = dict(kwargs) + # Merge Tuner param space hyperparameters in `config` into the + # base config passed to the Trainer constructor, which is `base_config`. + # `base_config` is pulled from the object store from the usage of + # tune.with_parameters in `BaseTrainer.as_trainable`. + + # run_config is not a tunable hyperparameter so it does not need to be + # merged. 
+ run_config = base_config.pop("run_config", None) + self._merged_config = deep_update( + base_config, self.config, new_keys_allowed=True + ) + self._merged_config["run_config"] = run_config + merged_scaling_config = self._merged_config.get( + "scaling_config", ScalingConfig() + ) + if isinstance(merged_scaling_config, dict): + merged_scaling_config = ScalingConfig(**merged_scaling_config) + self._merged_config[ + "scaling_config" + ] = self._reconcile_scaling_config_with_trial_resources( + merged_scaling_config + ) + if self.has_base_dataset(): + # Set the DataContext on the Trainer actor to the DataContext + # specified on the driver. + DataContext._set_current(dataset_context) + super(TrainTrainable, self).setup(config) + + def _reconcile_scaling_config_with_trial_resources( + self, scaling_config: ScalingConfig + ) -> ScalingConfig: + """ + ResourceChangingScheduler workaround. + + Ensures that the scaling config matches trial resources. + + This should be replaced with RCS returning a ScalingConfig + in the future. + """ + + trial_resources = self.trial_resources + # This will be false if the resources are default + if not isinstance(trial_resources, PlacementGroupFactory): + return scaling_config + + # Ignore ResourceChangingScheduler workaround when resource bundles + # are unchanged + if self.trial_resources == scaling_config.as_placement_group_factory(): + return scaling_config + + trainer_cls._validate_scaling_config(scaling_config) + + return ScalingConfig.from_placement_group_factory(trial_resources) + + def _trainable_func(self, config): + # We ignore the config passed by Tune and instead use the merged + # config which includes the initial Trainer args. + super()._trainable_func(self._merged_config) + + @classmethod + def default_resource_request(cls, config): + # `config["scaling_config"] is a dataclass when passed via the + # `scaling_config` argument in `Trainer` and is a dict when passed + # via the `scaling_config` key of `param_spec`. 
+ + # Conversion logic must be duplicated in `TrainTrainable.__init__` + # because this is a class method. + updated_scaling_config = config.get("scaling_config", scaling_config) + if isinstance(updated_scaling_config, dict): + updated_scaling_config = ScalingConfig(**updated_scaling_config) + validated_scaling_config = trainer_cls._validate_scaling_config( + updated_scaling_config + ) + return validated_scaling_config.as_placement_group_factory() + + return TrainTrainable + + def as_trainable(self) -> Type["Trainable"]: + """Converts self to a ``tune.Trainable`` class.""" + from ray import tune + + base_config = self._param_dict + trainable_cls = self._generate_trainable_cls() + + # Wrap with `tune.with_parameters` to handle very large values in base_config + return tune.with_parameters(trainable_cls, **base_config) diff --git a/.venv/lib/python3.11/site-packages/ray/train/constants.py b/.venv/lib/python3.11/site-packages/ray/train/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..4e42e30f4370a729a161a1485ba3a8811320d8af --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/constants.py @@ -0,0 +1,118 @@ +from pathlib import Path + +import ray +from ray._private.ray_constants import env_bool +from ray.air.constants import ( # noqa: F401 + COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, + EVALUATION_DATASET_KEY, + MODEL_KEY, + PREPROCESSOR_KEY, + TRAIN_DATASET_KEY, +) + + +def _get_ray_train_session_dir() -> str: + assert ray.is_initialized(), "Ray must be initialized to get the session dir." + return Path( + ray._private.worker._global_node.get_session_dir_path(), "artifacts" + ).as_posix() + + +DEFAULT_STORAGE_PATH = Path("~/ray_results").expanduser().as_posix() + +# Autofilled ray.train.report() metrics. Keys should be consistent with Tune. 
+CHECKPOINT_DIR_NAME = "checkpoint_dir_name"
+TIME_TOTAL_S = "_time_total_s"
+WORKER_HOSTNAME = "_hostname"
+WORKER_NODE_IP = "_node_ip"
+WORKER_PID = "_pid"
+
+# These detailed keys are only reported when the
+# ENABLE_DETAILED_AUTOFILLED_METRICS_ENV env var is set to a non-zero value.
+DETAILED_AUTOFILLED_KEYS = {WORKER_HOSTNAME, WORKER_NODE_IP, WORKER_PID, TIME_TOTAL_S}
+
+# Default filename for JSON logger
+RESULT_FILE_JSON = "results.json"
+
+# The name of the subdirectory inside the trainer run_dir to store checkpoints.
+TRAIN_CHECKPOINT_SUBDIR = "checkpoints"
+
+# The key to use to specify the checkpoint id for Tune.
+# This needs to be added to the checkpoint dictionary so if the Tune trial
+# is restarted, the checkpoint_id can continue to increment.
+TUNE_CHECKPOINT_ID = "_current_checkpoint_id"
+
+# Deprecated configs can use this value to detect if the user has set it.
+_DEPRECATED_VALUE = "DEPRECATED"
+
+# ==================================================
+# Environment Variables
+# ==================================================
+
+ENABLE_DETAILED_AUTOFILLED_METRICS_ENV = (
+    "TRAIN_RESULT_ENABLE_DETAILED_AUTOFILLED_METRICS"
+)
+
+# Integer value which if set will override the value of
+# Backend.share_cuda_visible_devices. 1 for True, 0 for False.
+ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_CUDA_VISIBLE_DEVICES"
+
+# Integer value controlling whether ROCR accelerator visible devices are
+# shared across workers. 1 to share (default), 0 to disable sharing.
+ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ROCR_VISIBLE_DEVICES"
+
+# Integer value controlling whether neuron-core accelerator visible cores are
+# shared across workers. 1 to share (default), 0 to disable sharing.
+ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV = (
+    "TRAIN_ENABLE_SHARE_NEURON_CORES_ACCELERATOR"
+)
+
+# Integer value controlling whether npu visible devices are shared across
+# workers. 1 to share (default), 0 to disable sharing.
+ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV = "TRAIN_ENABLE_SHARE_ASCEND_RT_VISIBLE_DEVICES"
+
+# Integer value which indicates the number of seconds to wait when creating
+# the worker placement group before timing out.
+TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV = "TRAIN_PLACEMENT_GROUP_TIMEOUT_S"
+
+# Integer value which if set will change the placement group strategy from
+# PACK to SPREAD. 1 for True, 0 for False.
+TRAIN_ENABLE_WORKER_SPREAD_ENV = "TRAIN_ENABLE_WORKER_SPREAD"
+
+# Set this to 0 to disable changing the working directory of each Tune Trainable
+# or Train worker to the trial directory. Defaults to 1.
+RAY_CHDIR_TO_TRIAL_DIR = "RAY_CHDIR_TO_TRIAL_DIR"
+
+# Set this to 1 to count preemption errors toward `FailureConfig(max_failures)`.
+# Defaults to 0, which always retries on node preemption failures.
+RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE = "RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE"
+
+# Set this to 1 to start a StateActor and collect information about Train Runs.
+# Defaults to 0.
+RAY_TRAIN_ENABLE_STATE_TRACKING = "RAY_TRAIN_ENABLE_STATE_TRACKING"
+
+# Set this to 1 to enable deprecation warnings for V2 migration.
+ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR = "RAY_TRAIN_ENABLE_V2_MIGRATION_WARNINGS"
+
+
+def _v2_migration_warnings_enabled() -> bool:
+    return env_bool(ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR, False)  # off by default
+
+
+# NOTE: When adding a new environment variable, please track it in this list. (Review: ENABLE_SHARE_ROCR_VISIBLE_DEVICES_ENV, ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, and ENABLE_V2_MIGRATION_WARNINGS_ENV_VAR appear to be missing from TRAIN_ENV_VARS below -- TODO confirm and add.)
+TRAIN_ENV_VARS = { + ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, + ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV, + ENABLE_SHARE_NEURON_CORES_ACCELERATOR_ENV, + TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, + TRAIN_ENABLE_WORKER_SPREAD_ENV, + RAY_CHDIR_TO_TRIAL_DIR, + RAY_TRAIN_COUNT_PREEMPTION_AS_FAILURE, + RAY_TRAIN_ENABLE_STATE_TRACKING, +} + +# Key for AIR Checkpoint metadata in TrainingResult metadata +CHECKPOINT_METADATA_KEY = "checkpoint_metadata" + +# Key for AIR Checkpoint world rank in TrainingResult metadata +CHECKPOINT_RANK_KEY = "checkpoint_rank" diff --git a/.venv/lib/python3.11/site-packages/ray/train/context.py b/.venv/lib/python3.11/site-packages/ray/train/context.py new file mode 100644 index 0000000000000000000000000000000000000000..bc447b36f2024dba0c2e88aae26d4551b3c3e23f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/context.py @@ -0,0 +1,139 @@ +import threading +from typing import TYPE_CHECKING, Any, Dict, Optional + +from ray.train._internal import session +from ray.train._internal.storage import StorageContext +from ray.train.constants import _v2_migration_warnings_enabled +from ray.train.utils import _copy_doc, _log_deprecation_warning +from ray.util.annotations import Deprecated, DeveloperAPI, PublicAPI + +if TYPE_CHECKING: + from ray.tune.execution.placement_groups import PlacementGroupFactory + + +# The context singleton on this process. +_default_context: "Optional[TrainContext]" = None +_context_lock = threading.Lock() + + +_GET_METADATA_DEPRECATION_MESSAGE = ( + "`get_metadata` was an experimental API that accessed the metadata passed " + "to `Trainer(metadata=...)`. This API can be replaced by passing " + "the metadata directly to the training function (e.g., via `train_loop_config`)." +) + +_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE = ( + "`{}` is deprecated because the concept of a `Trial` will " + "soon be removed in Ray Train (see here: " + "https://github.com/ray-project/enhancements/pull/57). 
" + "Ray Train will no longer assume that it's running within a Ray Tune `Trial` " + "in the future." +) + + +@PublicAPI(stability="stable") +class TrainContext: + """Context containing metadata that can be accessed within Ray Train workers.""" + + @_copy_doc(session.get_experiment_name) + def get_experiment_name(self) -> str: + return session.get_experiment_name() + + @_copy_doc(session.get_world_size) + def get_world_size(self) -> int: + return session.get_world_size() + + @_copy_doc(session.get_world_rank) + def get_world_rank(self) -> int: + return session.get_world_rank() + + @_copy_doc(session.get_local_rank) + def get_local_rank(self) -> int: + return session.get_local_rank() + + @_copy_doc(session.get_local_world_size) + def get_local_world_size(self) -> int: + return session.get_local_world_size() + + @_copy_doc(session.get_node_rank) + def get_node_rank(self) -> int: + return session.get_node_rank() + + @DeveloperAPI + @_copy_doc(session.get_storage) + def get_storage(self) -> StorageContext: + return session.get_storage() + + # Deprecated APIs + + @Deprecated( + message=_GET_METADATA_DEPRECATION_MESSAGE, + warning=_v2_migration_warnings_enabled(), + ) + @_copy_doc(session.get_metadata) + def get_metadata(self) -> Dict[str, Any]: + return session.get_metadata() + + @Deprecated( + message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_name"), + warning=_v2_migration_warnings_enabled(), + ) + @_copy_doc(session.get_trial_name) + def get_trial_name(self) -> str: + return session.get_trial_name() + + @Deprecated( + message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_id"), + warning=_v2_migration_warnings_enabled(), + ) + @_copy_doc(session.get_trial_id) + def get_trial_id(self) -> str: + return session.get_trial_id() + + @Deprecated( + message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format( + "get_trial_resources" + ), + warning=_v2_migration_warnings_enabled(), + ) + @_copy_doc(session.get_trial_resources) + def 
get_trial_resources(self) -> "PlacementGroupFactory": + return session.get_trial_resources() + + @Deprecated( + message=_TUNE_SPECIFIC_CONTEXT_DEPRECATION_MESSAGE.format("get_trial_dir"), + warning=_v2_migration_warnings_enabled(), + ) + @_copy_doc(session.get_trial_dir) + def get_trial_dir(self) -> str: + return session.get_trial_dir() + + +@PublicAPI(stability="stable") +def get_context() -> TrainContext: + """Get or create a singleton training context. + + The context is only available within a function passed to Ray Train. + + See the :class:`~ray.train.TrainContext` API reference to see available methods. + """ + from ray.tune.trainable.trainable_fn_utils import _in_tune_session + + # If we are running in a Tune function, switch to Tune context. + if _in_tune_session(): + from ray.tune import get_context as get_tune_context + + if _v2_migration_warnings_enabled(): + _log_deprecation_warning( + "`ray.train.get_context()` should be switched to " + "`ray.tune.get_context()` when running in a function " + "passed to Ray Tune. This will be an error in the future." 
+ ) + return get_tune_context() + + global _default_context + + with _context_lock: + if _default_context is None: + _default_context = TrainContext() + return _default_context diff --git a/.venv/lib/python3.11/site-packages/ray/train/data_parallel_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/data_parallel_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a14dc47d36dd3d238c0945675bf18a9310aa1b60 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/data_parallel_trainer.py @@ -0,0 +1,587 @@ +import logging +import uuid +from typing import Any, Callable, Dict, List, Optional, Type, Union + +import ray +from ray._private.ray_constants import env_integer +from ray._private.thirdparty.tabulate.tabulate import tabulate +from ray.air.config import RunConfig, ScalingConfig +from ray.train import BackendConfig, Checkpoint, TrainingIterator +from ray.train._internal import session +from ray.train._internal.backend_executor import BackendExecutor, TrialInfo +from ray.train._internal.data_config import DataConfig +from ray.train._internal.session import _TrainingResult, get_session +from ray.train._internal.utils import construct_train_func, count_required_parameters +from ray.train.constants import RAY_TRAIN_ENABLE_STATE_TRACKING +from ray.train.trainer import BaseTrainer, GenDataset +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.widgets import Template +from ray.widgets.util import repr_with_fallback + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class DataParallelTrainer(BaseTrainer): + """A Trainer for data parallel training. + + You should subclass this Trainer if your Trainer follows SPMD (single program, + multiple data) programming paradigm - you want multiple processes to run the same + function, but on different data. + + This Trainer runs the function ``train_loop_per_worker`` on multiple Ray + Actors. 
+ + The ``train_loop_per_worker`` function is expected to take in either 0 or 1 + arguments: + + .. testcode:: + + def train_loop_per_worker(): + ... + + .. testcode:: + + def train_loop_per_worker(config: Dict): + ... + + If ``train_loop_per_worker`` accepts an argument, then + ``train_loop_config`` will be passed in as the argument. This is useful if you + want to tune the values in ``train_loop_config`` as hyperparameters. + + If the ``datasets`` dict contains a training dataset (denoted by + the "train" key), then it will be split into multiple dataset + shards that can then be accessed by ``train.get_dataset_shard("train")`` inside + ``train_loop_per_worker``. All the other datasets will not be split and + ``train.get_dataset_shard(...)`` will return the the entire Dataset. + + Inside the ``train_loop_per_worker`` function, you can use any of the + :ref:`Ray Train loop methods `. + + .. testcode:: + + from ray import train + + def train_loop_per_worker(): + # Report intermediate results for callbacks or logging and + # checkpoint data. + train.report(...) + + # Returns dict of last saved checkpoint. + train.get_checkpoint() + + # Returns the Dataset shard for the given key. + train.get_dataset_shard("my_dataset") + + # Returns the total number of workers executing training. + train.get_context().get_world_size() + + # Returns the rank of this worker. + train.get_context().get_world_rank() + + # Returns the rank of the worker on the current node. + train.get_context().get_local_rank() + + Any returns from the ``train_loop_per_worker`` will be discarded and not + used or persisted anywhere. + + **How do I use DataParallelTrainer or any of its subclasses?** + + Example: + + .. 
testcode:: + + import ray + from ray import train + from ray.train import ScalingConfig + from ray.train.data_parallel_trainer import DataParallelTrainer + + def train_loop_for_worker(): + dataset_shard_for_this_worker = train.get_dataset_shard("train") + + # 3 items for 3 workers, each worker gets 1 item + batches = list(dataset_shard_for_this_worker.iter_batches(batch_size=1)) + assert len(batches) == 1 + + train_dataset = ray.data.from_items([1, 2, 3]) + assert train_dataset.count() == 3 + trainer = DataParallelTrainer( + train_loop_for_worker, + scaling_config=ScalingConfig(num_workers=3), + datasets={"train": train_dataset}, + ) + result = trainer.fit() + + .. testoutput:: + :hide: + + ... + + **How do I develop on top of DataParallelTrainer?** + + In many cases, using DataParallelTrainer directly is sufficient to execute + functions on multiple actors. + + However, you may want to subclass ``DataParallelTrainer`` and create a custom + Trainer for the following 2 use cases: + + - **Use Case 1:** You want to do data parallel training, but want to have + a predefined ``training_loop_per_worker``. + + - **Use Case 2:** You want to implement a custom + :py:class:`~ray.train.backend.Backend` that automatically handles + additional setup or teardown logic on each actor, so that the users of this + new trainer do not have to implement this logic. For example, a + ``TensorflowTrainer`` can be built on top of ``DataParallelTrainer`` + that automatically handles setting the proper environment variables for + distributed Tensorflow on each actor. + + For 1, you can set a predefined training loop in __init__ + + .. 
testcode:: + + from ray.train.data_parallel_trainer import DataParallelTrainer + + class MyDataParallelTrainer(DataParallelTrainer): + def __init__(self, *args, **kwargs): + predefined_train_loop_per_worker = lambda: 1 + super().__init__(predefined_train_loop_per_worker, *args, **kwargs) + + + For 2, you can implement the ``ray.train.Backend`` and ``ray.train.BackendConfig`` + interfaces. + + .. testcode:: + + from dataclasses import dataclass + from ray.train.backend import Backend, BackendConfig + + class MyBackend(Backend): + def on_start(self, worker_group, backend_config): + def set_env_var(env_var_value): + import os + os.environ["MY_ENV_VAR"] = env_var_value + + worker_group.execute(set_env_var, backend_config.env_var) + + @dataclass + class MyBackendConfig(BackendConfig): + env_var: str = "default_value" + + def backend_cls(self): + return MyBackend + + class MyTrainer(DataParallelTrainer): + def __init__(self, train_loop_per_worker, my_backend_config: + MyBackendConfig, **kwargs): + + super().__init__( + train_loop_per_worker, + backend_config=my_backend_config, **kwargs) + + Args: + train_loop_per_worker: The training function to execute. + This can either take in no arguments or a ``config`` dict. + train_loop_config: Configurations to pass into + ``train_loop_per_worker`` if it accepts an argument. + backend_config: Configuration for setting up a Backend (e.g. Torch, + Tensorflow, Horovod) on each worker to enable distributed + communication. If no Backend should be set up, then set this to None. + scaling_config: Configuration for how to scale data parallel training. + dataset_config: Configuration for dataset ingest. This is merged with the + default dataset config for the given trainer (`cls._dataset_config`). + run_config: Configuration for the execution of the training run. + datasets: Ray Datasets to use for training and evaluation. 
+ This is a dict where the key is the name of the dataset, which + can be accessed from within the ``train_loop_per_worker`` by calling + ``train.get_dataset_shard(dataset_key)``. + By default, all datasets are sharded equally across workers. + This can be configured via ``dataset_config``. + metadata: Dict that should be made available via + `train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + resume_from_checkpoint: A checkpoint to resume training from. + """ + + # Exposed here for testing purposes. Should never need + # to be overriden. + _backend_executor_cls: Type[BackendExecutor] = BackendExecutor + _training_iterator_cls: Type[TrainingIterator] = TrainingIterator + + _scaling_config_allowed_keys = BaseTrainer._scaling_config_allowed_keys + [ + "num_workers", + "resources_per_worker", + "use_gpu", + "placement_strategy", + "accelerator_type", + ] + + # For backwards compatibility with the legacy dataset config API. 
+ _dataset_config = None + + _fields_for_tuner_param_space = BaseTrainer._fields_for_tuner_param_space + [ + "train_loop_config" + ] + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + backend_config: Optional[BackendConfig] = None, + scaling_config: Optional[ScalingConfig] = None, + dataset_config: Optional[DataConfig] = None, + run_config: Optional[RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + self._train_loop_per_worker = train_loop_per_worker + self._train_loop_config = train_loop_config + + if dataset_config is None: + dataset_config = DataConfig() + + if not isinstance(dataset_config, DataConfig): + raise ValueError( + "`dataset_config` must be an instance of ray.train.DataConfig, " + f"was: {dataset_config}" + ) + self._data_config = dataset_config + + backend_config = ( + backend_config if backend_config is not None else BackendConfig() + ) + self._backend_config = backend_config + + super(DataParallelTrainer, self).__init__( + scaling_config=scaling_config, + run_config=run_config, + datasets=datasets, + metadata=metadata, + resume_from_checkpoint=resume_from_checkpoint, + ) + + train_total_resources = self.scaling_config.total_resources + self._data_config.set_train_total_resources( + train_total_resources.get("CPU", 0), + train_total_resources.get("GPU", 0), + ) + + if env_integer(RAY_TRAIN_ENABLE_STATE_TRACKING, 0): + from ray.train._internal.state.state_actor import get_or_create_state_actor + + get_or_create_state_actor() + + @PublicAPI(stability="beta") + @classmethod + def restore( + cls: Type["DataParallelTrainer"], + path: str, + train_loop_per_worker: Optional[ + Union[Callable[[], None], Callable[[Dict], None]] + ] = None, + train_loop_config: Optional[Dict] = None, + **kwargs, + ) -> "DataParallelTrainer": + 
"""Restores a DataParallelTrainer from a previously interrupted/failed run. + + Args: + train_loop_per_worker: Optionally re-specified train loop function. + This should be used to re-specify a function that is not + restorable in a new Ray cluster (e.g., it holds onto outdated + object references). This should be the same training loop + that was passed to the original trainer constructor. + train_loop_config: Optionally re-specified train config. + This should similarly be used if the original `train_loop_config` + contained outdated object references, and it should not be modified + from what was originally passed in. + + See :meth:`BaseTrainer.restore() ` + for descriptions of the other arguments. + + Returns: + DataParallelTrainer: A restored instance of the `DataParallelTrainer` + subclass that is calling this method. + """ + return super(DataParallelTrainer, cls).restore( + path=path, + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + **kwargs, + ) + + def _validate_attributes(self): + super()._validate_attributes() + + self._validate_train_loop_per_worker( + self._train_loop_per_worker, "train_loop_per_worker" + ) + + def _validate_train_loop_per_worker( + self, train_loop_per_worker: Callable, fn_name: str + ) -> None: + num_required_params = count_required_parameters(train_loop_per_worker) + if num_required_params > 1: + raise ValueError( + f"{fn_name} should take in 0 or 1 arguments, " + f"but it accepts {num_required_params} arguments instead." 
+ ) + + @classmethod + def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig: + scaling_config = super(DataParallelTrainer, cls)._validate_scaling_config( + scaling_config + ) + + # This validation happens after the scaling config is updated from + # its specification in the Tuner `param_space` + if not scaling_config.use_gpu and "GPU" in ray.available_resources(): + logger.info( + "GPUs are detected in your Ray cluster, but GPU " + "training is not enabled for this trainer. To enable " + "GPU training, make sure to set `use_gpu` to True " + "in your scaling config." + ) + + if scaling_config.num_workers is None: + raise ValueError( + "You must specify the 'num_workers' in `scaling_config` as either an " + f"argument of `{cls.__name__}` or through the `param_space` of a " + "`Tuner` (if performing hyperparameter tuning)." + ) + + if scaling_config.num_workers <= 0: + raise ValueError( + "'num_workers' in `scaling_config` must be a positive " + f"integer. Received {scaling_config.num_workers}" + ) + + return scaling_config + + def _run_training(self, training_iterator: TrainingIterator) -> None: + """This method loops over the `TrainingIterator`: + The actual iteration (for ... in ...) waits for the training function + on each worker to report a result and supplies it as a list of results. + Afterwards (in the body of the loop), it will report the result + to the Tune session. + The iterator ends after the training function on each worker has finished. + """ + for training_results in training_iterator: + # TODO(ml-team): add ability to report results from multiple workers. + self._propagate_results(training_results) + + def _propagate_results(self, training_results: List[_TrainingResult]): + first_worker_result = training_results[0] + assert all(isinstance(result, _TrainingResult) for result in training_results) + + tune_session = get_session() + + # Check if any workers reported a checkpoint. 
+ # If so, report a checkpoint pointing to the persisted location + # to Tune for book-keeping. + # NOTE: This removes the restriction for any individual worker + # (ex: global rank 0 worker) from needing to report a checkpoint. + # All workers reported a checkpoint to the same fs path, so there's + # no need to report multiple checkpoints to Tune. + worker_checkpoints = [ + result.checkpoint + for result in training_results + if result.checkpoint is not None + ] + at_least_one_reported_checkpoint = len(worker_checkpoints) > 0 + + if at_least_one_reported_checkpoint: + # Update the coordinator's checkpoint index to the latest. + # This is what keeps the checkpoint index in line with the workers. + tune_session.storage._update_checkpoint_index(first_worker_result.metrics) + + # Make sure that all workers uploaded to the same location. + assert all( + checkpoint.path == tune_session.storage.checkpoint_fs_path + for checkpoint in worker_checkpoints + ) + + checkpoint = ( + Checkpoint( + filesystem=tune_session.storage.storage_filesystem, + path=tune_session.storage.checkpoint_fs_path, + ) + if at_least_one_reported_checkpoint + else None + ) + + tracked_training_result = _TrainingResult( + checkpoint=checkpoint, + metrics=first_worker_result.metrics, + ) + + logger.debug( + "Report (metrics, checkpoint) to the Tune session:\n" + f" metrics={tracked_training_result.metrics}\n" + f" checkpoint={tracked_training_result.checkpoint}" + ) + + # Report the metrics and checkpoint to Tune. 
+ tune_session._report_training_result(tracked_training_result) + + def training_loop(self) -> None: + scaling_config = self._validate_scaling_config(self.scaling_config) + + train_loop_per_worker = construct_train_func( + self._train_loop_per_worker, + self._train_loop_config, + train_func_context=self._backend_config.train_func_context, + fn_arg_name="train_loop_per_worker", + discard_returns=True, + ) + + trial_info = TrialInfo( + name=session.get_trial_name(), + id=session.get_trial_id(), + resources=session.get_trial_resources(), + logdir=session.get_trial_dir(), + driver_ip=ray.util.get_node_ip_address(), + driver_node_id=ray.get_runtime_context().get_node_id(), + experiment_name=session.get_experiment_name(), + run_id=uuid.uuid4().hex, + ) + + backend_executor = self._backend_executor_cls( + backend_config=self._backend_config, + trial_info=trial_info, + num_workers=scaling_config.num_workers, + resources_per_worker=scaling_config._resources_per_worker_not_none, + max_retries=0, + ) + + # Start the remote actors. + backend_executor.start() + + training_iterator = self._training_iterator_cls( + backend_executor=backend_executor, + backend_config=self._backend_config, + train_func=train_loop_per_worker, + datasets=self.datasets, + metadata=self.metadata, + data_config=self._data_config, + checkpoint=self.starting_checkpoint, + ) + + self._run_training(training_iterator) + + # Shutdown workers. + backend_executor.shutdown() + + def get_dataset_config(self) -> DataConfig: + """Returns a copy of this Trainer's final dataset configs. + + Returns: + The merged default + user-supplied dataset config. + """ + + return self._data_config + + @repr_with_fallback(["ipywidgets", "8"]) + def _repr_mimebundle_(self, **kwargs): + """Returns a mimebundle with an ipywidget repr and a simple text repr. + + Depending on the frontend where the data is being displayed, + different mimetypes will be used from this bundle. 
+ See https://ipython.readthedocs.io/en/stable/config/integrating.html + for information about this method, and + https://ipywidgets.readthedocs.io/en/latest/embedding.html + for more information about the jupyter widget mimetype. + + Returns: + A mimebundle containing an ipywidget repr and a simple text repr. + """ + from ipywidgets import HTML, Layout, Tab, VBox + + title = HTML(f"

{self.__class__.__name__}

") + + children = [] + titles = [] + + if self.datasets: + children.append(self._datasets_repr_()) + titles.append("Datasets") + + children.append(HTML(self._data_config_repr_html_())) + titles.append("Data Config") + + if self._train_loop_config: + children.append(HTML(self._train_loop_config_repr_html_())) + titles.append("Train Loop Config") + + if self.scaling_config: + children.append(HTML(self.scaling_config._repr_html_())) + titles.append("Scaling Config") + + if self.run_config: + children.append(HTML(self.run_config._repr_html_())) + titles.append("Run Config") + + if self._backend_config: + children.append(HTML(self._backend_config._repr_html_())) + titles.append("Backend Config") + + tab = Tab(children, titles=titles) + widget = VBox([title, tab], layout=Layout(width="100%")) + bundle = widget._repr_mimebundle_(**kwargs) + bundle.update( + { + "text/plain": repr(self), + } + ) + return bundle + + def _train_loop_config_repr_html_(self) -> str: + if self._train_loop_config: + table_data = {} + for k, v in self._train_loop_config.items(): + if isinstance(v, str) or str(v).isnumeric(): + table_data[k] = v + elif hasattr(v, "_repr_html_"): + table_data[k] = v._repr_html_() + else: + table_data[k] = str(v) + + return Template("title_data.html.j2").render( + title="Train Loop Config", + data=Template("scrollableTable.html.j2").render( + table=tabulate( + table_data.items(), + headers=["Setting", "Value"], + showindex=False, + tablefmt="unsafehtml", + ), + max_height="none", + ), + ) + else: + return "" + + def _data_config_repr_html_(self) -> str: + # TODO make this rendering nicer. 
+ content = [str(self._data_config)] + return Template("rendered_html_common.html.j2").render(content=content) + + def _datasets_repr_(self) -> str: + from ipywidgets import HTML, Layout, VBox + + content = [] + if self.datasets: + for name, config in self.datasets.items(): + tab = config._tab_repr_() + if tab: + content.append( + HTML( + Template("title_data.html.j2").render( + title=f"Dataset - {name}", data=None + ) + ) + ) + content.append(config._tab_repr_()) + + return VBox(content, layout=Layout(width="100%")) diff --git a/.venv/lib/python3.11/site-packages/ray/train/error.py b/.venv/lib/python3.11/site-packages/ray/train/error.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa8c82471bbe8c800b2415c8af3b1aef601d00d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/error.py @@ -0,0 +1,6 @@ +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +class SessionMisuseError(Exception): + """Indicates a method or function was used outside of a session.""" diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/mlflow_simple_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/mlflow_simple_example.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1a49f83bb22eb166f6aabba76b4e9968a00296 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/mlflow_simple_example.py @@ -0,0 +1,55 @@ +from pathlib import Path + +from ray import train +from ray.train import RunConfig, ScalingConfig +from ray.train.torch import TorchTrainer +from ray.tune.logger import TBXLoggerCallback +from ray.tune.logger.mlflow import MLflowLoggerCallback + + +def train_func(): + for i in range(3): + 
train.report(dict(epoch=i)) + + +trainer = TorchTrainer( + train_func, + scaling_config=ScalingConfig(num_workers=2), + run_config=RunConfig( + callbacks=[ + MLflowLoggerCallback(experiment_name="train_experiment"), + TBXLoggerCallback(), + ], + ), +) + +# Run the training function, logging all the intermediate results +# to MLflow and Tensorboard. +result = trainer.fit() + +# For MLFLow logs: + +# MLFlow logs will by default be saved in an `mlflow` directory +# in the current working directory. + +# $ cd mlflow +# # View the MLflow UI. +# $ mlflow ui + +# You can change the directory by setting the `tracking_uri` argument +# in `MLflowLoggerCallback`. + +# For TensorBoard logs: + +# Print the latest run directory and keep note of it. +# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06 +print("Run directory:", Path(result.path).parent) # TensorBoard is saved in parent dir + +# How to visualize the logs + +# Navigate to the run directory of the trainer. +# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06` +# $ cd +# +# # View the tensorboard UI. +# $ tensorboard --logdir . 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_autoencoder_example.py b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_autoencoder_example.py new file mode 100644 index 0000000000000000000000000000000000000000..3c2d90b1887671e8dc35820ca900af10425e64a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/examples/tf/tune_tensorflow_autoencoder_example.py @@ -0,0 +1,77 @@ +import argparse + +import ray +from ray import tune +from ray.train import ScalingConfig +from ray.train.examples.tf.tensorflow_mnist_example import train_func +from ray.train.tensorflow import TensorflowTrainer +from ray.tune.tune_config import TuneConfig +from ray.tune.tuner import Tuner + + +def tune_tensorflow_mnist( + num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False +): + scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu) + trainer = TensorflowTrainer( + train_loop_per_worker=train_func, + scaling_config=scaling_config, + ) + tuner = Tuner( + trainer, + tune_config=TuneConfig( + num_samples=num_samples, metric="binary_crossentropy", mode="min" + ), + param_space={ + "train_loop_config": { + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + "epochs": 3, + } + }, + ) + best_accuracy = tuner.fit().get_best_result().metrics["binary_crossentropy"] + print(f"Best accuracy config: {best_accuracy}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--smoke-test", + action="store_true", + default=False, + help="Finish quickly for testing.", + ) + parser.add_argument( + "--address", required=False, type=str, help="the address to use for Ray" + ) + parser.add_argument( + "--num-workers", + "-n", + type=int, + default=2, + help="Sets number of workers for training.", + ) + parser.add_argument( + "--num-samples", + type=int, + default=2, + help="Sets number of samples for training.", + ) + parser.add_argument( + 
"--use-gpu", action="store_true", default=False, help="Enables GPU training" + ) + + args = parser.parse_args() + + if args.smoke_test: + num_gpus = args.num_workers if args.use_gpu else 0 + ray.init(num_cpus=8, num_gpus=num_gpus) + tune_tensorflow_mnist(num_workers=2, num_samples=2, use_gpu=args.use_gpu) + else: + ray.init(address=args.address) + tune_tensorflow_mnist( + num_workers=args.num_workers, + num_samples=args.num_samples, + use_gpu=args.use_gpu, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/huggingface/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/huggingface/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..435c3182eb6e3e02060407a0f991438e3fda3e2d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/huggingface/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..870fe4ee94b873e077a7bf0d3bf48429e1bf8df0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__init__.py @@ -0,0 +1,12 @@ +from ray.train.huggingface.transformers._transformers_utils import ( + RayTrainReportCallback, + prepare_trainer, +) + +__all__ = [ + "RayTrainReportCallback", + "prepare_trainer", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. 
diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d8f3b5bb0092adf51d6535936227f272ad43cf2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/_transformers_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/_transformers_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42c345d0216f293ddf88613000734d36017a57e8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/__pycache__/_transformers_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/_transformers_utils.py b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/_transformers_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c522b81cfbf1c8756e26fd42c2692f3bdb421964 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/huggingface/transformers/_transformers_utils.py @@ -0,0 +1,143 @@ +import logging +import shutil +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Iterator, Optional, Type + +from torch.utils.data import DataLoader, Dataset, IterableDataset + +import ray +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.data.iterator import _IterableFromIterator +from ray.train import Checkpoint +from ray.util import PublicAPI + +logger = logging.getLogger(__name__) + + +TRANSFORMERS_IMPORT_ERROR: Optional[ImportError] = None + +try: + import transformers.trainer + 
from transformers import Trainer + from transformers.trainer_callback import TrainerCallback +except ImportError as e: + TRANSFORMERS_IMPORT_ERROR = e + TrainerCallback = object + + +@PublicAPI(stability="beta") +class RayTrainReportCallback(TrainerCallback): + """A simple callback to report checkpoints and metrics to Ray Train. + + This callback is a subclass of `transformers.TrainerCallback + `_ + and overrides the `TrainerCallback.on_save()` method. After + a new checkpoint get saved, it fetches the latest metric dictionary + from `TrainerState.log_history` and reports it with the latest checkpoint + to Ray Train. + + Checkpoints will be saved in the following structure:: + + checkpoint_00000*/ Ray Train Checkpoint + └─ checkpoint/ Hugging Face Transformers Checkpoint + + For customized reporting and checkpointing logic, implement your own + `transformers.TrainerCallback` following this user + guide: :ref:`Saving and Loading Checkpoints `. + + Note that users should ensure that the logging, evaluation, and saving frequencies + are properly configured so that the monitoring metric is always up-to-date + when `transformers.Trainer` saves a checkpoint. 
+ + Suppose the monitoring metric is reported from evaluation stage: + + Some valid configurations: + - evaluation_strategy == save_strategy == "epoch" + - evaluation_strategy == save_strategy == "steps", save_steps % eval_steps == 0 + + Some invalid configurations: + - evaluation_strategy != save_strategy + - evaluation_strategy == save_strategy == "steps", save_steps % eval_steps != 0 + + """ + + CHECKPOINT_NAME = "checkpoint" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_RAYTRAINREPORTCALLBACK, "1") + + def on_save(self, args, state, control, **kwargs): + """Event called after a checkpoint save.""" + with TemporaryDirectory() as tmpdir: + # Aggregate all the logged metrics + metrics = {} + for log in state.log_history: + metrics.update(log) + + # Copy ckpt files and construct a Ray Train Checkpoint + source_ckpt_path = transformers.trainer.get_last_checkpoint(args.output_dir) + if source_ckpt_path is not None: + target_ckpt_path = Path(tmpdir, self.CHECKPOINT_NAME).as_posix() + shutil.copytree(source_ckpt_path, target_ckpt_path) + checkpoint = Checkpoint.from_directory(tmpdir) + else: + checkpoint = None + + # Report latest metrics and checkpoint to Ray Train + ray.train.report(metrics=metrics, checkpoint=checkpoint) + + +class RayTorchIterableDataset(IterableDataset): + """Wrapper class for ray data iterables.""" + + def __init__(self, data_iterable) -> None: + super().__init__() + self.data_iterable = data_iterable + + def __iter__(self) -> Iterator: + return iter(self.data_iterable) + + +@PublicAPI(stability="beta") +def prepare_trainer(trainer: "Trainer") -> "Trainer": + """Prepare your HuggingFace Transformer Trainer for Ray Train. + + This utility function enable the trainer integrates with Ray Data Integration. 
+ Internally, it overrides the `get_train_dataloader` and `get_eval_dataloader` + methods and inject the data integration logics if the `train_dataset` and + `eval_dataset` are Ray Data Iterables. + """ + + if TRANSFORMERS_IMPORT_ERROR is not None: + raise TRANSFORMERS_IMPORT_ERROR + + base_trainer_class: Type[transformers.trainer.Trainer] = trainer.__class__ + + class RayTransformersTrainer(base_trainer_class): + """A Wrapper of `transformers.Trainer` for Ray Data Integration.""" + + def get_train_dataloader(self) -> DataLoader: + if isinstance(self.train_dataset, _IterableFromIterator): + dataset = RayTorchIterableDataset(self.train_dataset) + return DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0]) + else: + return super().get_train_dataloader() + + def get_eval_dataloader( + self, eval_dataset: Optional[Dataset] = None + ) -> DataLoader: + if eval_dataset is None: + eval_dataset = self.eval_dataset + + if isinstance(eval_dataset, _IterableFromIterator): + dataset = RayTorchIterableDataset(eval_dataset) + return DataLoader(dataset, batch_size=1, collate_fn=lambda x: x[0]) + else: + return super().get_eval_dataloader(eval_dataset) + + trainer.__class__ = RayTransformersTrainer + + record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_PREPARE_TRAINER, "1") + return trainer diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..531165e7c8463f873af787dfc495b02764d3f91c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__init__.py @@ -0,0 +1,18 @@ +from ray.train.lightgbm._lightgbm_utils import RayTrainReportCallback +from ray.train.lightgbm.lightgbm_checkpoint import LightGBMCheckpoint +from ray.train.lightgbm.lightgbm_predictor import LightGBMPredictor +from ray.train.lightgbm.lightgbm_trainer import LightGBMTrainer +from ray.train.v2._internal.constants import is_v2_enabled + 
+if is_v2_enabled(): + from ray.train.v2.lightgbm.lightgbm_trainer import LightGBMTrainer # noqa: F811 + +__all__ = [ + "RayTrainReportCallback", + "LightGBMCheckpoint", + "LightGBMPredictor", + "LightGBMTrainer", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..473c3709ebe869c07987213745c0fbeef54a13df Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/_lightgbm_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/_lightgbm_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2f8264c58b34f09357e4cb7dd28fd623973cedb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/_lightgbm_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6054fcef950d199be9e161ee64a01aca4b935401 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eeb047d8b5bb93098fd99fb5bab6f3ae96ac6ea0 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4f8394effebbdc16d43742eb0eaef69a89d8e05 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cedfb623bbc1076f57319cffc1192d5dd419752c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/lightgbm_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/v2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/v2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b2b4be343c58ecdb9ba5d5cbd62c0f18afceb67 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/__pycache__/v2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/_lightgbm_utils.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/_lightgbm_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..15c4e344bd16dd19e84b3863eb4614bcff5fbf44 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/_lightgbm_utils.py @@ -0,0 +1,170 @@ +import tempfile +from contextlib import contextmanager +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +from lightgbm.basic 
import Booster +from lightgbm.callback import CallbackEnv + +import ray.train +from ray.train import Checkpoint +from ray.tune.utils import flatten_dict +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +class RayTrainReportCallback: + """Creates a callback that reports metrics and checkpoints model. + + Args: + metrics: Metrics to report. If this is a list, + each item should be a metric key reported by LightGBM, + and it will be reported to Ray Train/Tune under the same name. + This can also be a dict of {: }, + which can be used to rename LightGBM default metrics. + filename: Customize the saved checkpoint file type by passing + a filename. Defaults to "model.txt". + frequency: How often to save checkpoints, in terms of iterations. + Defaults to 0 (no checkpoints are saved during training). + checkpoint_at_end: Whether or not to save a checkpoint at the end of training. + results_postprocessing_fn: An optional Callable that takes in + the metrics dict that will be reported (after it has been flattened) + and returns a modified dict. + + Examples + -------- + + Reporting checkpoints and metrics to Ray Tune when running many + independent xgboost trials (without data parallelism within a trial). + + .. testcode:: + :skipif: True + + import lightgbm + + from ray.train.lightgbm import RayTrainReportCallback + + config = { + # ... + "metric": ["binary_logloss", "binary_error"], + } + + # Report only log loss to Tune after each validation epoch. + bst = lightgbm.train( + ..., + callbacks=[ + RayTrainReportCallback( + metrics={"loss": "eval-binary_logloss"}, frequency=1 + ) + ], + ) + + Loading a model from a checkpoint reported by this callback. + + .. testcode:: + :skipif: True + + from ray.train.lightgbm import RayTrainReportCallback + + # Get a `Checkpoint` object that is saved by the callback during training. 
+ result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + """ + + CHECKPOINT_NAME = "model.txt" + + def __init__( + self, + metrics: Optional[Union[str, List[str], Dict[str, str]]] = None, + filename: str = CHECKPOINT_NAME, + frequency: int = 0, + checkpoint_at_end: bool = True, + results_postprocessing_fn: Optional[ + Callable[[Dict[str, Union[float, List[float]]]], Dict[str, float]] + ] = None, + ): + if isinstance(metrics, str): + metrics = [metrics] + self._metrics = metrics + self._filename = filename + self._frequency = frequency + self._checkpoint_at_end = checkpoint_at_end + self._results_postprocessing_fn = results_postprocessing_fn + + @classmethod + def get_model( + cls, checkpoint: Checkpoint, filename: str = CHECKPOINT_NAME + ) -> Booster: + """Retrieve the model stored in a checkpoint reported by this callback. + + Args: + checkpoint: The checkpoint object returned by a training run. + The checkpoint should be saved by an instance of this callback. + filename: The filename to load the model from, which should match + the filename used when creating the callback. 
+ """ + with checkpoint.as_directory() as checkpoint_path: + return Booster(model_file=Path(checkpoint_path, filename).as_posix()) + + def _get_report_dict(self, evals_log: Dict[str, Dict[str, list]]) -> dict: + result_dict = flatten_dict(evals_log, delimiter="-") + if not self._metrics: + report_dict = result_dict + else: + report_dict = {} + for key in self._metrics: + if isinstance(self._metrics, dict): + metric = self._metrics[key] + else: + metric = key + report_dict[key] = result_dict[metric] + if self._results_postprocessing_fn: + report_dict = self._results_postprocessing_fn(report_dict) + return report_dict + + def _get_eval_result(self, env: CallbackEnv) -> dict: + eval_result = {} + for entry in env.evaluation_result_list: + data_name, eval_name, result = entry[0:3] + if len(entry) > 4: + stdv = entry[4] + suffix = "-mean" + else: + stdv = None + suffix = "" + if data_name not in eval_result: + eval_result[data_name] = {} + eval_result[data_name][eval_name + suffix] = result + if stdv is not None: + eval_result[data_name][eval_name + "-stdv"] = stdv + return eval_result + + @contextmanager + def _get_checkpoint(self, model: Booster) -> Optional[Checkpoint]: + if ray.train.get_context().get_world_rank() in (0, None): + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + model.save_model(Path(temp_checkpoint_dir, self._filename).as_posix()) + yield Checkpoint.from_directory(temp_checkpoint_dir) + else: + yield None + + def __call__(self, env: CallbackEnv) -> None: + eval_result = self._get_eval_result(env) + report_dict = self._get_report_dict(eval_result) + + # Ex: if frequency=2, checkpoint_at_end=True and num_boost_rounds=11, + # you will checkpoint at iterations 1, 3, 5, ..., 9, and 10 (checkpoint_at_end) + # (iterations count from 0) + on_last_iter = env.iteration == env.end_iteration - 1 + should_checkpoint_at_end = on_last_iter and self._checkpoint_at_end + should_checkpoint_with_frequency = ( + self._frequency != 0 and (env.iteration + 1) 
% self._frequency == 0 + ) + should_checkpoint = should_checkpoint_at_end or should_checkpoint_with_frequency + + if should_checkpoint: + with self._get_checkpoint(model=env.model) as checkpoint: + ray.train.report(report_dict, checkpoint=checkpoint) + else: + ray.train.report(report_dict) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/config.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c57f4b6d17c71c563b25e59af6b4ecda1cc7cbeb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/config.py @@ -0,0 +1,89 @@ +import logging +import threading +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import ray +from ray.train._internal.utils import get_address_and_port +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig + +logger = logging.getLogger(__name__) + + +# Global LightGBM distributed network configuration for each worker process. +_lightgbm_network_params: Optional[Dict[str, Any]] = None +_lightgbm_network_params_lock = threading.Lock() + + +def get_network_params() -> Dict[str, Any]: + """Returns the network parameters to enable LightGBM distributed training.""" + global _lightgbm_network_params + + with _lightgbm_network_params_lock: + if not _lightgbm_network_params: + logger.warning( + "`ray.train.lightgbm.get_network_params` was called outside " + "the context of a `ray.train.lightgbm.LightGBMTrainer`. " + "The current process has no knowledge of the distributed training " + "worker group, so this method will return an empty dict. " + "Please call this within the training loop of a " + "`ray.train.lightgbm.LightGBMTrainer`. " + "If you are in fact calling this within a `LightGBMTrainer`, " + "this is unexpected: please file a bug report to the Ray Team." 
+ ) + return {} + + return _lightgbm_network_params.copy() + + +def _set_network_params( + num_machines: int, + local_listen_port: int, + machines: str, +): + global _lightgbm_network_params + + with _lightgbm_network_params_lock: + assert ( + _lightgbm_network_params is None + ), "LightGBM network params are already initialized." + _lightgbm_network_params = dict( + num_machines=num_machines, + local_listen_port=local_listen_port, + machines=machines, + ) + + +@dataclass +class LightGBMConfig(BackendConfig): + """Configuration for LightGBM distributed data-parallel training setup. + + See the LightGBM docs for more information on the "network parameters" + that Ray Train sets up for you: + https://lightgbm.readthedocs.io/en/latest/Parameters.html#network-parameters + """ + + @property + def backend_cls(self): + return _LightGBMBackend + + +class _LightGBMBackend(Backend): + def on_training_start( + self, worker_group: WorkerGroup, backend_config: LightGBMConfig + ): + node_ips_and_ports = worker_group.execute(get_address_and_port) + ports = [port for _, port in node_ips_and_ports] + machines = ",".join( + [f"{node_ip}:{port}" for node_ip, port in node_ips_and_ports] + ) + num_machines = len(worker_group) + ray.get( + [ + worker_group.execute_single_async( + rank, _set_network_params, num_machines, ports[rank], machines + ) + for rank in range(len(worker_group)) + ] + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..786b411d8d0dffeb8c438f4d16dfa74bc0e9898a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_checkpoint.py @@ -0,0 +1,70 @@ +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +import lightgbm + +from ray.train._internal.framework_checkpoint import FrameworkCheckpoint +from ray.util.annotations 
import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="beta") +class LightGBMCheckpoint(FrameworkCheckpoint): + """A :py:class:`~ray.train.Checkpoint` with LightGBM-specific functionality.""" + + MODEL_FILENAME = "model.txt" + + @classmethod + def from_model( + cls, + booster: lightgbm.Booster, + *, + preprocessor: Optional["Preprocessor"] = None, + path: Optional[str] = None, + ) -> "LightGBMCheckpoint": + """Create a :py:class:`~ray.train.Checkpoint` that stores a LightGBM model. + + Args: + booster: The LightGBM model to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + path: The path to the directory where the checkpoint file will be saved. + This should start as an empty directory, since the *entire* + directory will be treated as the checkpoint when reported. + By default, a temporary directory will be created. + + Returns: + An :py:class:`LightGBMCheckpoint` containing the specified ``Estimator``. 
+ + Examples: + >>> import lightgbm + >>> import numpy as np + >>> from ray.train.lightgbm import LightGBMCheckpoint + >>> + >>> train_X = np.array([[1, 2], [3, 4]]) + >>> train_y = np.array([0, 1]) + >>> + >>> model = lightgbm.LGBMClassifier().fit(train_X, train_y) + >>> checkpoint = LightGBMCheckpoint.from_model(model.booster_) + """ + checkpoint_path = Path(path or tempfile.mkdtemp()) + + if not checkpoint_path.is_dir(): + raise ValueError(f"`path` must be a directory, but got: {checkpoint_path}") + + booster.save_model(checkpoint_path.joinpath(cls.MODEL_FILENAME).as_posix()) + + checkpoint = cls.from_directory(checkpoint_path.as_posix()) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + + return checkpoint + + def get_model(self) -> lightgbm.Booster: + """Retrieve the LightGBM model stored in this checkpoint.""" + with self.as_directory() as checkpoint_path: + return lightgbm.Booster( + model_file=Path(checkpoint_path, self.MODEL_FILENAME).as_posix() + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..62c5237d00760617f0cf2a48ea996eda4552e32f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_predictor.py @@ -0,0 +1,152 @@ +from typing import TYPE_CHECKING, List, Optional, Union + +import lightgbm +import pandas as pd +from pandas.api.types import is_object_dtype + +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.data_batch_type import DataBatchType +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed +from ray.train.lightgbm import LightGBMCheckpoint +from ray.train.predictor import Predictor +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="beta") +class LightGBMPredictor(Predictor): + """A 
predictor for LightGBM models. + + Args: + model: The LightGBM booster to use for predictions. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + """ + + def __init__( + self, model: lightgbm.Booster, preprocessor: Optional["Preprocessor"] = None + ): + self.model = model + super().__init__(preprocessor) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r})" + ) + + @classmethod + def from_checkpoint(cls, checkpoint: LightGBMCheckpoint) -> "LightGBMPredictor": + """Instantiate the predictor from a LightGBMCheckpoint. + + Args: + checkpoint: The checkpoint to load the model and preprocessor from. + + """ + model = checkpoint.get_model() + preprocessor = checkpoint.get_preprocessor() + return cls(model=model, preprocessor=preprocessor) + + def predict( + self, + data: DataBatchType, + feature_columns: Optional[Union[List[str], List[int]]] = None, + **predict_kwargs, + ) -> DataBatchType: + """Run inference on data batch. + + Args: + data: A batch of input data. + feature_columns: The names or indices of the columns in the + data to use as features to predict on. If None, then use + all columns in ``data``. + **predict_kwargs: Keyword arguments passed to + ``lightgbm.Booster.predict``. 
+ + Examples: + >>> import numpy as np + >>> import lightgbm as lgbm + >>> from ray.train.lightgbm import LightGBMPredictor + >>> + >>> train_X = np.array([[1, 2], [3, 4]]) + >>> train_y = np.array([0, 1]) + >>> + >>> model = lgbm.LGBMClassifier().fit(train_X, train_y) + >>> predictor = LightGBMPredictor(model=model.booster_) + >>> + >>> data = np.array([[1, 2], [3, 4]]) + >>> predictions = predictor.predict(data) + >>> + >>> # Only use first and second column as the feature + >>> data = np.array([[1, 2, 8], [3, 4, 9]]) + >>> predictions = predictor.predict(data, feature_columns=[0, 1]) + + >>> import pandas as pd + >>> import lightgbm as lgbm + >>> from ray.train.lightgbm import LightGBMPredictor + >>> + >>> train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> train_y = pd.Series([0, 1]) + >>> + >>> model = lgbm.LGBMClassifier().fit(train_X, train_y) + >>> predictor = LightGBMPredictor(model=model.booster_) + >>> + >>> # Pandas dataframe. + >>> data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + >>> predictions = predictor.predict(data) + >>> + >>> # Only use first and second column as the feature + >>> data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"]) + >>> predictions = predictor.predict(data, feature_columns=["A", "B"]) + + + Returns: + Prediction result. 
+ + """ + return Predictor.predict( + self, data, feature_columns=feature_columns, **predict_kwargs + ) + + def _predict_pandas( + self, + data: "pd.DataFrame", + feature_columns: Optional[Union[List[str], List[int]]] = None, + **predict_kwargs, + ) -> pd.DataFrame: + feature_names = None + if TENSOR_COLUMN_NAME in data: + data = data[TENSOR_COLUMN_NAME].to_numpy() + data = _unwrap_ndarray_object_type_if_needed(data) + if feature_columns: + # In this case feature_columns is a list of integers + data = data[:, feature_columns] + # Turn into dataframe to make dtype resolution easy + data = pd.DataFrame(data, columns=feature_names) + data = data.infer_objects() + + # Pandas does not detect categorical dtypes. Any remaining object + # dtypes are probably categories, so convert them. + # This will fail if we have a category composed entirely of + # integers, but this is the best we can do here. + update_dtypes = {} + for column in data.columns: + dtype = data.dtypes[column] + if is_object_dtype(dtype): + update_dtypes[column] = pd.CategoricalDtype() + + if update_dtypes: + data = data.astype(update_dtypes, copy=False) + elif feature_columns: + # feature_columns is a list of integers or strings + data = data[feature_columns] + + df = pd.DataFrame(self.model.predict(data, **predict_kwargs)) + df.columns = ( + ["predictions"] + if len(df.columns) == 1 + else [f"predictions_{i}" for i in range(len(df.columns))] + ) + return df diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..7057100cc571286cd26d231899806f0d95a1dc34 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/lightgbm_trainer.py @@ -0,0 +1,221 @@ +import logging +from functools import partial +from typing import Any, Dict, Optional + +import lightgbm + +import ray +from ray.train import Checkpoint +from 
ray.train.constants import _DEPRECATED_VALUE, TRAIN_DATASET_KEY +from ray.train.lightgbm import RayTrainReportCallback +from ray.train.lightgbm.v2 import LightGBMTrainer as SimpleLightGBMTrainer +from ray.train.trainer import GenDataset +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + + +def _lightgbm_train_fn_per_worker( + config: dict, + label_column: str, + num_boost_round: int, + dataset_keys: set, + lightgbm_train_kwargs: dict, +): + checkpoint = ray.train.get_checkpoint() + starting_model = None + remaining_iters = num_boost_round + if checkpoint: + starting_model = RayTrainReportCallback.get_model(checkpoint) + starting_iter = starting_model.current_iteration() + remaining_iters = num_boost_round - starting_iter + logger.info( + f"Model loaded from checkpoint will train for " + f"additional {remaining_iters} iterations (trees) in order " + "to achieve the target number of iterations " + f"({num_boost_round=})." + ) + + train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY) + train_df = train_ds_iter.materialize().to_pandas() + + eval_ds_iters = { + k: ray.train.get_dataset_shard(k) + for k in dataset_keys + if k != TRAIN_DATASET_KEY + } + eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()} + + train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column] + train_set = lightgbm.Dataset(train_X, label=train_y) + + # NOTE: Include the training dataset in the evaluation datasets. + # This allows `train-*` metrics to be calculated and reported. + valid_sets = [train_set] + valid_names = [TRAIN_DATASET_KEY] + + for eval_name, eval_df in eval_dfs.items(): + eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column] + valid_sets.append(lightgbm.Dataset(eval_X, label=eval_y)) + valid_names.append(eval_name) + + # Add network params of the worker group to enable distributed training. 
+ config.update(ray.train.lightgbm.v2.get_network_params()) + + lightgbm.train( + params=config, + train_set=train_set, + num_boost_round=remaining_iters, + valid_sets=valid_sets, + valid_names=valid_names, + init_model=starting_model, + **lightgbm_train_kwargs, + ) + + +@PublicAPI(stability="beta") +class LightGBMTrainer(SimpleLightGBMTrainer): + """A Trainer for data parallel LightGBM training. + + This Trainer runs the LightGBM training loop in a distributed manner + using multiple Ray Actors. + + If you would like to take advantage of LightGBM's built-in handling + for features with the categorical data type, consider applying the + :class:`Categorizer` preprocessor to set the dtypes in the dataset. + + .. note:: + ``LightGBMTrainer`` does not modify or otherwise alter the working + of the LightGBM distributed training algorithm. + Ray only provides orchestration, data ingest and fault tolerance. + For more information on LightGBM distributed training, refer to + `LightGBM documentation `__. + + Example: + .. testcode:: + + import ray + + from ray.train.lightgbm import LightGBMTrainer + from ray.train import ScalingConfig + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)] + ) + trainer = LightGBMTrainer( + label_column="y", + params={"objective": "regression"}, + scaling_config=ScalingConfig(num_workers=3), + datasets={"train": train_dataset}, + ) + result = trainer.fit() + + .. testoutput:: + :hide: + + ... + + Args: + datasets: The Ray Datasets to use for training and validation. Must include a + "train" key denoting the training dataset. All non-training datasets will + be used as separate validation sets, each reporting a separate metric. + label_column: Name of the label column. A column with this name + must be present in the training dataset. + params: LightGBM training parameters passed to ``lightgbm.train()``. + Refer to `LightGBM documentation `_ + for a list of possible parameters. 
+ num_boost_round: Target number of boosting iterations (trees in the model). + Note that unlike in ``lightgbm.train``, this is the target number + of trees, meaning that if you set ``num_boost_round=10`` and pass a model + that has already been trained for 5 iterations, it will be trained for 5 + iterations more, instead of 10 more. + scaling_config: Configuration for how to scale data parallel training. + run_config: Configuration for the execution of the training run. + resume_from_checkpoint: A checkpoint to resume training from. + metadata: Dict that should be made available in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + **train_kwargs: Additional kwargs passed to ``lightgbm.train()`` function. + """ + + _handles_checkpoint_freq = True + _handles_checkpoint_at_end = True + + def __init__( + self, + *, + datasets: Dict[str, GenDataset], + label_column: str, + params: Dict[str, Any], + num_boost_round: int = 10, + scaling_config: Optional[ray.train.ScalingConfig] = None, + run_config: Optional[ray.train.RunConfig] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + metadata: Optional[Dict[str, Any]] = None, + dmatrix_params: Optional[Dict[str, Dict[str, Any]]] = _DEPRECATED_VALUE, + **train_kwargs, + ): + # TODO(justinvyu): [Deprecated] Remove in 2.11 + if dmatrix_params != _DEPRECATED_VALUE: + raise DeprecationWarning( + "`dmatrix_params` is deprecated, since XGBoostTrainer no longer " + "depends on the `xgboost_ray.RayDMatrix` utility. " + "You can remove this argument and use `dataset_config` instead " + "to customize Ray Dataset ingestion." 
+ ) + + # Initialize a default Ray Train metrics/checkpoint reporting callback if needed + callbacks = train_kwargs.get("callbacks", []) + user_supplied_callback = any( + isinstance(callback, RayTrainReportCallback) for callback in callbacks + ) + callback_kwargs = {} + if run_config: + checkpoint_frequency = run_config.checkpoint_config.checkpoint_frequency + checkpoint_at_end = run_config.checkpoint_config.checkpoint_at_end + + callback_kwargs["frequency"] = checkpoint_frequency + # Default `checkpoint_at_end=True` unless the user explicitly sets it. + callback_kwargs["checkpoint_at_end"] = ( + checkpoint_at_end if checkpoint_at_end is not None else True + ) + + if not user_supplied_callback: + callbacks.append(RayTrainReportCallback(**callback_kwargs)) + train_kwargs["callbacks"] = callbacks + + train_fn_per_worker = partial( + _lightgbm_train_fn_per_worker, + label_column=label_column, + num_boost_round=num_boost_round, + dataset_keys=set(datasets), + lightgbm_train_kwargs=train_kwargs, + ) + + super(LightGBMTrainer, self).__init__( + train_loop_per_worker=train_fn_per_worker, + train_loop_config=params, + scaling_config=scaling_config, + run_config=run_config, + datasets=datasets, + dataset_config=dataset_config, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ) -> lightgbm.Booster: + """Retrieve the LightGBM model stored in this checkpoint.""" + return RayTrainReportCallback.get_model(checkpoint) + + def _validate_attributes(self): + super()._validate_attributes() + + if TRAIN_DATASET_KEY not in self.datasets: + raise KeyError( + f"'{TRAIN_DATASET_KEY}' key must be preset in `datasets`. 
" + f"Got {list(self.datasets.keys())}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/lightgbm/v2.py b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/v2.py new file mode 100644 index 0000000000000000000000000000000000000000..2b943cdf3a1dcf28453966e5636329dcd1821679 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/lightgbm/v2.py @@ -0,0 +1,132 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.lightgbm.config import LightGBMConfig, get_network_params # noqa: F401 +from ray.train.trainer import GenDataset + +logger = logging.getLogger(__name__) + + +class LightGBMTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel LightGBM training. + + Example + ------- + + .. testcode:: + + import lightgbm as lgb + + import ray.data + import ray.train + from ray.train.lightgbm import RayTrainReportCallback + from ray.train.lightgbm.v2 import LightGBMTrainer + + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. Get the dataset shard for the worker and convert to a `lgb.Dataset` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + train_set = lgb.Dataset(train_X, label=train_y) + eval_set = lgb.Dataset(eval_X, label=eval_y) + + # 2. Run distributed data-parallel training. + # `get_network_params` sets up the necessary configurations for LightGBM + # to set up the data parallel training worker group on your Ray cluster. 
+ params = { + "objective": "regression", + # Adding the line below is the only change needed + # for your `lgb.train` call! + **ray.train.lightgbm.v2.get_network_params(), + } + lgb.train( + params, + train_set, + valid_sets=[eval_set], + valid_names=["eval"], + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32, 32 + 16)] + ) + trainer = LightGBMTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. + lightgbm_config: The configuration for setting up the distributed lightgbm + backend. See :class:`~ray.train.lightgbm.LightGBMConfig` for more info. + datasets: The Ray Datasets to use for training and validation. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Dataset are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. 
+ See :class:`~ray.train.RunConfig` for more info. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + lightgbm_config: Optional[LightGBMConfig] = None, + scaling_config: Optional[ray.train.ScalingConfig] = None, + run_config: Optional[ray.train.RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(LightGBMTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=lightgbm_config or LightGBMConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/predictor.py b/.venv/lib/python3.11/site-packages/ray/train/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..7e25bdaa8022bbd5b1cd1cc88397d32fe726f1dd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/predictor.py @@ -0,0 +1,254 @@ +import abc +from typing import Callable, Dict, Optional, Type, Union + +import numpy as np +import pandas as pd + +from ray.air.data_batch_type import DataBatchType +from ray.air.util.data_batch_conversion import ( + BatchFormat, + _convert_batch_type_to_numpy, + _convert_batch_type_to_pandas, +) +from ray.data import 
Preprocessor +from ray.train import Checkpoint +from ray.util.annotations import DeveloperAPI, PublicAPI + +try: + import pyarrow + + pa_table = pyarrow.Table +except ImportError: + pa_table = None + +# Reverse mapping from data batch type to batch format. +TYPE_TO_ENUM: Dict[Type[DataBatchType], BatchFormat] = { + np.ndarray: BatchFormat.NUMPY, + dict: BatchFormat.NUMPY, + pd.DataFrame: BatchFormat.PANDAS, +} + + +@PublicAPI(stability="beta") +class PredictorNotSerializableException(RuntimeError): + """Error raised when trying to serialize a Predictor instance.""" + + pass + + +@PublicAPI(stability="beta") +class Predictor(abc.ABC): + """Predictors load models from checkpoints to perform inference. + + .. note:: + The base ``Predictor`` class cannot be instantiated directly. Only one of + its subclasses can be used. + + **How does a Predictor work?** + + Predictors expose a ``predict`` method that accepts an input batch of type + ``DataBatchType`` and outputs predictions of the same type as the input batch. + + When the ``predict`` method is called the following occurs: + + - The input batch is converted into a pandas DataFrame. Tensor input (like a + ``np.ndarray``) will be converted into a single column Pandas Dataframe. + - If there is a :ref:`Preprocessor ` saved in the provided + :class:`Checkpoint `, the preprocessor will be used to + transform the DataFrame. + - The transformed DataFrame will be passed to the model for inference (via the + ``predictor._predict_pandas`` method). + - The predictions will be outputted by ``predict`` in the same type as the + original input. + + **How do I create a new Predictor?** + + To implement a new Predictor for your particular framework, you should subclass + the base ``Predictor`` and implement the following two methods: + + 1. ``_predict_pandas``: Given a pandas.DataFrame input, return a + pandas.DataFrame containing predictions. + 2. ``from_checkpoint``: Logic for creating a Predictor from a + :class:`Checkpoint `. 
+ 3. Optionally ``_predict_numpy`` for better performance when working with + tensor data to avoid extra copies from Pandas conversions. + """ + + def __init__(self, preprocessor: Optional[Preprocessor] = None): + """Subclasseses must call Predictor.__init__() to set a preprocessor.""" + self._preprocessor: Optional[Preprocessor] = preprocessor + # Whether tensor columns should be automatically cast from/to the tensor + # extension type at UDF boundaries. This can be overridden by subclasses. + self._cast_tensor_columns = False + + @classmethod + @abc.abstractmethod + def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "Predictor": + """Create a specific predictor from a checkpoint. + + Args: + checkpoint: Checkpoint to load predictor data from. + kwargs: Arguments specific to predictor implementations. + + Returns: + Predictor: Predictor object. + """ + raise NotImplementedError + + @classmethod + def from_pandas_udf( + cls, pandas_udf: Callable[[pd.DataFrame], pd.DataFrame] + ) -> "Predictor": + """Create a Predictor from a Pandas UDF. + + Args: + pandas_udf: A function that takes a pandas.DataFrame and other + optional kwargs and returns a pandas.DataFrame. + """ + + class PandasUDFPredictor(Predictor): + @classmethod + def from_checkpoint(cls, checkpoint: Checkpoint, **kwargs) -> "Predictor": + return PandasUDFPredictor() + + def _predict_pandas(self, df, **kwargs) -> "pd.DataFrame": + return pandas_udf(df, **kwargs) + + return PandasUDFPredictor() + + def get_preprocessor(self) -> Optional[Preprocessor]: + """Get the preprocessor to use prior to executing predictions.""" + return self._preprocessor + + def set_preprocessor(self, preprocessor: Optional[Preprocessor]) -> None: + """Set the preprocessor to use prior to executing predictions.""" + self._preprocessor = preprocessor + + @classmethod + @DeveloperAPI + def preferred_batch_format(cls) -> BatchFormat: + """Batch format hint for upstream producers to try yielding best block format. 
+ + The preferred batch format to use if both `_predict_pandas` and + `_predict_numpy` are implemented. Defaults to Pandas. + + Can be overriden by predictor classes depending on the framework type, + e.g. TorchPredictor prefers Numpy and XGBoostPredictor prefers Pandas as + native batch format. + + """ + return BatchFormat.PANDAS + + @classmethod + def _batch_format_to_use(cls) -> BatchFormat: + """Determine the batch format to use for the predictor.""" + has_pandas_implemented = cls._predict_pandas != Predictor._predict_pandas + has_numpy_implemented = cls._predict_numpy != Predictor._predict_numpy + if has_pandas_implemented and has_numpy_implemented: + return cls.preferred_batch_format() + elif has_pandas_implemented: + return BatchFormat.PANDAS + elif has_numpy_implemented: + return BatchFormat.NUMPY + else: + raise NotImplementedError( + f"Predictor {cls.__name__} must implement at least one of " + "`_predict_pandas` and `_predict_numpy`." + ) + + def _set_cast_tensor_columns(self): + """Enable automatic tensor column casting. + + If this is called on a predictor, the predictor will cast tensor columns to + NumPy ndarrays in the input to the preprocessors and cast tensor columns back to + the tensor extension type in the prediction outputs. + """ + self._cast_tensor_columns = True + + def predict(self, data: DataBatchType, **kwargs) -> DataBatchType: + """Perform inference on a batch of data. + + Args: + data: A batch of input data of type ``DataBatchType``. + kwargs: Arguments specific to predictor implementations. These are passed + directly to ``_predict_numpy`` or ``_predict_pandas``. + + Returns: + DataBatchType: + Prediction result. The return type will be the same as the input type. + """ + if not hasattr(self, "_preprocessor"): + raise NotImplementedError( + "Subclasses of Predictor must call Predictor.__init__(preprocessor)." 
+ ) + try: + batch_format = TYPE_TO_ENUM[type(data)] + except KeyError: + raise RuntimeError( + f"Invalid input data type of {type(data)}, supported " + f"types: {list(TYPE_TO_ENUM.keys())}" + ) + + if self._preprocessor: + data = self._preprocessor.transform_batch(data) + + batch_format_to_use = self._batch_format_to_use() + + # We can finish prediction as long as one predict method is implemented. + # For prediction, we have to return back in the same format as the input. + if batch_format == BatchFormat.PANDAS: + if batch_format_to_use == BatchFormat.PANDAS: + return self._predict_pandas( + _convert_batch_type_to_pandas(data), **kwargs + ) + elif batch_format_to_use == BatchFormat.NUMPY: + return _convert_batch_type_to_pandas( + self._predict_numpy(_convert_batch_type_to_numpy(data), **kwargs) + ) + elif batch_format == BatchFormat.NUMPY: + if batch_format_to_use == BatchFormat.PANDAS: + return _convert_batch_type_to_numpy( + self._predict_pandas(_convert_batch_type_to_pandas(data), **kwargs) + ) + elif batch_format_to_use == BatchFormat.NUMPY: + return self._predict_numpy(_convert_batch_type_to_numpy(data), **kwargs) + + @DeveloperAPI + def _predict_pandas(self, data: "pd.DataFrame", **kwargs) -> "pd.DataFrame": + """Perform inference on a Pandas DataFrame. + + Args: + data: A pandas DataFrame to perform predictions on. + kwargs: Arguments specific to the predictor implementation. + + Returns: + A pandas DataFrame containing the prediction result. + + """ + raise NotImplementedError + + @DeveloperAPI + def _predict_numpy( + self, data: Union[np.ndarray, Dict[str, np.ndarray]], **kwargs + ) -> Union[np.ndarray, Dict[str, np.ndarray]]: + """Perform inference on a Numpy data. + + All Predictors working with tensor data (like deep learning predictors) + should implement this method. + + Args: + data: A Numpy ndarray or dictionary of ndarrays to perform predictions on. + kwargs: Arguments specific to the predictor implementation. 
+ + Returns: + A Numpy ndarray or dictionary of ndarray containing the prediction result. + + """ + raise NotImplementedError + + def __reduce__(self): + raise PredictorNotSerializableException( + "Predictor instances are not serializable. Instead, you may want " + "to serialize a checkpoint and initialize the Predictor with " + "Predictor.from_checkpoint." + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/session.py b/.venv/lib/python3.11/site-packages/ray/train/session.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/train/trainer.py b/.venv/lib/python3.11/site-packages/ray/train/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..725ba029d766c37d838e456f58abd2182e836a8d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/trainer.py @@ -0,0 +1,194 @@ +import logging +import traceback +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, TypeVar, Union + +from ray.air._internal.util import ( + StartTraceback, + StartTracebackWithWorkerRank, + skip_exceptions, +) +from ray.data import Dataset +from ray.train import Checkpoint, DataConfig +from ray.train._internal.backend_executor import ( + BackendExecutor, + InactiveWorkerGroupError, + TrainBackendError, + TrainingWorkerError, +) +from ray.train._internal.session import _TrainingResult, _TrainSession, get_session +from ray.train._internal.utils import ActorWrapper +from ray.train.backend import BackendConfig +from ray.train.base_trainer import ( # noqa: F401 + BaseTrainer, + GenDataset, + TrainingFailedError, +) +from ray.util.annotations import DeveloperAPI + +T = TypeVar("T") +S = TypeVar("S") + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class TrainingIterator: + """An iterator over Train results. 
Returned by ``trainer.run_iterator``.""" + + def __init__( + self, + backend_executor: Union[BackendExecutor, ActorWrapper], + backend_config: BackendConfig, + train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]], + datasets: Dict[str, Dataset], + metadata: Dict[str, Any], + data_config: DataConfig, + checkpoint: Optional[Union[Dict, str, Path, Checkpoint]], + ): + self._backend_executor = backend_executor + self._backend = backend_config.backend_cls() + self._train_func = train_func + self._datasets = datasets + self._metadata = metadata + self._data_config = data_config + + self._start_training( + train_func=train_func, + datasets=self._datasets, + metadata=self._metadata, + data_config=self._data_config, + checkpoint=checkpoint, + ) + + self._finished_training = False + + def __iter__(self): + return self + + def _start_training( + self, + train_func, + datasets, + metadata, + data_config, + checkpoint: Optional[Checkpoint] = None, + ): + tune_session: _TrainSession = get_session() + assert tune_session, "`_start_training` should only be called from within Tune" + storage = tune_session.storage + + self._run_with_error_handling( + lambda: self._backend_executor.start_training( + train_func=train_func, + datasets=datasets, + metadata=metadata, + data_config=data_config, + storage=storage, + checkpoint=checkpoint, + ) + ) + + def _run_with_error_handling(self, func: Callable): + try: + return func() + except TrainingWorkerError: + # TODO(ml-team): This Train fault-tolerance code doesn't get used + # since max_retries=0 + # Workers have already been restarted. + logger.info( + "Workers have been successfully restarted. Resuming " + "training from latest checkpoint." + ) + self._start_training( + self._train_func, + self._datasets, + self._metadata, + self._data_config, + ) + return self._run_with_error_handling(func) + except InactiveWorkerGroupError: + raise RuntimeError( + "This Trainer is not active. 
It is either shutdown " + "already or never started in the first place. " + "Either create a new Trainer or start this one." + ) from None + except TrainBackendError: + raise RuntimeError( + "Training failed. You should not be seeing " + "this error and this is a bug. Please create " + "a new issue at " + "https://github.com/ray-project/ray." + ) from None + + def __next__(self): + if self.is_finished(): + self._backend_executor.report_final_run_status(errored=False) + raise StopIteration + try: + next_results = self._run_with_error_handling(self._fetch_next_result) + if next_results is None: + self._backend_executor.report_final_run_status(errored=False) + self._run_with_error_handling(self._finish_training) + self._finished_training = True + raise StopIteration + else: + return next_results + except StartTraceback as e: + # If this is a StartTraceback, then this is a user error. + # We raise it directly + if isinstance(e, StartTracebackWithWorkerRank): + failed_rank = e.worker_rank + else: + failed_rank = None + + # Extract the stack trace from the exception + e = skip_exceptions(e) + stack_trace = "".join( + traceback.format_exception(type(e), e, e.__traceback__) + ) + + self._backend_executor.report_final_run_status( + errored=True, stack_trace=stack_trace, failed_rank=failed_rank + ) + try: + # Exception raised in at least one training worker. Immediately raise + # this error to the user and do not attempt to terminate gracefully. + self._backend_executor.shutdown(graceful_termination=False) + self._finished_training = True + except Exception: + pass + raise + + def _fetch_next_result(self) -> Optional[List[Dict]]: + """Fetch next results produced by ``session.report()`` from each worker. + + Assumes ``start_training`` has already been called. + + Returns: + A list of dictionaries of values passed to ``session.report()`` from + each worker. Each item corresponds to an intermediate result + a single worker. If there are no more items to fetch, + returns None. 
+ """ + results = self._backend_executor.get_next_results() + if results is None: + return None + assert all(isinstance(result, _TrainingResult) for result in results) + return results + + def _finish_training(self): + """Finish training and return final results. Propagate any exceptions. + + Blocks until training is finished on all workers. + + Assumes `start_training` has already been called. + + Returns: + A list of return values from calling ``train_func`` on each worker. + Each item corresponds to the return value from a single worker. + """ + return self._backend_executor.finish_training() + + def is_finished(self) -> bool: + return self._finished_training diff --git a/.venv/lib/python3.11/site-packages/ray/train/utils.py b/.venv/lib/python3.11/site-packages/ray/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..98b11f1f6091fba061679af2e08bec556315e7d0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/utils.py @@ -0,0 +1,19 @@ +import warnings + +from ray.util.annotations import RayDeprecationWarning + + +def _copy_doc(copy_func): + def wrapped(func): + func.__doc__ = copy_func.__doc__ + return func + + return wrapped + + +def _log_deprecation_warning(message): + warnings.warn( + message, + RayDeprecationWarning, + stacklevel=2, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__init__.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2d1c88d11b0b1049cf834fb75a39e669f5fbb2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__init__.py @@ -0,0 +1,20 @@ +from ray.train.v2._internal.constants import is_v2_enabled +from ray.train.xgboost._xgboost_utils import RayTrainReportCallback +from ray.train.xgboost.config import XGBoostConfig +from ray.train.xgboost.xgboost_checkpoint import XGBoostCheckpoint +from ray.train.xgboost.xgboost_predictor import XGBoostPredictor +from 
ray.train.xgboost.xgboost_trainer import XGBoostTrainer + +if is_v2_enabled(): + from ray.train.v2.xgboost.xgboost_trainer import XGBoostTrainer # noqa: F811 + +__all__ = [ + "RayTrainReportCallback", + "XGBoostCheckpoint", + "XGBoostConfig", + "XGBoostPredictor", + "XGBoostTrainer", +] + + +# DO NOT ADD ANYTHING AFTER THIS LINE. diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60d4dab19bf2a5ea8173eef5178246c33e8472c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/_xgboost_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/_xgboost_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5e0834c8e82f32975a4f8fe70874b638be9489f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/_xgboost_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..435ff698a807dc7145325124a90b7867ba87cf3b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/v2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/v2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..836811088ecf5a449f67c93c226d6651a1baa0fb Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/v2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_checkpoint.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_checkpoint.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11bcd06e38caa845848b237bc2a4bc1c56ac7d88 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_checkpoint.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_predictor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_predictor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4eb74ce28f5b08090636ed88a57595fe986c8599 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_predictor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_trainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30f8a3da6ebd238947ba3b0fcdb3c8ee43747e9e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/train/xgboost/__pycache__/xgboost_trainer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/_xgboost_utils.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/_xgboost_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..459dfcf07a2208659f469eb6769f5f3c5e2e8fbc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/_xgboost_utils.py @@ -0,0 +1,210 @@ +import tempfile +from collections import OrderedDict +from contextlib import contextmanager +from pathlib import Path +from typing import Callable, Dict, 
List, Optional, Union + +from xgboost.core import Booster + +import ray.train +from ray.train import Checkpoint +from ray.tune.utils import flatten_dict +from ray.util.annotations import PublicAPI + +try: + from xgboost.callback import TrainingCallback +except ImportError: + + class TrainingCallback: + pass + + +class TuneCallback(TrainingCallback): + # TODO(justinvyu): [code_removal] Remove this after enforcing min xgboost version. + """Base class for Tune's XGBoost callbacks.""" + + def __call__(self, env): + """Compatibility with xgboost<1.3""" + return self.after_iteration( + env.model, env.iteration, env.evaluation_result_list + ) + + def after_iteration(self, model: Booster, epoch: int, evals_log: Dict): + raise NotImplementedError + + +@PublicAPI(stability="beta") +class RayTrainReportCallback(TuneCallback): + """XGBoost callback to save checkpoints and report metrics. + + Args: + metrics: Metrics to report. If this is a list, + each item describes the metric key reported to XGBoost, + and it will be reported under the same name. + This can also be a dict of {: }, + which can be used to rename xgboost default metrics. + filename: Customize the saved checkpoint file type by passing + a filename. Defaults to "model.ubj". + frequency: How often to save checkpoints, in terms of iterations. + Defaults to 0 (no checkpoints are saved during training). + checkpoint_at_end: Whether or not to save a checkpoint at the end of training. + results_postprocessing_fn: An optional Callable that takes in + the metrics dict that will be reported (after it has been flattened) + and returns a modified dict. For example, this can be used to + average results across CV fold when using ``xgboost.cv``. + + Examples + -------- + + Reporting checkpoints and metrics to Ray Tune when running many + independent xgboost trials (without data parallelism within a trial). + + .. 
testcode:: + :skipif: True + + import xgboost + + from ray.tune import Tuner + from ray.train.xgboost import RayTrainReportCallback + + def train_fn(config): + # Report log loss to Ray Tune after each validation epoch. + bst = xgboost.train( + ..., + callbacks=[ + RayTrainReportCallback( + metrics={"loss": "eval-logloss"}, frequency=1 + ) + ], + ) + + tuner = Tuner(train_fn) + results = tuner.fit() + + Loading a model from a checkpoint reported by this callback. + + .. testcode:: + :skipif: True + + from ray.train.xgboost import RayTrainReportCallback + + # Get a `Checkpoint` object that is saved by the callback during training. + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + """ + + CHECKPOINT_NAME = "model.ubj" + + def __init__( + self, + metrics: Optional[Union[str, List[str], Dict[str, str]]] = None, + filename: str = CHECKPOINT_NAME, + frequency: int = 0, + checkpoint_at_end: bool = True, + results_postprocessing_fn: Optional[ + Callable[[Dict[str, Union[float, List[float]]]], Dict[str, float]] + ] = None, + ): + if isinstance(metrics, str): + metrics = [metrics] + self._metrics = metrics + self._filename = filename + self._frequency = frequency + self._checkpoint_at_end = checkpoint_at_end + self._results_postprocessing_fn = results_postprocessing_fn + + # Keeps track of the eval metrics from the last iteration, + # so that the latest metrics can be reported with the checkpoint + # at the end of training. + self._evals_log = None + # Keep track of the last checkpoint iteration to avoid double-checkpointing + # when using `checkpoint_at_end=True`. + self._last_checkpoint_iteration = None + + @classmethod + def get_model( + cls, checkpoint: Checkpoint, filename: str = CHECKPOINT_NAME + ) -> Booster: + """Retrieve the model stored in a checkpoint reported by this callback. + + Args: + checkpoint: The checkpoint object returned by a training run. + The checkpoint should be saved by an instance of this callback. 
+ filename: The filename to load the model from, which should match + the filename used when creating the callback. + """ + with checkpoint.as_directory() as checkpoint_path: + booster = Booster() + booster.load_model(Path(checkpoint_path, filename).as_posix()) + return booster + + def _get_report_dict(self, evals_log): + if isinstance(evals_log, OrderedDict): + # xgboost>=1.3 + result_dict = flatten_dict(evals_log, delimiter="-") + for k in list(result_dict): + result_dict[k] = result_dict[k][-1] + else: + # xgboost<1.3 + result_dict = dict(evals_log) + if not self._metrics: + report_dict = result_dict + else: + report_dict = {} + for key in self._metrics: + if isinstance(self._metrics, dict): + metric = self._metrics[key] + else: + metric = key + report_dict[key] = result_dict[metric] + + if self._results_postprocessing_fn: + report_dict = self._results_postprocessing_fn(report_dict) + + return report_dict + + @contextmanager + def _get_checkpoint(self, model: Booster) -> Optional[Checkpoint]: + # NOTE: The world rank returns None for Tune usage without Train. + if ray.train.get_context().get_world_rank() in (0, None): + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + model.save_model(Path(temp_checkpoint_dir, self._filename).as_posix()) + yield Checkpoint(temp_checkpoint_dir) + else: + yield None + + def after_iteration(self, model: Booster, epoch: int, evals_log: Dict): + self._evals_log = evals_log + + checkpointing_disabled = self._frequency == 0 + # Ex: if frequency=2, checkpoint at epoch 1, 3, 5, ... 
(counting from 0) + should_checkpoint = ( + not checkpointing_disabled and (epoch + 1) % self._frequency == 0 + ) + + report_dict = self._get_report_dict(evals_log) + if should_checkpoint: + self._last_checkpoint_iteration = epoch + with self._get_checkpoint(model=model) as checkpoint: + ray.train.report(report_dict, checkpoint=checkpoint) + else: + ray.train.report(report_dict) + + def after_training(self, model: Booster) -> Booster: + if not self._checkpoint_at_end: + return model + + if ( + self._last_checkpoint_iteration is not None + and model.num_boosted_rounds() - 1 == self._last_checkpoint_iteration + ): + # Avoids a duplicate checkpoint if the checkpoint frequency happens + # to align with the last iteration. + return model + + report_dict = self._get_report_dict(self._evals_log) if self._evals_log else {} + with self._get_checkpoint(model=model) as checkpoint: + ray.train.report(report_dict, checkpoint=checkpoint) + + return model diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/config.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/config.py new file mode 100644 index 0000000000000000000000000000000000000000..725326c70ffbff9d679c7498fe2f01c77e0c0531 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/config.py @@ -0,0 +1,202 @@ +import json +import logging +import os +import threading +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Optional + +import xgboost +from packaging.version import Version +from xgboost import RabitTracker +from xgboost.collective import CommunicatorContext + +import ray +from ray.train._internal.worker_group import WorkerGroup +from ray.train.backend import Backend, BackendConfig + +logger = logging.getLogger(__name__) + + +@dataclass +class XGBoostConfig(BackendConfig): + """Configuration for xgboost collective communication setup. 
+ + Ray Train will set up the necessary coordinator processes and environment + variables for your workers to communicate with each other. + Additional configuration options can be passed into the + `xgboost.collective.CommunicatorContext` that wraps your own `xgboost.train` code. + + See the `xgboost.collective` module for more information: + https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/collective.py + + Args: + xgboost_communicator: The backend to use for collective communication for + distributed xgboost training. For now, only "rabit" is supported. + """ + + xgboost_communicator: str = "rabit" + + @property + def train_func_context(self): + @contextmanager + def collective_communication_context(): + with CommunicatorContext(**_get_xgboost_args()): + yield + + return collective_communication_context + + @property + def backend_cls(self): + if self.xgboost_communicator == "rabit": + return ( + _XGBoostRabitBackend + if Version(xgboost.__version__) >= Version("2.1.0") + else _XGBoostRabitBackend_pre_xgb210 + ) + + raise NotImplementedError(f"Unsupported backend: {self.xgboost_communicator}") + + +class _XGBoostRabitBackend(Backend): + def __init__(self): + self._tracker: Optional[RabitTracker] = None + self._wait_thread: Optional[threading.Thread] = None + + def _setup_xgboost_distributed_backend(self, worker_group: WorkerGroup): + # Set up the rabit tracker on the Train driver. + num_workers = len(worker_group) + rabit_args = {"n_workers": num_workers} + train_driver_ip = ray.util.get_node_ip_address() + + # NOTE: sortby="task" is needed to ensure that the xgboost worker ranks + # align with Ray Train worker ranks. + # The worker ranks will be sorted by `dmlc_task_id`, + # which is defined below. 
+ self._tracker = RabitTracker( + n_workers=num_workers, host_ip=train_driver_ip, sortby="task" + ) + self._tracker.start() + + # The RabitTracker is started in a separate thread, and the + # `wait_for` method must be called for `worker_args` to return. + self._wait_thread = threading.Thread(target=self._tracker.wait_for, daemon=True) + self._wait_thread.start() + + rabit_args.update(self._tracker.worker_args()) + + start_log = ( + "RabitTracker coordinator started with parameters:\n" + f"{json.dumps(rabit_args, indent=2)}" + ) + logger.debug(start_log) + + def set_xgboost_communicator_args(args): + import ray.train + + args["dmlc_task_id"] = ( + f"[xgboost.ray-rank={ray.train.get_context().get_world_rank():08}]:" + f"{ray.get_runtime_context().get_actor_id()}" + ) + + _set_xgboost_args(args) + + worker_group.execute(set_xgboost_communicator_args, rabit_args) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: XGBoostConfig + ): + assert backend_config.xgboost_communicator == "rabit" + self._setup_xgboost_distributed_backend(worker_group) + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: XGBoostConfig): + timeout = 5 + + if self._wait_thread is not None: + self._wait_thread.join(timeout=timeout) + + if self._wait_thread.is_alive(): + logger.warning( + "During shutdown, the RabitTracker thread failed to join " + f"within {timeout} seconds. " + "The process will still be terminated as part of Ray actor cleanup." + ) + + +class _XGBoostRabitBackend_pre_xgb210(Backend): + def __init__(self): + self._tracker: Optional[RabitTracker] = None + + def _setup_xgboost_distributed_backend(self, worker_group: WorkerGroup): + # Set up the rabit tracker on the Train driver. + num_workers = len(worker_group) + rabit_args = {"DMLC_NUM_WORKER": num_workers} + train_driver_ip = ray.util.get_node_ip_address() + + # NOTE: sortby="task" is needed to ensure that the xgboost worker ranks + # align with Ray Train worker ranks. 
+ # The worker ranks will be sorted by `DMLC_TASK_ID`, + # which is defined below. + self._tracker = RabitTracker( + n_workers=num_workers, host_ip=train_driver_ip, sortby="task" + ) + self._tracker.start(n_workers=num_workers) + + worker_args = self._tracker.worker_envs() + rabit_args.update(worker_args) + + start_log = ( + "RabitTracker coordinator started with parameters:\n" + f"{json.dumps(rabit_args, indent=2)}" + ) + logger.debug(start_log) + + def set_xgboost_env_vars(): + import ray.train + + for k, v in rabit_args.items(): + os.environ[k] = str(v) + + # Ranks are assigned in increasing order of the worker's task id. + # This task id will be sorted by increasing world rank. + os.environ["DMLC_TASK_ID"] = ( + f"[xgboost.ray-rank={ray.train.get_context().get_world_rank():08}]:" + f"{ray.get_runtime_context().get_actor_id()}" + ) + + worker_group.execute(set_xgboost_env_vars) + + def on_training_start( + self, worker_group: WorkerGroup, backend_config: XGBoostConfig + ): + assert backend_config.xgboost_communicator == "rabit" + self._setup_xgboost_distributed_backend(worker_group) + + def on_shutdown(self, worker_group: WorkerGroup, backend_config: XGBoostConfig): + if not self._tracker: + return + + timeout = 5 + self._tracker.thread.join(timeout=timeout) + + if self._tracker.thread.is_alive(): + logger.warning( + "During shutdown, the RabitTracker thread failed to join " + f"within {timeout} seconds. " + "The process will still be terminated as part of Ray actor cleanup." 
+ ) + + +_xgboost_args: dict = {} +_xgboost_args_lock = threading.Lock() + + +def _set_xgboost_args(args): + with _xgboost_args_lock: + global _xgboost_args + _xgboost_args = args + + +def _get_xgboost_args() -> dict: + with _xgboost_args_lock: + return _xgboost_args diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/v2.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/v2.py new file mode 100644 index 0000000000000000000000000000000000000000..2494b479eb12c933a921cd927904a6d38ff34b49 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/v2.py @@ -0,0 +1,133 @@ +import logging +from typing import Any, Callable, Dict, Optional, Union + +import ray.train +from ray.train import Checkpoint +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.trainer import GenDataset +from ray.train.xgboost import XGBoostConfig + +logger = logging.getLogger(__name__) + + +class XGBoostTrainer(DataParallelTrainer): + """A Trainer for distributed data-parallel XGBoost training. + + Example + ------- + + .. testcode:: + + import xgboost + + import ray.data + import ray.train + from ray.train.xgboost import RayTrainReportCallback + from ray.train.xgboost.v2 import XGBoostTrainer + + def train_fn_per_worker(config: dict): + # (Optional) Add logic to resume training state from a checkpoint. + # ray.train.get_checkpoint() + + # 1. 
Get the dataset shard for the worker and convert to a `xgboost.DMatrix` + train_ds_iter, eval_ds_iter = ( + ray.train.get_dataset_shard("train"), + ray.train.get_dataset_shard("validation"), + ) + train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize() + + train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas() + train_X, train_y = train_df.drop("y", axis=1), train_df["y"] + eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"] + + dtrain = xgboost.DMatrix(train_X, label=train_y) + deval = xgboost.DMatrix(eval_X, label=eval_y) + + params = { + "tree_method": "approx", + "objective": "reg:squarederror", + "eta": 1e-4, + "subsample": 0.5, + "max_depth": 2, + } + + # 2. Do distributed data-parallel training. + # Ray Train sets up the necessary coordinator processes and + # environment variables for your workers to communicate with each other. + bst = xgboost.train( + params, + dtrain=dtrain, + evals=[(deval, "validation")], + num_boost_round=10, + callbacks=[RayTrainReportCallback()], + ) + + train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)]) + eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)]) + trainer = XGBoostTrainer( + train_fn_per_worker, + datasets={"train": train_ds, "validation": eval_ds}, + scaling_config=ray.train.ScalingConfig(num_workers=4), + ) + result = trainer.fit() + booster = RayTrainReportCallback.get_model(result.checkpoint) + + .. testoutput:: + :hide: + + ... + + Args: + train_loop_per_worker: The training function to execute on each worker. + This function can either take in zero arguments or a single ``Dict`` + argument which is set by defining ``train_loop_config``. + Within this function you can use any of the + :ref:`Ray Train Loop utilities `. + train_loop_config: A configuration ``Dict`` to pass in as an argument to + ``train_loop_per_worker``. + This is typically used for specifying hyperparameters. 
+ xgboost_config: The configuration for setting up the distributed xgboost + backend. Defaults to using the "rabit" backend. + See :class:`~ray.train.xgboost.XGBoostConfig` for more info. + datasets: The Ray Datasets to use for training and validation. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Datasets are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + scaling_config: The configuration for how to scale data parallel training. + ``num_workers`` determines how many Python processes are used for training, + and ``use_gpu`` determines whether or not each process should use GPUs. + See :class:`~ray.train.ScalingConfig` for more info. + run_config: The configuration for the execution of the training run. + See :class:`~ray.train.RunConfig` for more info. + resume_from_checkpoint: A checkpoint to resume training from. + This checkpoint can be accessed from within ``train_loop_per_worker`` + by calling ``ray.train.get_checkpoint()``. + metadata: Dict that should be made available via + `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. 
+ """ + + def __init__( + self, + train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]], + *, + train_loop_config: Optional[Dict] = None, + xgboost_config: Optional[XGBoostConfig] = None, + scaling_config: Optional[ray.train.ScalingConfig] = None, + run_config: Optional[ray.train.RunConfig] = None, + datasets: Optional[Dict[str, GenDataset]] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + metadata: Optional[Dict[str, Any]] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + ): + super(XGBoostTrainer, self).__init__( + train_loop_per_worker=train_loop_per_worker, + train_loop_config=train_loop_config, + backend_config=xgboost_config or XGBoostConfig(), + scaling_config=scaling_config, + dataset_config=dataset_config, + run_config=run_config, + datasets=datasets, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_checkpoint.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8b70465b5c01f85d8f16f841eda2c0969748b8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_checkpoint.py @@ -0,0 +1,75 @@ +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +import xgboost + +from ray.train._internal.framework_checkpoint import FrameworkCheckpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="beta") +class XGBoostCheckpoint(FrameworkCheckpoint): + """A :py:class:`~ray.train.Checkpoint` with XGBoost-specific functionality.""" + + MODEL_FILENAME = "model.json" + + @classmethod + def from_model( + cls, + booster: xgboost.Booster, + *, + preprocessor: Optional["Preprocessor"] = None, + path: Optional[str] = None, + ) -> "XGBoostCheckpoint": + """Create a 
:py:class:`~ray.train.Checkpoint` that stores an XGBoost + model. + + Args: + booster: The XGBoost model to store in the checkpoint. + preprocessor: A fitted preprocessor to be applied before inference. + path: The path to the directory where the checkpoint file will be saved. + This should start as an empty directory, since the *entire* + directory will be treated as the checkpoint when reported. + By default, a temporary directory will be created. + + Returns: + An :py:class:`XGBoostCheckpoint` containing the specified ``Estimator``. + + Examples: + + ... testcode:: + + import numpy as np + import ray + from ray.train.xgboost import XGBoostCheckpoint + import xgboost + + train_X = np.array([[1, 2], [3, 4]]) + train_y = np.array([0, 1]) + + model = xgboost.XGBClassifier().fit(train_X, train_y) + checkpoint = XGBoostCheckpoint.from_model(model.get_booster()) + + """ + checkpoint_path = Path(path or tempfile.mkdtemp()) + + if not checkpoint_path.is_dir(): + raise ValueError(f"`path` must be a directory, but got: {checkpoint_path}") + + booster.save_model(checkpoint_path.joinpath(cls.MODEL_FILENAME).as_posix()) + + checkpoint = cls.from_directory(checkpoint_path.as_posix()) + if preprocessor: + checkpoint.set_preprocessor(preprocessor) + return checkpoint + + def get_model(self) -> xgboost.Booster: + """Retrieve the XGBoost model stored in this checkpoint.""" + with self.as_directory() as checkpoint_path: + booster = xgboost.Booster() + booster.load_model(Path(checkpoint_path, self.MODEL_FILENAME).as_posix()) + return booster diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_predictor.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..1b319b93b299bc02a5b83a2f1cdcfa1e8fab6e8e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_predictor.py @@ -0,0 +1,160 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, 
Union + +import pandas as pd +import xgboost + +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.data_batch_type import DataBatchType +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed +from ray.train.predictor import Predictor +from ray.train.xgboost import XGBoostCheckpoint +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.data.preprocessor import Preprocessor + + +@PublicAPI(stability="beta") +class XGBoostPredictor(Predictor): + """A predictor for XGBoost models. + + Args: + model: The XGBoost booster to use for predictions. + preprocessor: A preprocessor used to transform data batches prior + to prediction. + """ + + def __init__( + self, model: xgboost.Booster, preprocessor: Optional["Preprocessor"] = None + ): + self.model = model + super().__init__(preprocessor) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r})" + ) + + @classmethod + def from_checkpoint(cls, checkpoint: XGBoostCheckpoint) -> "XGBoostPredictor": + """Instantiate the predictor from a Checkpoint. + + This is a helper constructor that instantiates the predictor from a + framework-specific XGBoost checkpoint. + + Args: + checkpoint: The checkpoint to load the model and preprocessor from. + + """ + model = checkpoint.get_model() + preprocessor = checkpoint.get_preprocessor() + return cls(model=model, preprocessor=preprocessor) + + def predict( + self, + data: DataBatchType, + feature_columns: Optional[Union[List[str], List[int]]] = None, + dmatrix_kwargs: Optional[Dict[str, Any]] = None, + **predict_kwargs, + ) -> DataBatchType: + """Run inference on data batch. + + The data is converted into an XGBoost DMatrix before being inputted to + the model. + + Args: + data: A batch of input data. + feature_columns: The names or indices of the columns in the + data to use as features to predict on. If None, then use + all columns in ``data``. 
+ dmatrix_kwargs: Dict of keyword arguments passed to ``xgboost.DMatrix``. + **predict_kwargs: Keyword arguments passed to ``xgboost.Booster.predict``. + + + Examples: + + .. testcode:: + + import numpy as np + import xgboost as xgb + from ray.train.xgboost import XGBoostPredictor + train_X = np.array([[1, 2], [3, 4]]) + train_y = np.array([0, 1]) + model = xgb.XGBClassifier().fit(train_X, train_y) + predictor = XGBoostPredictor(model=model.get_booster()) + data = np.array([[1, 2], [3, 4]]) + predictions = predictor.predict(data) + # Only use first and second column as the feature + data = np.array([[1, 2, 8], [3, 4, 9]]) + predictions = predictor.predict(data, feature_columns=[0, 1]) + + .. testcode:: + + import pandas as pd + import xgboost as xgb + from ray.train.xgboost import XGBoostPredictor + train_X = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + train_y = pd.Series([0, 1]) + model = xgb.XGBClassifier().fit(train_X, train_y) + predictor = XGBoostPredictor(model=model.get_booster()) + # Pandas dataframe. + data = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + predictions = predictor.predict(data) + # Only use first and second column as the feature + data = pd.DataFrame([[1, 2, 8], [3, 4, 9]], columns=["A", "B", "C"]) + predictions = predictor.predict(data, feature_columns=["A", "B"]) + + + Returns: + Prediction result. 
+ + """ + return Predictor.predict( + self, + data, + feature_columns=feature_columns, + dmatrix_kwargs=dmatrix_kwargs, + **predict_kwargs, + ) + + def _predict_pandas( + self, + data: "pd.DataFrame", + feature_columns: Optional[Union[List[str], List[int]]] = None, + dmatrix_kwargs: Optional[Dict[str, Any]] = None, + **predict_kwargs, + ) -> "pd.DataFrame": + dmatrix_kwargs = dmatrix_kwargs or {} + + feature_names = None + if TENSOR_COLUMN_NAME in data: + data = data[TENSOR_COLUMN_NAME].to_numpy() + data = _unwrap_ndarray_object_type_if_needed(data) + if feature_columns: + # In this case feature_columns is a list of integers + data = data[:, feature_columns] + elif feature_columns: + # feature_columns is a list of integers or strings + data = data[feature_columns].to_numpy() + # Only set the feature names if they are strings + if all(isinstance(fc, str) for fc in feature_columns): + feature_names = feature_columns + else: + feature_columns = data.columns.tolist() + data = data.to_numpy() + + if all(isinstance(fc, str) for fc in feature_columns): + feature_names = feature_columns + + if feature_names: + dmatrix_kwargs["feature_names"] = feature_names + + matrix = xgboost.DMatrix(data, **dmatrix_kwargs) + df = pd.DataFrame(self.model.predict(matrix, **predict_kwargs)) + df.columns = ( + ["predictions"] + if len(df.columns) == 1 + else [f"predictions_{i}" for i in range(len(df.columns))] + ) + return df diff --git a/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_trainer.py b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8b4f49b0f9ba6c3f0c60c06b346ea1514381fcab --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/train/xgboost/xgboost_trainer.py @@ -0,0 +1,222 @@ +import logging +from functools import partial +from typing import Any, Dict, Optional + +import xgboost +from packaging.version import Version + +import ray.train +from ray.train import 
Checkpoint +from ray.train.constants import _DEPRECATED_VALUE, TRAIN_DATASET_KEY +from ray.train.trainer import GenDataset +from ray.train.xgboost import RayTrainReportCallback +from ray.train.xgboost.v2 import XGBoostTrainer as SimpleXGBoostTrainer +from ray.util.annotations import PublicAPI + +logger = logging.getLogger(__name__) + + +def _xgboost_train_fn_per_worker( + config: dict, + label_column: str, + num_boost_round: int, + dataset_keys: set, + xgboost_train_kwargs: dict, +): + checkpoint = ray.train.get_checkpoint() + starting_model = None + remaining_iters = num_boost_round + if checkpoint: + starting_model = RayTrainReportCallback.get_model(checkpoint) + starting_iter = starting_model.num_boosted_rounds() + remaining_iters = num_boost_round - starting_iter + logger.info( + f"Model loaded from checkpoint will train for " + f"additional {remaining_iters} iterations (trees) in order " + "to achieve the target number of iterations " + f"({num_boost_round=})." + ) + + train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY) + train_df = train_ds_iter.materialize().to_pandas() + + eval_ds_iters = { + k: ray.train.get_dataset_shard(k) + for k in dataset_keys + if k != TRAIN_DATASET_KEY + } + eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()} + + train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column] + dtrain = xgboost.DMatrix(train_X, label=train_y) + + # NOTE: Include the training dataset in the evaluation datasets. + # This allows `train-*` metrics to be calculated and reported. 
+ evals = [(dtrain, TRAIN_DATASET_KEY)] + + for eval_name, eval_df in eval_dfs.items(): + eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column] + evals.append((xgboost.DMatrix(eval_X, label=eval_y), eval_name)) + + evals_result = {} + xgboost.train( + config, + dtrain=dtrain, + evals=evals, + evals_result=evals_result, + num_boost_round=remaining_iters, + xgb_model=starting_model, + **xgboost_train_kwargs, + ) + + +@PublicAPI(stability="beta") +class XGBoostTrainer(SimpleXGBoostTrainer): + """A Trainer for data parallel XGBoost training. + + This Trainer runs the XGBoost training loop in a distributed manner + using multiple Ray Actors. + + .. note:: + ``XGBoostTrainer`` does not modify or otherwise alter the working + of the XGBoost distributed training algorithm. + Ray only provides orchestration, data ingest and fault tolerance. + For more information on XGBoost distributed training, refer to + `XGBoost documentation `__. + + Example: + .. testcode:: + + import ray + + from ray.train.xgboost import XGBoostTrainer + from ray.train import ScalingConfig + + train_dataset = ray.data.from_items( + [{"x": x, "y": x + 1} for x in range(32)]) + trainer = XGBoostTrainer( + label_column="y", + params={"objective": "reg:squarederror"}, + scaling_config=ScalingConfig(num_workers=3), + datasets={"train": train_dataset}, + ) + result = trainer.fit() + + .. testoutput:: + :hide: + + ... + + Args: + datasets: The Ray Datasets to use for training and validation. Must include a + "train" key denoting the training dataset. All non-training datasets will + be used as separate validation sets, each reporting a separate metric. + label_column: Name of the label column. A column with this name + must be present in the training dataset. + params: XGBoost training parameters. + Refer to `XGBoost documentation `_ + for a list of possible parameters. + num_boost_round: Target number of boosting iterations (trees in the model). 
+ Note that unlike in ``xgboost.train``, this is the target number + of trees, meaning that if you set ``num_boost_round=10`` and pass a model + that has already been trained for 5 iterations, it will be trained for 5 + iterations more, instead of 10 more. + scaling_config: Configuration for how to scale data parallel training. + run_config: Configuration for the execution of the training run. + dataset_config: The configuration for ingesting the input ``datasets``. + By default, all the Ray Datasets are split equally across workers. + See :class:`~ray.train.DataConfig` for more details. + resume_from_checkpoint: A checkpoint to resume training from. + metadata: Dict that should be made available in `checkpoint.get_metadata()` + for checkpoints saved from this Trainer. Must be JSON-serializable. + **train_kwargs: Additional kwargs passed to ``xgboost.train()`` function. + """ + + _handles_checkpoint_freq = True + _handles_checkpoint_at_end = True + + def __init__( + self, + *, + datasets: Dict[str, GenDataset], + label_column: str, + params: Dict[str, Any], + dmatrix_params: Optional[Dict[str, Dict[str, Any]]] = _DEPRECATED_VALUE, + num_boost_round: int = 10, + scaling_config: Optional[ray.train.ScalingConfig] = None, + run_config: Optional[ray.train.RunConfig] = None, + dataset_config: Optional[ray.train.DataConfig] = None, + resume_from_checkpoint: Optional[Checkpoint] = None, + metadata: Optional[Dict[str, Any]] = None, + **train_kwargs, + ): + if Version(xgboost.__version__) < Version("1.7.0"): + raise ImportError( + "`XGBoostTrainer` requires the `xgboost` version to be >= 1.7.0. " + 'Upgrade with: `pip install -U "xgboost>=1.7"`' + ) + + # TODO(justinvyu): [Deprecated] Remove in 2.11 + if dmatrix_params != _DEPRECATED_VALUE: + raise DeprecationWarning( + "`dmatrix_params` is deprecated, since XGBoostTrainer no longer " + "depends on the `xgboost_ray.RayDMatrix` utility. 
" + "You can remove this argument and use `dataset_config` instead " + "to customize Ray Dataset ingestion." + ) + + # Initialize a default Ray Train metrics/checkpoint reporting callback if needed + callbacks = train_kwargs.get("callbacks", []) + user_supplied_callback = any( + isinstance(callback, RayTrainReportCallback) for callback in callbacks + ) + callback_kwargs = {} + if run_config: + checkpoint_frequency = run_config.checkpoint_config.checkpoint_frequency + checkpoint_at_end = run_config.checkpoint_config.checkpoint_at_end + + callback_kwargs["frequency"] = checkpoint_frequency + # Default `checkpoint_at_end=True` unless the user explicitly sets it. + callback_kwargs["checkpoint_at_end"] = ( + checkpoint_at_end if checkpoint_at_end is not None else True + ) + + if not user_supplied_callback: + callbacks.append(RayTrainReportCallback(**callback_kwargs)) + train_kwargs["callbacks"] = callbacks + + train_fn_per_worker = partial( + _xgboost_train_fn_per_worker, + label_column=label_column, + num_boost_round=num_boost_round, + dataset_keys=set(datasets), + xgboost_train_kwargs=train_kwargs, + ) + + super(XGBoostTrainer, self).__init__( + train_loop_per_worker=train_fn_per_worker, + train_loop_config=params, + scaling_config=scaling_config, + run_config=run_config, + datasets=datasets, + dataset_config=dataset_config, + resume_from_checkpoint=resume_from_checkpoint, + metadata=metadata, + ) + + @classmethod + def get_model( + cls, + checkpoint: Checkpoint, + ) -> xgboost.Booster: + """Retrieve the XGBoost model stored in this checkpoint.""" + return RayTrainReportCallback.get_model(checkpoint) + + def _validate_attributes(self): + super()._validate_attributes() + + if TRAIN_DATASET_KEY not in self.datasets: + raise KeyError( + f"'{TRAIN_DATASET_KEY}' key must be preset in `datasets`. 
" + f"Got {list(self.datasets.keys())}" + ) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..392a25b023f47ac1c4918bf5aaa04fa8be31db75 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/_mock.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/_mock.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0b320732a7f570914c7d7d7d8e6ed2a01189b4c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/_mock.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/basic_variant.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/basic_variant.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..841a3b966f67287e36825ac1b892cd308942aba2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/basic_variant.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/repeater.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/repeater.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6350390ed40faa559c3ac738c33ed0060fc46d5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/repeater.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/searcher.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/searcher.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6bee4602674fd28c51db5d0ef68aae9e4d45e837 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/searcher.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7184cfbef4230f924cbe9c7b057673a87e3cc2a3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/variant_generator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/variant_generator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94ecd0ccffb3f48b4d7d2b3cc5c092d4e1b4be02 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/__pycache__/variant_generator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/ax/__init__.py b/.venv/lib/python3.11/site-packages/ray/tune/search/ax/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d2cc1df85b97152386a332358657e13e2dd06ede --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/ax/__init__.py @@ -0,0 +1,3 @@ +from ray.tune.search.ax.ax_search import AxSearch + +__all__ = ["AxSearch"] diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/ax/ax_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/ax/ax_search.py new file mode 100644 index 0000000000000000000000000000000000000000..6404fcd85e7a5c7fff9626fac6ed0c1d2e280fa0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/ax/ax_search.py @@ -0,0 +1,432 @@ +import copy +import logging +from typing import Dict, List, Optional, Union + +import numpy as np + +from ray import 
cloudpickle +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import ( + Categorical, + Float, + Integer, + LogUniform, + Quantized, + Uniform, +) +from ray.tune.search.variant_generator import parse_spec_vars +from ray.tune.utils.util import flatten_dict, unflatten_list_dict + +try: + import ax + from ax.service.ax_client import AxClient +except ImportError: + ax = AxClient = None + +# This exception only exists in newer Ax releases for python 3.7 +try: + from ax.exceptions.core import DataRequiredError + from ax.exceptions.generation_strategy import MaxParallelismReachedException +except ImportError: + MaxParallelismReachedException = DataRequiredError = Exception + + +logger = logging.getLogger(__name__) + + +class AxSearch(Searcher): + """Uses `Ax `_ to optimize hyperparameters. + + Ax is a platform for understanding, managing, deploying, and + automating adaptive experiments. Ax provides an easy to use + interface with BoTorch, a flexible, modern library for Bayesian + optimization in PyTorch. More information can be found in https://ax.dev/. + + To use this search algorithm, you must install Ax: + + .. code-block:: bash + + $ pip install ax-platform + + Parameters: + space: Parameters in the experiment search space. + Required elements in the dictionaries are: "name" (name of + this parameter, string), "type" (type of the parameter: "range", + "fixed", or "choice", string), "bounds" for range parameters + (list of two values, lower bound first), "values" for choice + parameters (list of values), and "value" for fixed parameters + (single value). + metric: Name of the metric used as objective in this + experiment. This metric must be present in `raw_data` argument + to `log_data`. This metric must also be present in the dict + reported/returned by the Trainable. 
If None but a mode was passed, + the `ray.tune.result.DEFAULT_METRIC` will be used per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. Defaults to "max". + points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + parameter_constraints: Parameter constraints, such as + "x3 >= x4" or "x3 + x4 >= 2". + outcome_constraints: Outcome constraints of form + "metric_name >= bound", like "m1 <= 3." + ax_client: Optional AxClient instance. If this is set, do + not pass any values to these parameters: `space`, `metric`, + `parameter_constraints`, `outcome_constraints`. + **ax_kwargs: Passed to AxClient instance. Ignored if `AxClient` is not + None. + + Tune automatically converts search spaces to Ax's format: + + .. code-block:: python + + from ray import train, tune + from ray.tune.search.ax import AxSearch + + config = { + "x1": tune.uniform(0.0, 1.0), + "x2": tune.uniform(0.0, 1.0) + } + + def easy_objective(config): + for i in range(100): + intermediate_result = config["x1"] + config["x2"] * i + train.report({"score": intermediate_result}) + + ax_search = AxSearch() + tuner = tune.Tuner( + easy_objective, + tune_config=tune.TuneConfig( + search_alg=ax_search, + metric="score", + mode="max", + ), + param_space=config, + ) + tuner.fit() + + If you would like to pass the search space manually, the code would + look like this: + + .. 
code-block:: python + + from ray import train, tune + from ray.tune.search.ax import AxSearch + + parameters = [ + {"name": "x1", "type": "range", "bounds": [0.0, 1.0]}, + {"name": "x2", "type": "range", "bounds": [0.0, 1.0]}, + ] + + def easy_objective(config): + for i in range(100): + intermediate_result = config["x1"] + config["x2"] * i + train.report({"score": intermediate_result}) + + ax_search = AxSearch(space=parameters, metric="score", mode="max") + tuner = tune.Tuner( + easy_objective, + tune_config=tune.TuneConfig( + search_alg=ax_search, + ), + ) + tuner.fit() + + """ + + def __init__( + self, + space: Optional[Union[Dict, List[Dict]]] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + parameter_constraints: Optional[List] = None, + outcome_constraints: Optional[List] = None, + ax_client: Optional[AxClient] = None, + **ax_kwargs, + ): + assert ( + ax is not None + ), """Ax must be installed! + You can install AxSearch with the command: + `pip install ax-platform`.""" + + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." 
+ + super(AxSearch, self).__init__( + metric=metric, + mode=mode, + ) + + self._ax = ax_client + self._ax_kwargs = ax_kwargs or {} + + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self)) + ) + space = self.convert_search_space(space) + + self._space = space + self._parameter_constraints = parameter_constraints + self._outcome_constraints = outcome_constraints + + self._points_to_evaluate = copy.deepcopy(points_to_evaluate) + + self._parameters = [] + self._live_trial_mapping = {} + + if self._ax or self._space: + self._setup_experiment() + + def _setup_experiment(self): + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + if not self._ax: + self._ax = AxClient(**self._ax_kwargs) + + try: + exp = self._ax.experiment + has_experiment = True + except ValueError: + has_experiment = False + + if not has_experiment: + if not self._space: + raise ValueError( + "You have to create an Ax experiment by calling " + "`AxClient.create_experiment()`, or you should pass an " + "Ax search space as the `space` parameter to `AxSearch`, " + "or pass a `param_space` dict to `tune.Tuner()`." + ) + if self._mode not in ["min", "max"]: + raise ValueError( + "Please specify the `mode` argument when initializing " + "the `AxSearch` object or pass it to `tune.TuneConfig()`." 
+ ) + self._ax.create_experiment( + parameters=self._space, + objective_name=self._metric, + parameter_constraints=self._parameter_constraints, + outcome_constraints=self._outcome_constraints, + minimize=self._mode != "max", + ) + else: + if any( + [ + self._space, + self._parameter_constraints, + self._outcome_constraints, + self._mode, + self._metric, + ] + ): + raise ValueError( + "If you create the Ax experiment yourself, do not pass " + "values for these parameters to `AxSearch`: {}.".format( + [ + "space", + "parameter_constraints", + "outcome_constraints", + "mode", + "metric", + ] + ) + ) + + exp = self._ax.experiment + + # Update mode and metric from experiment if it has been passed + self._mode = "min" if exp.optimization_config.objective.minimize else "max" + self._metric = exp.optimization_config.objective.metric.name + + self._parameters = list(exp.parameters) + + if self._ax._enforce_sequential_optimization: + logger.warning( + "Detected sequential enforcement. Be sure to use " + "a ConcurrencyLimiter." 
+ ) + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ): + if self._ax: + return False + space = self.convert_search_space(config) + self._space = space + if metric: + self._metric = metric + if mode: + self._mode = mode + + self._setup_experiment() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self._ax: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + ) + + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + if self._points_to_evaluate: + config = self._points_to_evaluate.pop(0) + parameters, trial_index = self._ax.attach_trial(config) + else: + try: + parameters, trial_index = self._ax.get_next_trial() + except (MaxParallelismReachedException, DataRequiredError): + return None + + self._live_trial_mapping[trial_id] = trial_index + try: + suggested_config = unflatten_list_dict(parameters) + except AssertionError: + # Fails to unflatten if keys are out of order, which only happens + # if search space includes a list with both constants and + # tunable hyperparameters: + # Ex: "a": [1, tune.uniform(2, 3), 4] + suggested_config = unflatten_list_dict( + {k: parameters[k] for k in sorted(parameters.keys())} + ) + return suggested_config + + def on_trial_complete(self, trial_id, result=None, error=False): + """Notification for the completion of trial. + + Data of form key value dictionary of metric names and values. 
+ """ + if result: + self._process_result(trial_id, result) + self._live_trial_mapping.pop(trial_id) + + def _process_result(self, trial_id, result): + ax_trial_index = self._live_trial_mapping[trial_id] + metrics_to_include = [self._metric] + [ + oc.metric.name + for oc in self._ax.experiment.optimization_config.outcome_constraints + ] + metric_dict = {} + for key in metrics_to_include: + val = result[key] + if np.isnan(val) or np.isinf(val): + # Don't report trials with NaN metrics to Ax + self._ax.abandon_trial( + trial_index=ax_trial_index, + reason=f"nan/inf metrics reported by {trial_id}", + ) + return + metric_dict[key] = (val, None) + self._ax.complete_trial(trial_index=ax_trial_index, raw_data=metric_dict) + + @staticmethod + def convert_search_space(spec: Dict): + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to an Ax search space." + ) + + # Flatten and resolve again after checking for grid search. + spec = flatten_dict(spec, prevent_delimiter=True) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + def resolve_value(par, domain): + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + logger.warning( + "AxSearch does not support quantization. Dropped quantization." 
+ ) + sampler = sampler.sampler + + if isinstance(domain, Float): + if isinstance(sampler, LogUniform): + return { + "name": par, + "type": "range", + "bounds": [domain.lower, domain.upper], + "value_type": "float", + "log_scale": True, + } + elif isinstance(sampler, Uniform): + return { + "name": par, + "type": "range", + "bounds": [domain.lower, domain.upper], + "value_type": "float", + "log_scale": False, + } + elif isinstance(domain, Integer): + if isinstance(sampler, LogUniform): + return { + "name": par, + "type": "range", + "bounds": [domain.lower, domain.upper - 1], + "value_type": "int", + "log_scale": True, + } + elif isinstance(sampler, Uniform): + return { + "name": par, + "type": "range", + "bounds": [domain.lower, domain.upper - 1], + "value_type": "int", + "log_scale": False, + } + elif isinstance(domain, Categorical): + if isinstance(sampler, Uniform): + return {"name": par, "type": "choice", "values": domain.categories} + + raise ValueError( + "AxSearch does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, type(domain.sampler).__name__ + ) + ) + + # Parameter name is e.g. 
"a/b/c" for nested dicts, + # "a/d/0", "a/d/1" for nested lists (using the index in the list) + fixed_values = [ + {"name": "/".join(str(p) for p in path), "type": "fixed", "value": val} + for path, val in resolved_vars + ] + resolved_values = [ + resolve_value("/".join(str(p) for p in path), domain) + for path, domain in domain_vars + ] + + return fixed_values + resolved_values + + def save(self, checkpoint_path: str): + save_object = self.__dict__ + with open(checkpoint_path, "wb") as outputFile: + cloudpickle.dump(save_object, outputFile) + + def restore(self, checkpoint_path: str): + with open(checkpoint_path, "rb") as inputFile: + save_object = cloudpickle.load(inputFile) + self.__dict__.update(save_object) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__init__.py b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2569dcd5849a69f9818e90ac6db9cc4891c2da9c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__init__.py @@ -0,0 +1,3 @@ +from ray.tune.search.bayesopt.bayesopt_search import BayesOptSearch + +__all__ = ["BayesOptSearch"] diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dd3af962a330305686af804fd3807d9b681132f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/bayesopt_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/bayesopt_search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a846c0fd24823a44b165cd3afcce7bf213ad44c7 Binary files 
/dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/__pycache__/bayesopt_search.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/bayesopt_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/bayesopt_search.py new file mode 100644 index 0000000000000000000000000000000000000000..97b67192bcc88aafe81a9dd5d361b03884d84688 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/bayesopt/bayesopt_search.py @@ -0,0 +1,449 @@ +import json +import logging +import pickle +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import Domain, Float, Quantized, Uniform +from ray.tune.search.variant_generator import parse_spec_vars +from ray.tune.utils import flatten_dict +from ray.tune.utils.util import is_nan_or_inf, unflatten_dict + +try: # Python 3 only -- needed for lint test. + import bayes_opt as byo +except ImportError: + byo = None + + +if TYPE_CHECKING: + from ray.tune import ExperimentAnalysis + +logger = logging.getLogger(__name__) + + +def _dict_hash(config, precision): + flatconfig = flatten_dict(config) + for param, value in flatconfig.items(): + if isinstance(value, float): + flatconfig[param] = "{:.{digits}f}".format(value, digits=precision) + + hashed = json.dumps(flatconfig, sort_keys=True, default=str) + return hashed + + +class BayesOptSearch(Searcher): + """Uses bayesian-optimization/BayesianOptimization to optimize hyperparameters. + + bayesian-optimization/BayesianOptimization is a library for Bayesian Optimization. More + info can be found here: https://github.com/bayesian-optimization/BayesianOptimization. + + This searcher will automatically filter out any NaN, inf or -inf + results. 
+ + You will need to install bayesian-optimization/BayesianOptimization via the following: + + .. code-block:: bash + + pip install bayesian-optimization==1.4.3 + + Initializing this search algorithm with a ``space`` requires that it's + in the ``BayesianOptimization`` search space format. Otherwise, you + should instead pass in a Tune search space into ``Tuner(param_space=...)``, + and the search space will be automatically converted for you. + + See this ``BayesianOptimization`` example notebook + `_ + for an example. + + Args: + space: Continuous search space. Parameters will be sampled from + this space which will be used to run trials. + metric: The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. + points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + utility_kwargs: Parameters to define the utility function. + The default value is a dictionary with three keys: + - kind: ucb (Upper Confidence Bound) + - kappa: 2.576 + - xi: 0.0 + random_state: Used to initialize BayesOpt. + random_search_steps: Number of initial random searches. + This is necessary to avoid initial local overfitting + of the Bayesian process. + verbose: Sets verbosity level for BayesOpt packages. + patience: If patience is set and we've repeated a trial numerous times, + we terminate the experiment. + skip_duplicate: skip duplicate config + analysis: Optionally, the previous analysis to integrate. + + Tune automatically converts search spaces to BayesOptSearch's format: + + .. 
code-block:: python + + from ray import tune + from ray.tune.search.bayesopt import BayesOptSearch + + config = { + "width": tune.uniform(0, 20), + "height": tune.uniform(-100, 100) + } + + bayesopt = BayesOptSearch(metric="mean_loss", mode="min") + tuner = tune.Tuner( + my_func, + tune_config=tune.TuneConfig( + search_alg=baysopt, + ), + param_space=config, + ) + tuner.fit() + + If you would like to pass the search space manually, the code would + look like this: + + .. code-block:: python + + from ray import tune + from ray.tune.search.bayesopt import BayesOptSearch + + space = { + 'width': (0, 20), + 'height': (-100, 100), + } + bayesopt = BayesOptSearch(space, metric="mean_loss", mode="min") + tuner = tune.Tuner( + my_func, + tune_config=tune.TuneConfig( + search_alg=bayesopt, + ), + ) + tuner.fit() + + """ + + # bayes_opt.BayesianOptimization: Optimization object + optimizer = None + + def __init__( + self, + space: Optional[Dict] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + utility_kwargs: Optional[Dict] = None, + random_state: int = 42, + random_search_steps: int = 10, + verbose: int = 0, + patience: int = 5, + skip_duplicate: bool = True, + analysis: Optional["ExperimentAnalysis"] = None, + ): + assert byo is not None, ( + "BayesOpt must be installed!. You can install BayesOpt with" + " the command: `pip install bayesian-optimization==1.4.3`." + ) + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." + self._config_counter = defaultdict(int) + self._patience = patience + # int: Precision at which to hash values. 
+ self.repeat_float_precision = 5 + if self._patience <= 0: + raise ValueError("patience must be set to a value greater than 0!") + self._skip_duplicate = skip_duplicate + super(BayesOptSearch, self).__init__( + metric=metric, + mode=mode, + ) + + if utility_kwargs is None: + # The defaults arguments are the same + # as in the package BayesianOptimization + utility_kwargs = dict( + kind="ucb", + kappa=2.576, + xi=0.0, + ) + + if mode == "max": + self._metric_op = 1.0 + elif mode == "min": + self._metric_op = -1.0 + + self._points_to_evaluate = points_to_evaluate + + self._live_trial_mapping = {} + self._buffered_trial_results = [] + self.random_search_trials = random_search_steps + self._total_random_search_trials = 0 + + self.utility = byo.UtilityFunction(**utility_kwargs) + + self._analysis = analysis + + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self)) + ) + space = self.convert_search_space(space, join=True) + + self._space = space + self._verbose = verbose + self._random_state = random_state + + self.optimizer = None + if space: + self._setup_optimizer() + + def _setup_optimizer(self): + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + self.optimizer = byo.BayesianOptimization( + f=None, + pbounds=self._space, + verbose=self._verbose, + random_state=self._random_state, + ) + + # Registering the provided analysis, if given + if self._analysis is not None: + self.register_analysis(self._analysis) + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ) -> bool: + if self.optimizer: + return False + space = self.convert_search_space(config) + self._space = space + if metric: + self._metric = metric + if mode: + self._mode = mode + + if self._mode == "max": + self._metric_op 
= 1.0 + elif self._mode == "min": + self._metric_op = -1.0 + + self._setup_optimizer() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + """Return new point to be explored by black box function. + + Args: + trial_id: Id of the trial. + This is a short alphanumerical string. + + Returns: + Either a dictionary describing the new point to explore or + None, when no new point is to be explored for the time being. + """ + if not self.optimizer: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + ) + + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + if self._points_to_evaluate: + config = self._points_to_evaluate.pop(0) + else: + # We compute the new point to explore + config = self.optimizer.suggest(self.utility) + + config_hash = _dict_hash(config, self.repeat_float_precision) + # Check if already computed + already_seen = config_hash in self._config_counter + self._config_counter[config_hash] += 1 + top_repeats = max(self._config_counter.values()) + + # If patience is set and we've repeated a trial numerous times, + # we terminate the experiment. + if self._patience is not None and top_repeats > self._patience: + return Searcher.FINISHED + # If we have seen a value before, we'll skip it. 
+ if already_seen and self._skip_duplicate: + logger.info("Skipping duplicated config: {}.".format(config)) + return None + + # If we are still in the random search part and we are waiting for + # trials to complete + if len(self._buffered_trial_results) < self.random_search_trials: + # We check if we have already maxed out the number of requested + # random search trials + if self._total_random_search_trials == self.random_search_trials: + # If so we stop the suggestion and return None + return None + # Otherwise we increase the total number of rndom search trials + if config: + self._total_random_search_trials += 1 + + # Save the new trial to the trial mapping + self._live_trial_mapping[trial_id] = config + + # Return a deep copy of the mapping + return unflatten_dict(config) + + def register_analysis(self, analysis: "ExperimentAnalysis"): + """Integrate the given analysis into the gaussian process. + + Args: + analysis: Optionally, the previous analysis + to integrate. + """ + for (_, report), params in zip( + analysis.dataframe(metric=self._metric, mode=self._mode).iterrows(), + analysis.get_all_configs().values(), + ): + # We add the obtained results to the + # gaussian process optimizer + self._register_result(params, report) + + def on_trial_complete( + self, trial_id: str, result: Optional[Dict] = None, error: bool = False + ): + """Notification for the completion of trial. + + Args: + trial_id: Id of the trial. + This is a short alphanumerical string. + result: Dictionary of result. + May be none when some error occurs. + error: Boolean representing a previous error state. + The result should be None when error is True. + """ + # We try to get the parameters used for this trial + params = self._live_trial_mapping.pop(trial_id, None) + + # The results may be None if some exception is raised during the trial. + # Also, if the parameters are None (were already processed) + # we interrupt the following procedure. 
+ # Additionally, if somehow the error is True but + # the remaining values are not we also block the method + if result is None or params is None or error: + return + + # If we don't have to execute some random search steps + if len(self._buffered_trial_results) >= self.random_search_trials: + # we simply register the obtained result + self._register_result(params, result) + return + + # We store the results into a temporary cache + self._buffered_trial_results.append((params, result)) + + # If the random search finished, + # we update the BO with all the computer points. + if len(self._buffered_trial_results) == self.random_search_trials: + for params, result in self._buffered_trial_results: + self._register_result(params, result) + + def _register_result(self, params: Tuple[str], result: Dict): + """Register given tuple of params and results.""" + if is_nan_or_inf(result[self.metric]): + return + self.optimizer.register(params, self._metric_op * result[self.metric]) + + def get_state(self) -> Dict[str, Any]: + state = self.__dict__.copy() + return state + + def set_state(self, state: Dict[str, Any]): + self.__dict__.update(state) + + def save(self, checkpoint_path: str): + """Storing current optimizer state.""" + save_object = self.get_state() + with open(checkpoint_path, "wb") as f: + pickle.dump(save_object, f) + + def restore(self, checkpoint_path: str): + """Restoring current optimizer state.""" + with open(checkpoint_path, "rb") as f: + save_object = pickle.load(f) + + if isinstance(save_object, dict): + self.set_state(save_object) + else: + # Backwards compatibility + ( + self.optimizer, + self._buffered_trial_results, + self._total_random_search_trials, + self._config_counter, + self._points_to_evaluate, + ) = save_object + + @staticmethod + def convert_search_space(spec: Dict, join: bool = False) -> Dict: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically 
converted " + "to a BayesOpt search space." + ) + + # Flatten and resolve again after checking for grid search. + spec = flatten_dict(spec, prevent_delimiter=True) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + def resolve_value(domain: Domain) -> Tuple[float, float]: + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + logger.warning( + "BayesOpt search does not support quantization. " + "Dropped quantization." + ) + sampler = sampler.get_sampler() + + if isinstance(domain, Float): + if domain.sampler is not None and not isinstance( + domain.sampler, Uniform + ): + logger.warning( + "BayesOpt does not support specific sampling methods. " + "The {} sampler will be dropped.".format(sampler) + ) + return (domain.lower, domain.upper) + + raise ValueError( + "BayesOpt does not support parameters of type " + "`{}`".format(type(domain).__name__) + ) + + # Parameter name is e.g. "a/b/c" for nested dicts + bounds = {"/".join(path): resolve_value(domain) for path, domain in domain_vars} + + if join: + spec.update(bounds) + bounds = spec + + return bounds diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__init__.py b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40068b1562adcd327627c8b8f641079c95364ccc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__init__.py @@ -0,0 +1,3 @@ +from ray.tune.search.bohb.bohb_search import BOHB, TuneBOHB + +__all__ = ["BOHB", "TuneBOHB"] diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45c50a1779d2ec6f582c4c05a31302221ea15ca9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/__init__.cpython-311.pyc 
differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/bohb_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/bohb_search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4636694f7e40cdd7fa0502a7a9bf4c33a647a401 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/__pycache__/bohb_search.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/bohb_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/bohb_search.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa701b700f7f5adf1eed308364725c779c5466d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/bohb/bohb_search.py @@ -0,0 +1,383 @@ +"""BOHB (Bayesian Optimization with HyperBand)""" + +import copy +import logging +import math +from typing import Dict, List, Optional, Union + +# use cloudpickle instead of pickle to make BOHB obj +# pickleable +from ray import cloudpickle +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import ( + Categorical, + Domain, + Float, + Integer, + LogUniform, + Normal, + Quantized, + Uniform, +) +from ray.tune.search.variant_generator import parse_spec_vars +from ray.tune.utils.util import flatten_dict, unflatten_list_dict + +try: + import ConfigSpace + from hpbandster.optimizers.config_generators.bohb import BOHB +except ImportError: + BOHB = ConfigSpace = None + +logger = logging.getLogger(__name__) + + +class _BOHBJobWrapper: + """Mock object for HpBandSter to process.""" + + def __init__(self, loss: float, budget: float, config: Dict): + self.result = {"loss": loss} + self.kwargs = {"budget": budget, "config": config.copy()} + self.exception = None + + +class TuneBOHB(Searcher): + """BOHB suggestion 
component. + + + Requires HpBandSter and ConfigSpace to be installed. You can install + HpBandSter and ConfigSpace with: ``pip install hpbandster ConfigSpace``. + + This should be used in conjunction with HyperBandForBOHB. + + Args: + space: Continuous ConfigSpace search space. + Parameters will be sampled from this space which will be used + to run trials. + bohb_config: configuration for HpBandSter BOHB algorithm + metric: The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. + points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + seed: Optional random seed to initialize the random number + generator. Setting this should lead to identical initial + configurations at each run. + max_concurrent: Number of maximum concurrent trials. + If this Searcher is used in a ``ConcurrencyLimiter``, the + ``max_concurrent`` value passed to it will override the + value passed here. Set to <= 0 for no limit on concurrency. + + Tune automatically converts search spaces to TuneBOHB's format: + + .. code-block:: python + + config = { + "width": tune.uniform(0, 20), + "height": tune.uniform(-100, 100), + "activation": tune.choice(["relu", "tanh"]) + } + + algo = TuneBOHB(metric="mean_loss", mode="min") + bohb = HyperBandForBOHB( + time_attr="training_iteration", + metric="mean_loss", + mode="min", + max_t=100) + run(my_trainable, config=config, scheduler=bohb, search_alg=algo) + + If you would like to pass the search space manually, the code would + look like this: + + .. 
code-block:: python + + import ConfigSpace as CS + + config_space = CS.ConfigurationSpace() + config_space.add_hyperparameter( + CS.UniformFloatHyperparameter("width", lower=0, upper=20)) + config_space.add_hyperparameter( + CS.UniformFloatHyperparameter("height", lower=-100, upper=100)) + config_space.add_hyperparameter( + CS.CategoricalHyperparameter( + name="activation", choices=["relu", "tanh"])) + + algo = TuneBOHB( + config_space, metric="mean_loss", mode="min") + bohb = HyperBandForBOHB( + time_attr="training_iteration", + metric="mean_loss", + mode="min", + max_t=100) + run(my_trainable, scheduler=bohb, search_alg=algo) + + """ + + def __init__( + self, + space: Optional[Union[Dict, "ConfigSpace.ConfigurationSpace"]] = None, + bohb_config: Optional[Dict] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + seed: Optional[int] = None, + max_concurrent: int = 0, + ): + assert ( + BOHB is not None + ), """HpBandSter must be installed! + You can install HpBandSter with the command: + `pip install hpbandster ConfigSpace`.""" + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." 
+ self.trial_to_params = {} + self._metric = metric + + self._bohb_config = bohb_config + + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self)) + ) + space = self.convert_search_space(space) + + self._space = space + self._seed = seed + + self.running = set() + self.paused = set() + + self._max_concurrent = max_concurrent + self._points_to_evaluate = points_to_evaluate + + super(TuneBOHB, self).__init__( + metric=self._metric, + mode=mode, + ) + + if self._space: + self._setup_bohb() + + def set_max_concurrency(self, max_concurrent: int) -> bool: + self._max_concurrent = max_concurrent + return True + + def _setup_bohb(self): + from hpbandster.optimizers.config_generators.bohb import BOHB + + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + if self._mode == "max": + self._metric_op = -1.0 + elif self._mode == "min": + self._metric_op = 1.0 + + if self._seed is not None: + self._space.seed(self._seed) + + self.running = set() + self.paused = set() + + bohb_config = self._bohb_config or {} + self.bohber = BOHB(self._space, **bohb_config) + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ) -> bool: + if self._space: + return False + space = self.convert_search_space(config) + self._space = space + + if metric: + self._metric = metric + if mode: + self._mode = mode + + self._setup_bohb() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self._space: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + ) + + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + max_concurrent = ( + 
self._max_concurrent if self._max_concurrent > 0 else float("inf") + ) + if len(self.running) >= max_concurrent: + return None + + if self._points_to_evaluate: + config = self._points_to_evaluate.pop(0) + else: + # This parameter is not used in hpbandster implementation. + config, _ = self.bohber.get_config(None) + self.trial_to_params[trial_id] = copy.deepcopy(config) + self.running.add(trial_id) + return unflatten_list_dict(config) + + def on_trial_result(self, trial_id: str, result: Dict): + if trial_id not in self.paused: + self.running.add(trial_id) + if "hyperband_info" not in result: + logger.warning( + "BOHB Info not detected in result. Are you using " + "HyperBandForBOHB as a scheduler?" + ) + elif "budget" in result.get("hyperband_info", {}): + hbs_wrapper = self.to_wrapper(trial_id, result) + self.bohber.new_result(hbs_wrapper) + + def on_trial_complete( + self, trial_id: str, result: Optional[Dict] = None, error: bool = False + ): + del self.trial_to_params[trial_id] + self.paused.discard(trial_id) + self.running.discard(trial_id) + + def to_wrapper(self, trial_id: str, result: Dict) -> _BOHBJobWrapper: + return _BOHBJobWrapper( + self._metric_op * result[self.metric], + result["hyperband_info"]["budget"], + self.trial_to_params[trial_id], + ) + + # BOHB Specific. + # TODO(team-ml): Refactor alongside HyperBandForBOHB + def on_pause(self, trial_id: str): + self.paused.add(trial_id) + self.running.discard(trial_id) + + def on_unpause(self, trial_id: str): + self.paused.discard(trial_id) + self.running.add(trial_id) + + @staticmethod + def convert_search_space(spec: Dict) -> "ConfigSpace.ConfigurationSpace": + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to a TuneBOHB search space." + ) + + # Flatten and resolve again after checking for grid search. 
+ spec = flatten_dict(spec, prevent_delimiter=True) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + def resolve_value( + par: str, domain: Domain + ) -> ConfigSpace.hyperparameters.Hyperparameter: + quantize = None + + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + quantize = sampler.q + sampler = sampler.sampler + + if isinstance(domain, Float): + if isinstance(sampler, LogUniform): + lower = domain.lower + upper = domain.upper + if quantize: + lower = math.ceil(domain.lower / quantize) * quantize + upper = math.floor(domain.upper / quantize) * quantize + return ConfigSpace.UniformFloatHyperparameter( + par, lower=lower, upper=upper, q=quantize, log=True + ) + elif isinstance(sampler, Uniform): + lower = domain.lower + upper = domain.upper + if quantize: + lower = math.ceil(domain.lower / quantize) * quantize + upper = math.floor(domain.upper / quantize) * quantize + return ConfigSpace.UniformFloatHyperparameter( + par, lower=lower, upper=upper, q=quantize, log=False + ) + elif isinstance(sampler, Normal): + return ConfigSpace.hyperparameters.NormalFloatHyperparameter( + par, mu=sampler.mean, sigma=sampler.sd, q=quantize, log=False + ) + + elif isinstance(domain, Integer): + if isinstance(sampler, LogUniform): + lower = domain.lower + upper = domain.upper + if quantize: + lower = math.ceil(domain.lower / quantize) * quantize + upper = math.floor(domain.upper / quantize) * quantize + else: + # Tune search space integers are exclusive + upper -= 1 + return ConfigSpace.UniformIntegerHyperparameter( + par, lower=lower, upper=upper, q=quantize, log=True + ) + elif isinstance(sampler, Uniform): + lower = domain.lower + upper = domain.upper + if quantize: + lower = math.ceil(domain.lower / quantize) * quantize + upper = math.floor(domain.upper / quantize) * quantize + else: + # Tune search space integers are exclusive + upper -= 1 + return ConfigSpace.UniformIntegerHyperparameter( + par, lower=lower, upper=upper, q=quantize, 
log=False + ) + + elif isinstance(domain, Categorical): + if isinstance(sampler, Uniform): + return ConfigSpace.CategoricalHyperparameter( + par, choices=domain.categories + ) + + raise ValueError( + "TuneBOHB does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, type(domain.sampler).__name__ + ) + ) + + cs = ConfigSpace.ConfigurationSpace() + for path, domain in domain_vars: + par = "/".join(str(p) for p in path) + value = resolve_value(par, domain) + cs.add_hyperparameter(value) + + return cs + + def save(self, checkpoint_path: str): + save_object = self.__dict__ + with open(checkpoint_path, "wb") as outputFile: + cloudpickle.dump(save_object, outputFile) + + def restore(self, checkpoint_path: str): + with open(checkpoint_path, "rb") as inputFile: + save_object = cloudpickle.load(inputFile) + self.__dict__.update(save_object) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/hebo/hebo_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/hebo/hebo_search.py new file mode 100644 index 0000000000000000000000000000000000000000..7145960d301c6befddc415f650e947b1180faa32 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/hebo/hebo_search.py @@ -0,0 +1,468 @@ +import logging +import pickle +from typing import Dict, List, Optional, Union + +import numpy as np +import pandas as pd + +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import ( + Categorical, + Domain, + Float, + Integer, + LogUniform, + Quantized, + Uniform, +) +from ray.tune.search.variant_generator import parse_spec_vars +from ray.tune.utils.util import is_nan_or_inf, unflatten_dict, validate_warmstart + +try: # Python 3 only -- needed for lint test. 
+ import hebo + import torch # hebo has torch as a dependency +except ImportError: + hebo = None + +logger = logging.getLogger(__name__) + +SPACE_ERROR_MESSAGE = ( + "Space must be either a HEBO DesignSpace object" + "or a dictionary with ONLY tune search spaces." +) + + +class HEBOSearch(Searcher): + """Uses HEBO (Heteroscedastic Evolutionary Bayesian Optimization) + to optimize hyperparameters. + + HEBO is a cutting edge black-box optimization framework created + by Huawei's Noah Ark. More info can be found here: + https://github.com/huawei-noah/HEBO/tree/master/HEBO. + + `space` can either be a HEBO's `DesignSpace` object or a dict of Tune + search spaces. + + Please note that the first few trials will be random and used + to kickstart the search process. In order to achieve good results, + we recommend setting the number of trials to at least 16. + + Maximum number of concurrent trials is determined by ``max_concurrent`` + argument. Trials will be done in batches of ``max_concurrent`` trials. + If this Searcher is used in a ``ConcurrencyLimiter``, the + ``max_concurrent`` value passed to it will override the value passed + here. + + Args: + space: A dict mapping parameter names to Tune search spaces or a + HEBO DesignSpace object. + metric: The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. + points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. 
+ evaluated_rewards: If you have previously evaluated the + parameters passed in as points_to_evaluate you can avoid + re-running those trials by passing in the reward attributes + as a list so the optimiser can be told the results without + needing to re-compute the trial. Must be the same length as + points_to_evaluate. + random_state_seed: Seed for reproducible + results. Defaults to None. Please note that setting this to a value + will change global random states for `numpy` and `torch` + on initalization and loading from checkpoint. + max_concurrent: Number of maximum concurrent trials. + If this Searcher is used in a ``ConcurrencyLimiter``, the + ``max_concurrent`` value passed to it will override the + value passed here. + **kwargs: The keyword arguments will be passed to `HEBO()``. + + Tune automatically converts search spaces to HEBO's format: + + .. code-block:: python + + from ray import tune + from ray.tune.search.hebo import HEBOSearch + + config = { + "width": tune.uniform(0, 20), + "height": tune.uniform(-100, 100) + } + + hebo = HEBOSearch(metric="mean_loss", mode="min") + tuner = tune.Tuner( + trainable_function, + tune_config=tune.TuneConfig( + search_alg=hebo + ), + param_space=config + ) + tuner.fit() + + Alternatively, you can pass a HEBO `DesignSpace` object manually to the + Searcher: + + .. 
code-block:: python + + from ray import tune + from ray.tune.search.hebo import HEBOSearch + from hebo.design_space.design_space import DesignSpace + + space_config = [ + {'name' : 'width', 'type' : 'num', 'lb' : 0, 'ub' : 20}, + {'name' : 'height', 'type' : 'num', 'lb' : -100, 'ub' : 100}, + ] + space = DesignSpace().parse(space_config) + + hebo = HEBOSearch(space, metric="mean_loss", mode="min") + tuner = tune.Tuner( + trainable_function, + tune_config=tune.TuneConfig( + search_alg=hebo + ) + ) + tuner.fit() + + """ + + def __init__( + self, + space: Optional[ + Union[Dict, "hebo.design_space.design_space.DesignSpace"] + ] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + evaluated_rewards: Optional[List] = None, + random_state_seed: Optional[int] = None, + max_concurrent: int = 8, + **kwargs, + ): + assert hebo is not None, ( + "HEBO must be installed! You can install HEBO with" + " the command: `pip install 'HEBO>=0.2.0'`." + "This error may also be caused if HEBO" + " dependencies have bad versions. Try updating HEBO" + " first." + ) + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." + assert ( + isinstance(max_concurrent, int) and max_concurrent >= 1 + ), "`max_concurrent` must be an integer and at least 1." 
+ if random_state_seed is not None: + assert isinstance( + random_state_seed, int + ), "random_state_seed must be None or int, got '{}'.".format( + type(random_state_seed) + ) + super(HEBOSearch, self).__init__(metric=metric, mode=mode) + + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if resolved_vars: + raise TypeError(SPACE_ERROR_MESSAGE) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self)) + ) + space = self.convert_search_space(space) + elif space is not None and not isinstance( + space, hebo.design_space.design_space.DesignSpace + ): + raise TypeError(SPACE_ERROR_MESSAGE + " Got {}.".format(type(space))) + + self._hebo_config = kwargs + self._random_state_seed = random_state_seed + self._space = space + self._points_to_evaluate = points_to_evaluate + self._evaluated_rewards = evaluated_rewards + self._initial_points = [] + self._live_trial_mapping = {} + + self._max_concurrent = max_concurrent + self._suggestions_cache = [] + self._batch_filled = False + + self._opt = None + if space: + self._setup_optimizer() + + def set_max_concurrency(self, max_concurrent: int) -> bool: + self._max_concurrent = max_concurrent + return True + + def _setup_optimizer(self): + # HEBO internally minimizes, so "max" => -1 + if self._mode == "max": + self._metric_op = -1.0 + elif self._mode == "min": + self._metric_op = 1.0 + + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + if not isinstance(self._space, hebo.design_space.design_space.DesignSpace): + raise ValueError( + f"Invalid search space: {type(self._space)}. Either pass a " + f"valid search space to the `HEBOSearch` class or pass " + f"a `param_space` parameter to `tune.Tuner()`" + ) + + if self._space.num_paras <= 0: + raise ValueError( + "Got empty search space. 
Please make sure to pass " + "a valid search space with at least one parameter to " + "`HEBOSearch`" + ) + + if self._random_state_seed is not None: + np.random.seed(self._random_state_seed) + torch.random.manual_seed(self._random_state_seed) + + self._opt = hebo.optimizers.hebo.HEBO(space=self._space, **self._hebo_config) + + if self._points_to_evaluate: + validate_warmstart( + self._space.para_names, + self._points_to_evaluate, + self._evaluated_rewards, + ) + if self._evaluated_rewards: + self._opt.observe( + pd.DataFrame(self._points_to_evaluate), + np.array(self._evaluated_rewards) * self._metric_op, + ) + else: + self._initial_points = self._points_to_evaluate + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ) -> bool: + if self._opt: + return False + space = self.convert_search_space(config) + self._space = space + + if metric: + self._metric = metric + if mode: + self._mode = mode + + self._setup_optimizer() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self._opt: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + ) + + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + if not self._live_trial_mapping: + self._batch_filled = False + + if self._initial_points: + params = self._initial_points.pop(0) + suggestion = pd.DataFrame([params], index=[0]) + else: + if ( + self._batch_filled + or len(self._live_trial_mapping) >= self._max_concurrent + ): + return None + if not self._suggestions_cache: + suggestion = self._opt.suggest(n_suggestions=self._max_concurrent) + self._suggestions_cache = suggestion.to_dict("records") + params = self._suggestions_cache.pop(0) + suggestion = pd.DataFrame([params], index=[0]) + self._live_trial_mapping[trial_id] = suggestion + if len(self._live_trial_mapping) >= 
self._max_concurrent: + self._batch_filled = True + return unflatten_dict(params) + + def on_trial_complete( + self, trial_id: str, result: Optional[Dict] = None, error: bool = False + ): + """Notification for the completion of trial. + + HEBO always minimizes.""" + + if result: + self._process_result(trial_id, result) + self._live_trial_mapping.pop(trial_id) + + def _process_result(self, trial_id: str, result: Dict): + trial_info = self._live_trial_mapping[trial_id] + if result and not is_nan_or_inf(result[self._metric]): + self._opt.observe( + trial_info, np.array([self._metric_op * result[self._metric]]) + ) + + def add_evaluated_point( + self, + parameters: Dict, + value: float, + error: bool = False, + pruned: bool = False, + intermediate_values: Optional[List[float]] = None, + ): + if intermediate_values: + logger.warning("HEBO doesn't use intermediate_values. Ignoring.") + if not error and not pruned: + self._opt.observe( + pd.DataFrame( + [ + { + k: v + for k, v in parameters.items() + if k in self._opt.space.para_names + } + ] + ), + np.array([value]) * self._metric_op, + ) + else: + logger.warning( + "Only non errored and non pruned points can be added to HEBO." 
+ ) + + def save(self, checkpoint_path: str): + """Storing current optimizer state.""" + if self._random_state_seed is not None: + numpy_random_state = np.random.get_state() + torch_random_state = torch.get_rng_state() + else: + numpy_random_state = None + torch_random_state = None + save_object = self.__dict__.copy() + save_object["__numpy_random_state"] = numpy_random_state + save_object["__torch_random_state"] = torch_random_state + with open(checkpoint_path, "wb") as f: + pickle.dump(save_object, f) + + def restore(self, checkpoint_path: str): + """Restoring current optimizer state.""" + with open(checkpoint_path, "rb") as f: + save_object = pickle.load(f) + + if isinstance(save_object, dict): + numpy_random_state = save_object.pop("__numpy_random_state", None) + torch_random_state = save_object.pop("__torch_random_state", None) + self.__dict__.update(save_object) + else: + # Backwards compatibility + ( + self._opt, + self._initial_points, + numpy_random_state, + torch_random_state, + self._live_trial_mapping, + self._max_concurrent, + self._suggestions_cache, + self._space, + self._hebo_config, + self._batch_filled, + ) = save_object + if numpy_random_state is not None: + np.random.set_state(numpy_random_state) + if torch_random_state is not None: + torch.random.set_rng_state(torch_random_state) + + @staticmethod + def convert_search_space(spec: Dict, prefix: str = "") -> Dict: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + params = [] + + if not domain_vars and not grid_vars: + return {} + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to a HEBO search space." + ) + + def resolve_value(par: str, domain: Domain): + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + logger.warning( + "HEBO search does not support quantization. " + "Dropped quantization." 
+ ) + sampler = sampler.get_sampler() + + if isinstance(domain, Float): + if isinstance(sampler, LogUniform): + return { + "name": par, + "type": "pow", + "lb": domain.lower, + "ub": domain.upper, + "base": sampler.base, + } + elif isinstance(sampler, Uniform): + return { + "name": par, + "type": "num", + "lb": domain.lower, + "ub": domain.upper, + } + + elif isinstance(domain, Integer): + if isinstance(sampler, LogUniform): + return { + "name": par, + "type": "pow_int", + "lb": domain.lower, + "ub": domain.upper - 1, # Upper bound exclusive + "base": sampler.base, + } + elif isinstance(sampler, Uniform): + return { + "name": par, + "type": "int", + "lb": domain.lower, + "ub": domain.upper - 1, # Upper bound exclusive + } + elif isinstance(domain, Categorical): + return { + "name": par, + "type": "cat", + "categories": list(domain.categories), + } + + raise ValueError( + "HEBO does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, type(domain.sampler).__name__ + ) + ) + + for path, domain in domain_vars: + par = "/".join([str(p) for p in ((prefix,) + path if prefix else path)]) + value = resolve_value(par, domain) + params.append(value) + + return hebo.design_space.design_space.DesignSpace().parse(params) diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__init__.py b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b142015f04bf2eec6b1be7e54d545621dfe2953 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__init__.py @@ -0,0 +1,3 @@ +from ray.tune.search.hyperopt.hyperopt_search import HyperOptSearch + +__all__ = ["HyperOptSearch"] diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..5862d1b541e0f639ad4b322b0f3d93456a4dc681 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/hyperopt_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/hyperopt_search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43a179631561b91b25931805a35e21c23af16e92 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/__pycache__/hyperopt_search.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/hyperopt_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/hyperopt_search.py new file mode 100644 index 0000000000000000000000000000000000000000..4988325dde2d13fe04b32d97b5e71b2b320a0fb2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/hyperopt/hyperopt_search.py @@ -0,0 +1,559 @@ +import copy +import logging +from functools import partial +from typing import Any, Dict, List, Optional + +import numpy as np + +# Use cloudpickle instead of pickle to make lambda funcs in HyperOpt pickleable +from ray import cloudpickle +from ray.tune.error import TuneError +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import ( + Categorical, + Domain, + Float, + Integer, + LogUniform, + Normal, + Quantized, + Uniform, +) +from ray.tune.search.variant_generator import assign_value, parse_spec_vars +from ray.tune.utils import flatten_dict + +try: + hyperopt_logger = logging.getLogger("hyperopt") + hyperopt_logger.setLevel(logging.WARNING) + import hyperopt as hpo + from hyperopt.pyll import Apply +except ImportError: + hpo = None + Apply = None + 
+ +logger = logging.getLogger(__name__) + + +HYPEROPT_UNDEFINED_DETAILS = ( + " This issue can also come up with HyperOpt if your search space only " + "contains constant variables, which is not supported by HyperOpt. In that case, " + "don't pass any searcher or add sample variables to the search space." +) + + +class HyperOptSearch(Searcher): + """A wrapper around HyperOpt to provide trial suggestions. + + HyperOpt a Python library for serial and parallel optimization + over awkward search spaces, which may include real-valued, discrete, + and conditional dimensions. More info can be found at + http://hyperopt.github.io/hyperopt. + + HyperOptSearch uses the Tree-structured Parzen Estimators algorithm, + though it can be trivially extended to support any algorithm HyperOpt + supports. + + To use this search algorithm, you will need to install HyperOpt: + + .. code-block:: bash + + pip install -U hyperopt + + + Parameters: + space: HyperOpt configuration. Parameters will be sampled + from this configuration and will be used to override + parameters generated in the variant generation process. + metric: The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. + points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + n_initial_points: number of random evaluations of the + objective function before starting to aproximate it with + tree parzen estimators. Defaults to 20. + random_state_seed: seed for reproducible + results. Defaults to None. + gamma: parameter governing the tree parzen + estimators suggestion algorithm. Defaults to 0.25. 
+ + Tune automatically converts search spaces to HyperOpt's format: + + .. code-block:: python + + config = { + 'width': tune.uniform(0, 20), + 'height': tune.uniform(-100, 100), + 'activation': tune.choice(["relu", "tanh"]) + } + + current_best_params = [{ + 'width': 10, + 'height': 0, + 'activation': "relu", + }] + + hyperopt_search = HyperOptSearch( + metric="mean_loss", mode="min", + points_to_evaluate=current_best_params) + + tuner = tune.Tuner( + trainable, + tune_config=tune.TuneConfig( + search_alg=hyperopt_search + ), + param_space=config + ) + tuner.fit() + + If you would like to pass the search space manually, the code would + look like this: + + .. code-block:: python + + space = { + 'width': hp.uniform('width', 0, 20), + 'height': hp.uniform('height', -100, 100), + 'activation': hp.choice("activation", ["relu", "tanh"]) + } + + current_best_params = [{ + 'width': 10, + 'height': 0, + 'activation': "relu", + }] + + hyperopt_search = HyperOptSearch( + space, metric="mean_loss", mode="min", + points_to_evaluate=current_best_params) + + tuner = tune.Tuner( + trainable, + tune_config=tune.TuneConfig( + search_alg=hyperopt_search + ), + ) + tuner.fit() + + """ + + def __init__( + self, + space: Optional[Dict] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + n_initial_points: int = 20, + random_state_seed: Optional[int] = None, + gamma: float = 0.25, + ): + assert ( + hpo is not None + ), "HyperOpt must be installed! Run `pip install hyperopt`." + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." 
+ super(HyperOptSearch, self).__init__( + metric=metric, + mode=mode, + ) + # hyperopt internally minimizes, so "max" => -1 + if mode == "max": + self.metric_op = -1.0 + elif mode == "min": + self.metric_op = 1.0 + + if n_initial_points is None: + self.algo = hpo.tpe.suggest + else: + self.algo = partial(hpo.tpe.suggest, n_startup_jobs=n_initial_points) + if gamma is not None: + self.algo = partial(self.algo, gamma=gamma) + + self._points_to_evaluate = copy.deepcopy(points_to_evaluate) + + self._live_trial_mapping = {} + self.rstate = np.random.RandomState(random_state_seed) + + self.domain = None + if isinstance(space, dict) and space: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(space) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="space", cls=type(self)) + ) + space = self.convert_search_space(space) + self._space = space + self._setup_hyperopt() + + def _setup_hyperopt(self) -> None: + from hyperopt.fmin import generate_trials_to_calculate + + if not self._space: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + + HYPEROPT_UNDEFINED_DETAILS + ) + + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + if self._points_to_evaluate is None: + self._hpopt_trials = hpo.Trials() + self._points_to_evaluate = 0 + else: + assert isinstance(self._points_to_evaluate, (list, tuple)) + + for i in range(len(self._points_to_evaluate)): + config = self._points_to_evaluate[i] + self._convert_categories_to_indices(config) + # HyperOpt treats initial points as LIFO, reverse to get FIFO + self._points_to_evaluate = list(reversed(self._points_to_evaluate)) + self._hpopt_trials = generate_trials_to_calculate(self._points_to_evaluate) + self._hpopt_trials.refresh() + self._points_to_evaluate = len(self._points_to_evaluate) + + self.domain = hpo.Domain(lambda spc: spc, self._space) + + def 
_convert_categories_to_indices(self, config) -> None: + """Convert config parameters for categories into hyperopt-compatible + representations where instead the index of the category is expected.""" + + def _lookup(config_dict, space_dict, key): + if isinstance(config_dict[key], dict): + for k in config_dict[key]: + _lookup(config_dict[key], space_dict[key], k) + else: + if ( + key in space_dict + and isinstance(space_dict[key], hpo.base.pyll.Apply) + and space_dict[key].name == "switch" + ): + if len(space_dict[key].pos_args) > 0: + categories = [ + a.obj + for a in space_dict[key].pos_args[1:] + if a.name == "literal" + ] + try: + idx = categories.index(config_dict[key]) + except ValueError as exc: + msg = ( + f"Did not find category with value " + f"`{config_dict[key]}` in " + f"hyperopt parameter `{key}`. " + ) + + if isinstance(config_dict[key], int): + msg += ( + "In previous versions, a numerical " + "index was expected for categorical " + "values of `points_to_evaluate`, " + "but in ray>=1.2.0, the categorical " + "value is expected to be directly " + "provided. " + ) + + msg += "Please make sure the specified category is valid." 
+ raise ValueError(msg) from exc + config_dict[key] = idx + + for k in config: + _lookup(config, self._space, k) + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ) -> bool: + if self.domain: + return False + space = self.convert_search_space(config) + self._space = space + + if metric: + self._metric = metric + if mode: + self._mode = mode + + self.metric_op = -1.0 if self._mode == "max" else 1.0 + + self._setup_hyperopt() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self.domain: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="space" + ) + + HYPEROPT_UNDEFINED_DETAILS + ) + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + if self._points_to_evaluate > 0: + using_point_to_evaluate = True + new_trial = self._hpopt_trials.trials[self._points_to_evaluate - 1] + self._points_to_evaluate -= 1 + else: + using_point_to_evaluate = False + new_ids = self._hpopt_trials.new_trial_ids(1) + self._hpopt_trials.refresh() + + # Get new suggestion from Hyperopt + new_trials = self.algo( + new_ids, + self.domain, + self._hpopt_trials, + self.rstate.randint(2**31 - 1), + ) + self._hpopt_trials.insert_trial_docs(new_trials) + self._hpopt_trials.refresh() + new_trial = new_trials[0] + self._live_trial_mapping[trial_id] = (new_trial["tid"], new_trial) + + # Taken from HyperOpt.base.evaluate + config = hpo.base.spec_from_misc(new_trial["misc"]) + + # We have to flatten nested spaces here so parameter names match + config = flatten_dict(config, flatten_list=True) + + ctrl = hpo.base.Ctrl(self._hpopt_trials, current_trial=new_trial) + memo = self.domain.memo_from_config(config) + hpo.utils.use_obj_for_literal_in_memo( + self.domain.expr, ctrl, hpo.base.Ctrl, memo + ) + + try: + suggested_config = hpo.pyll.rec_eval( + self.domain.expr, 
+ memo=memo, + print_node_on_error=self.domain.rec_eval_print_node_on_error, + ) + except (AssertionError, TypeError) as e: + if using_point_to_evaluate and ( + isinstance(e, AssertionError) or "GarbageCollected" in str(e) + ): + raise ValueError( + "HyperOpt encountered a GarbageCollected switch argument. " + "Usually this is caused by a config in " + "`points_to_evaluate` " + "missing a key present in `space`. Ensure that " + "`points_to_evaluate` contains " + "all non-constant keys from `space`.\n" + "Config from `points_to_evaluate`: " + f"{config}\n" + "HyperOpt search space: " + f"{self._space}" + ) from e + raise e + return copy.deepcopy(suggested_config) + + def on_trial_result(self, trial_id: str, result: Dict) -> None: + ho_trial = self._get_hyperopt_trial(trial_id) + if ho_trial is None: + return + now = hpo.utils.coarse_utcnow() + ho_trial["book_time"] = now + ho_trial["refresh_time"] = now + + def on_trial_complete( + self, trial_id: str, result: Optional[Dict] = None, error: bool = False + ) -> None: + """Notification for the completion of trial. + + The result is internally negated when interacting with HyperOpt + so that HyperOpt can "maximize" this value, as it minimizes on default. 
+ """ + ho_trial = self._get_hyperopt_trial(trial_id) + if ho_trial is None: + return + ho_trial["refresh_time"] = hpo.utils.coarse_utcnow() + if error: + ho_trial["state"] = hpo.base.JOB_STATE_ERROR + ho_trial["misc"]["error"] = (str(TuneError), "Tune Error") + self._hpopt_trials.refresh() + elif result: + self._process_result(trial_id, result) + del self._live_trial_mapping[trial_id] + + def _process_result(self, trial_id: str, result: Dict) -> None: + ho_trial = self._get_hyperopt_trial(trial_id) + if not ho_trial: + return + ho_trial["refresh_time"] = hpo.utils.coarse_utcnow() + + ho_trial["state"] = hpo.base.JOB_STATE_DONE + hp_result = self._to_hyperopt_result(result) + ho_trial["result"] = hp_result + self._hpopt_trials.refresh() + + def _to_hyperopt_result(self, result: Dict) -> Dict: + try: + return {"loss": self.metric_op * result[self.metric], "status": "ok"} + except KeyError as e: + raise RuntimeError( + f"Hyperopt expected to see the metric `{self.metric}` in the " + f"last result, but it was not found. To fix this, make " + f"sure your call to `tune.report` or your return value of " + f"your trainable class `step()` contains the above metric " + f"as a key." 
+ ) from e + + def _get_hyperopt_trial(self, trial_id: str) -> Optional[Dict]: + if trial_id not in self._live_trial_mapping: + return + hyperopt_tid = self._live_trial_mapping[trial_id][0] + return [t for t in self._hpopt_trials.trials if t["tid"] == hyperopt_tid][0] + + def get_state(self) -> Dict: + return { + "hyperopt_trials": self._hpopt_trials, + "rstate": self.rstate.get_state(), + } + + def set_state(self, state: Dict) -> None: + self._hpopt_trials = state["hyperopt_trials"] + self.rstate.set_state(state["rstate"]) + + def save(self, checkpoint_path: str) -> None: + save_object = self.__dict__.copy() + save_object["__rstate"] = self.rstate.get_state() + with open(checkpoint_path, "wb") as f: + cloudpickle.dump(save_object, f) + + def restore(self, checkpoint_path: str) -> None: + with open(checkpoint_path, "rb") as f: + save_object = cloudpickle.load(f) + + if "__rstate" not in save_object: + # Backwards compatibility + self.set_state(save_object) + else: + self.rstate.set_state(save_object.pop("__rstate")) + self.__dict__.update(save_object) + + @staticmethod + def convert_search_space(spec: Dict, prefix: str = "") -> Dict: + spec = copy.deepcopy(spec) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if not domain_vars and not grid_vars: + return {} + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to a HyperOpt search space." 
+ ) + + def resolve_value(par: str, domain: Domain) -> Any: + quantize = None + + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + quantize = sampler.q + sampler = sampler.sampler + + if isinstance(domain, Float): + if isinstance(sampler, LogUniform): + if quantize: + return hpo.hp.qloguniform( + par, np.log(domain.lower), np.log(domain.upper), quantize + ) + return hpo.hp.loguniform( + par, np.log(domain.lower), np.log(domain.upper) + ) + elif isinstance(sampler, Uniform): + if quantize: + return hpo.hp.quniform( + par, domain.lower, domain.upper, quantize + ) + return hpo.hp.uniform(par, domain.lower, domain.upper) + elif isinstance(sampler, Normal): + if quantize: + return hpo.hp.qnormal(par, sampler.mean, sampler.sd, quantize) + return hpo.hp.normal(par, sampler.mean, sampler.sd) + + elif isinstance(domain, Integer): + if isinstance(sampler, LogUniform): + if quantize: + return hpo.base.pyll.scope.int( + hpo.hp.qloguniform( + par, + np.log(domain.lower), + np.log(domain.upper), + quantize, + ) + ) + return hpo.base.pyll.scope.int( + hpo.hp.qloguniform( + par, np.log(domain.lower), np.log(domain.upper - 1), 1.0 + ) + ) + elif isinstance(sampler, Uniform): + if quantize: + return hpo.base.pyll.scope.int( + hpo.hp.quniform( + par, domain.lower, domain.upper - 1, quantize + ) + ) + return hpo.hp.uniformint(par, domain.lower, high=domain.upper - 1) + elif isinstance(domain, Categorical): + if isinstance(sampler, Uniform): + return hpo.hp.choice( + par, + [ + ( + HyperOptSearch.convert_search_space( + category, prefix=par + ) + if isinstance(category, dict) + else ( + HyperOptSearch.convert_search_space( + dict(enumerate(category)), prefix=f"{par}/{i}" + ) + if isinstance(category, list) + and len(category) > 0 + and isinstance(category[0], Domain) + else ( + resolve_value(f"{par}/{i}", category) + if isinstance(category, Domain) + else category + ) + ) + ) + for i, category in enumerate(domain.categories) + ], + ) + + raise ValueError( + 
"HyperOpt does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, type(domain.sampler).__name__ + ) + ) + + for path, domain in domain_vars: + par = "/".join([str(p) for p in ((prefix,) + path if prefix else path)]) + value = resolve_value(par, domain) + assign_value(spec, path, value) + + return spec diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__init__.py b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44ab8e345754da48205fbe712e4bb787a5ddfc2a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__init__.py @@ -0,0 +1,3 @@ +from ray.tune.search.zoopt.zoopt_search import ZOOptSearch + +__all__ = ["ZOOptSearch"] diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3eff046e1da97aa32dc3926fb1a4b720eed788ea Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/zoopt_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/zoopt_search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49fd0a08851f184730cbaf95c2ee08003f3b32bf Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/__pycache__/zoopt_search.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/zoopt_search.py b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/zoopt_search.py new file mode 100644 index 0000000000000000000000000000000000000000..d5ec4e423f97e6b89decd0f55fbcbe3b7ec15a87 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/tune/search/zoopt/zoopt_search.py @@ -0,0 +1,379 @@ +import copy +import logging +from typing import Dict, List, Optional, Tuple + +import ray +import ray.cloudpickle as pickle +from ray.tune.result import DEFAULT_METRIC +from ray.tune.search import ( + UNDEFINED_METRIC_MODE, + UNDEFINED_SEARCH_SPACE, + UNRESOLVED_SEARCH_SPACE, + Searcher, +) +from ray.tune.search.sample import ( + Categorical, + Domain, + Float, + Integer, + Quantized, + Uniform, +) +from ray.tune.search.variant_generator import parse_spec_vars +from ray.tune.utils.util import unflatten_dict + +try: + import zoopt + from zoopt import Solution, ValueType +except ImportError: + zoopt = None + Solution = ValueType = None + +logger = logging.getLogger(__name__) + + +class ZOOptSearch(Searcher): + """A wrapper around ZOOpt to provide trial suggestions. + + ZOOptSearch is a library for derivative-free optimization. It is backed by + the `ZOOpt `__ package. Currently, + Asynchronous Sequential RAndomized COordinate Shrinking (ASRacos) + is implemented in Tune. + + To use ZOOptSearch, install zoopt (>=0.4.1): ``pip install -U zoopt``. + + Tune automatically converts search spaces to ZOOpt"s format: + + .. code-block:: python + + from ray import train, tune + from ray.tune.search.zoopt import ZOOptSearch + + "config": { + "iterations": 10, # evaluation times + "width": tune.uniform(-10, 10), + "height": tune.uniform(-10, 10) + } + + zoopt_search_config = { + "parallel_num": 8, # how many workers to parallel + } + + zoopt_search = ZOOptSearch( + algo="Asracos", # only support Asracos currently + budget=20, # must match `num_samples` in `tune.TuneConfig()`. 
+ dim_dict=dim_dict, + metric="mean_loss", + mode="min", + **zoopt_search_config + ) + + tuner = tune.Tuner( + my_objective, + tune_config=tune.TuneConfig( + search_alg=zoopt_search, + num_samples=20 + ), + run_config=train.RunConfig( + name="zoopt_search", + stop={"timesteps_total": 10} + ), + param_space=config + ) + tuner.fit() + + If you would like to pass the search space manually, the code would + look like this: + + .. code-block:: python + + from ray import train, tune + from ray.tune.search.zoopt import ZOOptSearch + from zoopt import ValueType + + dim_dict = { + "height": (ValueType.CONTINUOUS, [-10, 10], 1e-2), + "width": (ValueType.DISCRETE, [-10, 10], False), + "layers": (ValueType.GRID, [4, 8, 16]) + } + + "config": { + "iterations": 10, # evaluation times + } + + zoopt_search_config = { + "parallel_num": 8, # how many workers to parallel + } + + zoopt_search = ZOOptSearch( + algo="Asracos", # only support Asracos currently + budget=20, # must match `num_samples` in `tune.TuneConfig()`. + dim_dict=dim_dict, + metric="mean_loss", + mode="min", + **zoopt_search_config + ) + + tuner = tune.Tuner( + my_objective, + tune_config=tune.TuneConfig( + search_alg=zoopt_search, + num_samples=20 + ), + run_config=train.RunConfig( + name="zoopt_search", + stop={"timesteps_total": 10} + ), + ) + tuner.fit() + + Parameters: + algo: To specify an algorithm in zoopt you want to use. + Only support ASRacos currently. + budget: Number of samples. + dim_dict: Dimension dictionary. + For continuous dimensions: (continuous, search_range, precision); + For discrete dimensions: (discrete, search_range, has_order); + For grid dimensions: (grid, grid_list). + More details can be found in zoopt package. + metric: The training result objective value attribute. If None + but a mode was passed, the anonymous metric `_metric` will be used + per default. + mode: One of {min, max}. Determines whether objective is + minimizing or maximizing the metric attribute. 
+ points_to_evaluate: Initial parameter suggestions to be run + first. This is for when you already have some good parameters + you want to run first to help the algorithm make better suggestions + for future parameters. Needs to be a list of dicts containing the + configurations. + parallel_num: How many workers to parallel. Note that initial + phase may start less workers than this number. More details can + be found in zoopt package. + """ + + optimizer = None + + def __init__( + self, + algo: str = "asracos", + budget: Optional[int] = None, + dim_dict: Optional[Dict] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + points_to_evaluate: Optional[List[Dict]] = None, + parallel_num: int = 1, + **kwargs + ): + assert ( + zoopt is not None + ), "ZOOpt not found - please install zoopt by `pip install -U zoopt`." + assert budget is not None, "`budget` should not be None!" + if mode: + assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." + _algo = algo.lower() + assert _algo in [ + "asracos", + "sracos", + ], "`algo` must be in ['asracos', 'sracos'] currently" + + self._algo = _algo + + if isinstance(dim_dict, dict) and dim_dict: + resolved_vars, domain_vars, grid_vars = parse_spec_vars(dim_dict) + if domain_vars or grid_vars: + logger.warning( + UNRESOLVED_SEARCH_SPACE.format(par="dim_dict", cls=type(self)) + ) + dim_dict = self.convert_search_space(dim_dict, join=True) + + self._dim_dict = dim_dict + self._budget = budget + + self._metric = metric + if mode == "max": + self._metric_op = -1.0 + elif mode == "min": + self._metric_op = 1.0 + + self._points_to_evaluate = copy.deepcopy(points_to_evaluate) + + self._live_trial_mapping = {} + + self._dim_keys = [] + self.solution_dict = {} + self.best_solution_list = [] + self.optimizer = None + + self.kwargs = kwargs + + self.parallel_num = parallel_num + + super(ZOOptSearch, self).__init__(metric=self._metric, mode=mode) + + if self._dim_dict: + self._setup_zoopt() + + def 
_setup_zoopt(self): + if self._metric is None and self._mode: + # If only a mode was passed, use anonymous metric + self._metric = DEFAULT_METRIC + + _dim_list = [] + for k in self._dim_dict: + self._dim_keys.append(k) + _dim_list.append(self._dim_dict[k]) + + init_samples = None + if self._points_to_evaluate: + logger.warning( + "`points_to_evaluate` is ignored by ZOOpt in versions <= 0.4.1." + ) + init_samples = [ + Solution(x=tuple(point[dim] for dim in self._dim_keys)) + for point in self._points_to_evaluate + ] + dim = zoopt.Dimension2(_dim_list) + par = zoopt.Parameter(budget=self._budget, init_samples=init_samples) + if self._algo == "sracos" or self._algo == "asracos": + from zoopt.algos.opt_algorithms.racos.sracos import SRacosTune + + self.optimizer = SRacosTune( + dimension=dim, + parameter=par, + parallel_num=self.parallel_num, + **self.kwargs + ) + + def set_search_properties( + self, metric: Optional[str], mode: Optional[str], config: Dict, **spec + ) -> bool: + if self._dim_dict: + return False + space = self.convert_search_space(config) + self._dim_dict = space + + if metric: + self._metric = metric + if mode: + self._mode = mode + + if self._mode == "max": + self._metric_op = -1.0 + elif self._mode == "min": + self._metric_op = 1.0 + + self._setup_zoopt() + return True + + def suggest(self, trial_id: str) -> Optional[Dict]: + if not self._dim_dict or not self.optimizer: + raise RuntimeError( + UNDEFINED_SEARCH_SPACE.format( + cls=self.__class__.__name__, space="dim_dict" + ) + ) + if not self._metric or not self._mode: + raise RuntimeError( + UNDEFINED_METRIC_MODE.format( + cls=self.__class__.__name__, metric=self._metric, mode=self._mode + ) + ) + + _solution = self.optimizer.suggest() + + if _solution == "FINISHED": + if ray.__version__ >= "0.8.7": + return Searcher.FINISHED + else: + return None + + if _solution: + self.solution_dict[str(trial_id)] = _solution + _x = _solution.get_x() + new_trial = dict(zip(self._dim_keys, _x)) + 
self._live_trial_mapping[trial_id] = new_trial + return unflatten_dict(new_trial) + + def on_trial_complete( + self, trial_id: str, result: Optional[Dict] = None, error: bool = False + ): + """Notification for the completion of trial.""" + if result: + _solution = self.solution_dict[str(trial_id)] + _best_solution_so_far = self.optimizer.complete( + _solution, self._metric_op * result[self._metric] + ) + if _best_solution_so_far: + self.best_solution_list.append(_best_solution_so_far) + + del self._live_trial_mapping[trial_id] + + def save(self, checkpoint_path: str): + save_object = self.__dict__ + with open(checkpoint_path, "wb") as outputFile: + pickle.dump(save_object, outputFile) + + def restore(self, checkpoint_path: str): + with open(checkpoint_path, "rb") as inputFile: + save_object = pickle.load(inputFile) + self.__dict__.update(save_object) + + @staticmethod + def convert_search_space(spec: Dict, join: bool = False) -> Dict[str, Tuple]: + spec = copy.deepcopy(spec) + resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec) + + if not domain_vars and not grid_vars: + return {} + + if grid_vars: + raise ValueError( + "Grid search parameters cannot be automatically converted " + "to a ZOOpt search space." 
+ ) + + def resolve_value(domain: Domain) -> Tuple: + quantize = None + + sampler = domain.get_sampler() + if isinstance(sampler, Quantized): + quantize = sampler.q + sampler = sampler.sampler + + if isinstance(domain, Float): + precision = quantize or 1e-12 + if isinstance(sampler, Uniform): + return ( + ValueType.CONTINUOUS, + [domain.lower, domain.upper], + precision, + ) + + elif isinstance(domain, Integer): + if isinstance(sampler, Uniform): + return (ValueType.DISCRETE, [domain.lower, domain.upper - 1], True) + + elif isinstance(domain, Categorical): + # Categorical variables would use ValueType.DISCRETE with + # has_partial_order=False, however, currently we do not + # keep track of category values and cannot automatically + # translate back and forth between them. + if isinstance(sampler, Uniform): + return (ValueType.GRID, domain.categories) + + raise ValueError( + "ZOOpt does not support parameters of type " + "`{}` with samplers of type `{}`".format( + type(domain).__name__, type(domain.sampler).__name__ + ) + ) + + conv_spec = { + "/".join(path): resolve_value(domain) for path, domain in domain_vars + } + + if join: + spec.update(conv_spec) + conv_spec = spec + + return conv_spec