diff --git a/.gitattributes b/.gitattributes index 403db22d381680a1ef963b6288d40f01393ee1b8..2180227b7a0e6b7d31765bb94941cf79afa1d08b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -156,3 +156,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/serve/_private/__pycache__/deployment_state.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/xgrammar/xgrammar_bindings.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/filelock.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/filelock.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffbb56ba0908e66c0af7b59e842fd61d1de402b7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/filelock.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/json.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/json.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcdcedb59c8f4ae296e96f52ec1d763d20e63cd7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/json.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/mlflow.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/mlflow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fdd64ba737849c0194a6837a915d44563968cf5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/mlflow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/tensorflow_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/tensorflow_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c5e15f23f73a52241fb44a2457b42894692f857 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/tensorflow_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/torch_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/torch_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b508ce78ec47c3be0ece3ed40bf5782c5cee52e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/torch_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/uri_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/uri_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a93bc5d057266cadbbeea4e45cbd78909ad78e9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/uri_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/usage.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/usage.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d13ac60ae989d4bcad03734e87a4296629c9478 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/usage.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/util.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/util.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..310014ad6aa8c7a59ea0bdc91dbcb79d06da6b3c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/util.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/config.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1df8f77acca6eea43f66ac2bf2e8d2d5b471aa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/config.py @@ -0,0 +1,47 @@ +import dataclasses +from typing import Iterable + + +def ensure_only_allowed_dataclass_keys_updated( + dataclass: dataclasses.dataclass, + allowed_keys: Iterable[str], +): + """ + Validate dataclass by raising an exception if any key not included in + ``allowed_keys`` differs from the default value. + + A ``ValueError`` will also be raised if any of the ``allowed_keys`` + is not present in ``dataclass.__dict__``. + + Args: + dataclass: Dict or dataclass to check. + allowed_keys: dataclass attribute keys that can have a value different than + the default one. + """ + default_data = dataclass.__class__() + + allowed_keys = set(allowed_keys) + + # TODO: split keys_not_in_dict validation to a separate function. + keys_not_in_dict = [key for key in allowed_keys if key not in default_data.__dict__] + if keys_not_in_dict: + raise ValueError( + f"Key(s) {keys_not_in_dict} are not present in " + f"{dataclass.__class__.__name__}. " + "Remove them from `allowed_keys`. 
" + f"Valid keys: {list(default_data.__dict__.keys())}" + ) + + # These keys should not have been updated in the `dataclass` object + prohibited_keys = set(default_data.__dict__) - allowed_keys + + bad_keys = [ + key + for key in prohibited_keys + if dataclass.__dict__[key] != default_data.__dict__[key] + ] + if bad_keys: + raise ValueError( + f"Key(s) {bad_keys} are not allowed to be updated in the current context. " + "Remove them from the dataclass." + ) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..824833df03ffe9e03f583856b9336d7610386d91 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__init__.py @@ -0,0 +1,92 @@ +import logging +import threading +from typing import Optional + +import ray +import ray._private.ray_constants as ray_constants +from ray.air._internal.device_manager.cpu import CPUTorchDeviceManager +from ray.air._internal.device_manager.hpu import HPUTorchDeviceManager +from ray.air._internal.device_manager.npu import NPUTorchDeviceManager +from ray.air._internal.device_manager.nvidia_gpu import CUDATorchDeviceManager +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager + +logger = logging.getLogger(__name__) + + +DEFAULT_TORCH_DEVICE_MANAGER_CLS = CPUTorchDeviceManager + + +SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER = { + ray_constants.GPU: CUDATorchDeviceManager, + ray_constants.HPU: HPUTorchDeviceManager, + ray_constants.NPU: NPUTorchDeviceManager, +} + + +def register_custom_torch_dist_backend(backend: Optional[str] = None) -> None: + if backend == "hccl": + # The name for the communication backend of Habana and torch-npu is the same. 
+ HPUTorchDeviceManager.register_custom_torch_dist_backend() + + NPUTorchDeviceManager.register_custom_torch_dist_backend() + + +_torch_device_manager = None +_torch_device_manager_lock = threading.Lock() + + +def get_torch_device_manager_by_context() -> TorchDeviceManager: + global _torch_device_manager + + with _torch_device_manager_lock: + if not _torch_device_manager: + existing_device_manager_cls = None + resources = ray.get_runtime_context().get_accelerator_ids() + + # select correct accelerator type from resources + for resource_type, resource_value in resources.items(): + device_manager_cls = SUPPORTED_ACCELERATOR_TORCH_DEVICE_MANAGER.get( + resource_type, None + ) + if resource_value and device_manager_cls: + # An error will raise when multiple accelerators are specified. + if existing_device_manager_cls: + raise RuntimeError( + "Unable to determine the appropriate DeviceManager " + f"for the specified resources {resources}." + ) + else: + existing_device_manager_cls = device_manager_cls + + device_manager_cls = ( + existing_device_manager_cls or DEFAULT_TORCH_DEVICE_MANAGER_CLS + ) + + _torch_device_manager = device_manager_cls() + + return _torch_device_manager + + +def get_torch_device_manager_by_device_type(device_type: str): + if device_type.lower() == ray_constants.GPU.lower() or device_type == "cuda": + return CUDATorchDeviceManager() + elif device_type.lower() == ray_constants.NPU.lower(): + return NPUTorchDeviceManager() + elif device_type.lower() == ray_constants.HPU.lower(): + return HPUTorchDeviceManager() + elif device_type.lower() == "cpu": + return CPUTorchDeviceManager() + + raise RuntimeError(f"Device type {device_type} cannot be recognized.") + + +__all__ = [ + TorchDeviceManager, + CPUTorchDeviceManager, + CUDATorchDeviceManager, + HPUTorchDeviceManager, + NPUTorchDeviceManager, + register_custom_torch_dist_backend, + get_torch_device_manager_by_context, + get_torch_device_manager_by_device_type, +] diff --git 
a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d283eda018bca41c23fa83b9062465ef1720807d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/cpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/cpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7634ac0efbc54cb216413e355403a7140ea1305 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/cpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/hpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/hpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05f53c5322cd93b66d2fd3b27324413e21342102 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/hpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/npu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/npu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3ec345653d67bb22bab6cea81a29defb1d0d4a2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/npu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/nvidia_gpu.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/nvidia_gpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f48476036b521689254980ed98cd3700c0892406 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/nvidia_gpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/torch_device_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/torch_device_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8fb155f0c5d446eb4def07e82cb9676a32289ec Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/__pycache__/torch_device_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/cpu.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..76fa73765287c6bf68a0b2f6e7d4130297b799df --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/cpu.py @@ -0,0 +1,30 @@ +from contextlib import contextmanager +from typing import List + +import torch + +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager + + +class CPUTorchDeviceManager(TorchDeviceManager): + """CPU device manager""" + + def is_available(self) -> bool(): + return True + + def get_devices(self) -> List[torch.device]: + """Gets the correct torch device list configured for this process.""" + return [torch.device("cpu")] + + def supports_stream(self) -> bool: + """Validate if the device type support create a stream""" + return False + + def get_stream_context(self, stream): + """Return empty context mananger for CPU.""" + + @contextmanager + def default_context_manager(): + yield + + return 
default_context_manager() diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/hpu.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/hpu.py new file mode 100644 index 0000000000000000000000000000000000000000..bb402ea65b0d9378cb10d70b50bab3588be79102 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/hpu.py @@ -0,0 +1,50 @@ +from contextlib import contextmanager +from typing import List, Union + +import torch + +from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager + +if HPU_PACKAGE_AVAILABLE: + import habana_frameworks.torch.hpu as torch_hpu + + +class HPUTorchDeviceManager(TorchDeviceManager): + """HPU device manager""" + + @staticmethod + def register_custom_torch_dist_backend(): + if HPU_PACKAGE_AVAILABLE: + import habana_frameworks.torch.core # noqa: F401 + import habana_frameworks.torch.distributed.hccl # noqa: F401 + + def is_available(self) -> bool(): + if not HPU_PACKAGE_AVAILABLE: + return False + + return torch_hpu.is_available() + + def get_devices(self) -> List[torch.device]: + if not self.is_available(): + raise RuntimeError( + "Using HPUTorchDeviceManager but torch hpu is not available." 
+ ) + + return [torch.device("hpu")] + + def set_device(self, device: Union[torch.device, int, str, None]): + torch_hpu.set_device(device) + + def supports_stream(self) -> bool: + """Validate if the device type support create a stream""" + return False + + def get_stream_context(self, stream): + """Get HPU stream context manager, empty so far.""" + + @contextmanager + def default_context_manager(): + yield + + return default_context_manager() diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/npu.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/npu.py new file mode 100644 index 0000000000000000000000000000000000000000..aa6d7bad24081917020b88e4ad90e4335cc631ca --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/npu.py @@ -0,0 +1,105 @@ +import os +from importlib.util import find_spec +from typing import List, Union + +import torch + +import ray +import ray._private.ray_constants as ray_constants +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager + + +def is_package_present(package_name: str) -> bool: + try: + return find_spec(package_name) is not None + except ModuleNotFoundError: + return False + + +NPU_TORCH_PACKAGE_AVAILABLE = is_package_present("torch_npu") + + +if NPU_TORCH_PACKAGE_AVAILABLE: + import torch_npu # noqa: F401 + + +class NPUTorchDeviceManager(TorchDeviceManager): + """Ascend NPU device manager""" + + @staticmethod + def register_custom_torch_dist_backend(): + if NPU_TORCH_PACKAGE_AVAILABLE: + import torch_npu # noqa: F401, F811 + + def is_available(self) -> bool: + if not NPU_TORCH_PACKAGE_AVAILABLE: + return False + + return torch.npu.is_available() + + def get_devices(self) -> List[torch.device]: + """Gets the correct torch device list configured for this process. + + Returns a list of torch NPU devices allocated for the current worker. + If no NPUs are assigned, then it returns a list with a single CPU device. 
+ """ + if NPU_TORCH_PACKAGE_AVAILABLE and torch.npu.is_available(): + npu_ids = [ + str(id) + for id in ray.get_runtime_context().get_accelerator_ids()[ + ray_constants.NPU + ] + ] + + device_ids = [] + + if len(npu_ids) > 0: + npu_visible_str = os.environ.get( + ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, "" + ) + if npu_visible_str and npu_visible_str != "NoDevFiles": + npu_visible_list = npu_visible_str.split(",") + else: + npu_visible_list = [] + + for npu_id in npu_ids: + try: + device_ids.append(npu_visible_list.index(npu_id)) + except IndexError: + raise RuntimeError( + "ASCEND_RT_VISIBLE_DEVICES set incorrectly. " + f"Got {npu_visible_str}, expected to include {npu_id}. " + "Did you override the `ASCEND_RT_VISIBLE_DEVICES` " + "environment variable?" + ) + else: + # If called on the driver or outside of Ray Train, return the + # 0th device. + device_ids.append(0) + + devices = [torch.device(f"npu:{device_id}") for device_id in device_ids] + else: + raise RuntimeError( + "Using NPUTorchDeviceManager but torch npu is not available." 
+ ) + + return devices + + def set_device(self, device: Union[torch.device, int]): + torch.npu.set_device(device) + + def supports_stream(self) -> bool: + """Validate if the device type support to create a stream""" + return True + + def create_stream(self, device): + """Create a stream on NPU device""" + return torch.npu.Stream(device) + + def get_stream_context(self, stream): + """Get a torch.stream context on NPU device""" + return torch.npu.stream(stream) + + def get_current_stream(self): + """Get current stream for NPU device""" + return torch.npu.current_stream() diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/nvidia_gpu.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/nvidia_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..f4bb1b54097e153cc7784c849c46331478d87988 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/nvidia_gpu.py @@ -0,0 +1,79 @@ +import os +from typing import List, Union + +import torch + +import ray +from ray.air._internal.device_manager.torch_device_manager import TorchDeviceManager + + +class CUDATorchDeviceManager(TorchDeviceManager): + """CUDA device manager""" + + def is_available(self) -> bool(): + return torch.cuda.is_available() + + def get_devices(self) -> List[torch.device]: + """Gets the correct torch device list configured for this process. + + Returns a list of torch CUDA devices allocated for the current worker. + If no GPUs are assigned, then it returns a list with a single CPU device. + + Assumes that `CUDA_VISIBLE_DEVICES` is set and is a + superset of the `ray.get_gpu_ids()`. + """ + + # GPU IDs are assigned by Ray after you specify "use_gpu" + # GPU `ray.get_gpu_ids()` may return ints or may return strings. + # We should always convert to strings. 
+ gpu_ids = [str(id) for id in ray.get_gpu_ids()] + + device_ids = [] + + if len(gpu_ids) > 0: + cuda_visible_str = os.environ.get("CUDA_VISIBLE_DEVICES", "") + if cuda_visible_str and cuda_visible_str != "NoDevFiles": + cuda_visible_list = cuda_visible_str.split(",") + else: + cuda_visible_list = [] + + # By default, there should only be one GPU ID if `use_gpu=True`. + # If there are multiple GPUs, return a list of devices. + # If using fractional GPUs, these IDs are not guaranteed + # to be unique across different processes. + for gpu_id in gpu_ids: + try: + device_ids.append(cuda_visible_list.index(gpu_id)) + except IndexError: + raise RuntimeError( + "CUDA_VISIBLE_DEVICES set incorrectly. " + f"Got {cuda_visible_str}, expected to include {gpu_id}. " + "Did you override the `CUDA_VISIBLE_DEVICES` environment" + " variable? If not, please help file an issue on Github." + ) + + else: + # If called on the driver or outside of Ray Train, return the + # 0th device. + device_ids.append(0) + + return [torch.device(f"cuda:{device_id}") for device_id in device_ids] + + def set_device(self, device: Union[torch.device, int, str, None]): + torch.cuda.set_device(device) + + def supports_stream(self) -> bool: + """Validate if the device type support create a stream""" + return True + + def create_stream(self, device: torch.device) -> torch.cuda.Stream: + """Create a stream on cuda device""" + return torch.cuda.Stream(device) + + def get_stream_context(self, stream): + """Get a stream context for cuda device""" + return torch.cuda.stream(stream) + + def get_current_stream(self) -> torch.cuda.Stream: + """Get current stream for cuda device""" + return torch.cuda.current_stream() diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/torch_device_manager.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/torch_device_manager.py new file mode 100644 index 
0000000000000000000000000000000000000000..d522a477ef58a5021300eab3415abe99d3079213 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/device_manager/torch_device_manager.py @@ -0,0 +1,40 @@ +from abc import ABC +from typing import List, Union + +import torch + + +class TorchDeviceManager(ABC): + """This class contains the function needed for supporting + an acclerator family in Ray AI Library. + """ + + def is_available(self) -> bool: + """Validate if device is available.""" + ... + + def get_devices(self) -> List[torch.device]: + """Gets the correct torch device configured for this process""" + ... + + def set_device(self, device: Union[torch.device, int, str, None]): + """Set the correct device for this process""" + ... + + def supports_stream(self) -> bool: + """Validate if the device type support create a stream""" + ... + + def create_stream(self, device: torch.device): + """Create a device stream""" + ... + + def get_stream_context(self, stream): + """Get a stream context of device. If device didn't support stream, + this should return a empty context manager instead of None. + """ + ... + + def get_current_stream(self): + """Get current stream on accelerators like torch.cuda.current_stream""" + ... diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/filelock.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/filelock.py new file mode 100644 index 0000000000000000000000000000000000000000..9dd86d023e264f7328dad53dc0417657d246872b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/filelock.py @@ -0,0 +1,46 @@ +import hashlib +import os +from pathlib import Path + +from filelock import FileLock + +import ray + +RAY_LOCKFILE_DIR = "_ray_lockfiles" + + +class TempFileLock: + """FileLock wrapper that uses temporary file locks. + + The temporary directory that these locks are saved to can be configured via + the `RAY_TMPDIR` environment variable. 
+ + Args: + path: The file path that this temporary file lock is used for. + This will be used to generate the lockfile filename. + Ex: For concurrent writes to a file, this is the common filepath + that multiple processes are writing to. + **kwargs: Additional keyword arguments to pass to the underlying `FileLock`. + """ + + def __init__(self, path: str, **kwargs): + self.path = path + temp_dir = Path(ray._private.utils.get_user_temp_dir()).resolve() + self._lock_dir = temp_dir / RAY_LOCKFILE_DIR + self._path_hash = hashlib.sha1( + str(Path(self.path).resolve()).encode("utf-8") + ).hexdigest() + self._lock_path = self._lock_dir / f"{self._path_hash}.lock" + + os.makedirs(str(self._lock_dir), exist_ok=True) + self._lock = FileLock(self._lock_path, **kwargs) + + def __enter__(self): + self._lock.acquire() + return self + + def __exit__(self, type, value, traceback): + self._lock.release() + + def __getattr__(self, name): + return getattr(self._lock, name) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/json.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/json.py new file mode 100644 index 0000000000000000000000000000000000000000..2e88824e7109106a54020e3d5eec2629043f3ee1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/json.py @@ -0,0 +1,31 @@ +import json +import numbers + +import numpy as np + + +class SafeFallbackEncoder(json.JSONEncoder): + def __init__(self, nan_str="null", **kwargs): + super(SafeFallbackEncoder, self).__init__(**kwargs) + self.nan_str = nan_str + + def default(self, value): + try: + if type(value).__module__ == np.__name__ and isinstance(value, np.ndarray): + return value.tolist() + + if isinstance(value, np.bool_): + return bool(value) + + if np.isnan(value): + return self.nan_str + + if issubclass(type(value), numbers.Integral): + return int(value) + if issubclass(type(value), numbers.Number): + return float(value) + + return super(SafeFallbackEncoder, self).default(value) + + except 
Exception: + return str(value) # give up, just stringify it (ok for logs) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/mlflow.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/mlflow.py new file mode 100644 index 0000000000000000000000000000000000000000..727318ce839ff0bf0a78bca5346c66e44b245b75 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/mlflow.py @@ -0,0 +1,342 @@ +import logging +import os +from copy import deepcopy +from typing import TYPE_CHECKING, Dict, Optional + +from packaging import version + +from ray._private.dict import flatten_dict + +if TYPE_CHECKING: + from mlflow.entities import Run + from mlflow.tracking import MlflowClient + +logger = logging.getLogger(__name__) + + +class _MLflowLoggerUtil: + """Util class for setting up and logging to MLflow. + + Use this util for any library that needs MLflow logging/tracking logic + such as Ray Tune or Ray Train. + """ + + def __init__(self): + import mlflow + + self._mlflow = mlflow + self.experiment_id = None + + def __deepcopy__(self, memo=None): + # mlflow is a module, and thus cannot be copied + _mlflow = self._mlflow + self.__dict__.pop("_mlflow") + dict_copy = deepcopy(self.__dict__, memo) + copied_object = _MLflowLoggerUtil() + copied_object.__dict__.update(dict_copy) + self._mlflow = _mlflow + copied_object._mlflow = _mlflow + return copied_object + + def setup_mlflow( + self, + tracking_uri: Optional[str] = None, + registry_uri: Optional[str] = None, + experiment_id: Optional[str] = None, + experiment_name: Optional[str] = None, + tracking_token: Optional[str] = None, + artifact_location: Optional[str] = None, + create_experiment_if_not_exists: bool = True, + ): + """ + Sets up MLflow. + + Sets the Mlflow tracking uri & token, and registry URI. Also sets + the MLflow experiment that the logger should use, and possibly + creates new experiment if it does not exist. + + Args: + tracking_uri: The tracking URI for the MLflow tracking + server. 
+ registry_uri: The registry URI for the MLflow model registry. + experiment_id: The id of an already existing MLflow + experiment to use for logging. If None is passed in + here and the MFLOW_EXPERIMENT_ID is not set, or the + experiment with this id does not exist, + ``experiment_name`` will be used instead. This argument takes + precedence over ``experiment_name`` if both are passed in. + experiment_name: The experiment name to use for logging. + If None is passed in here, the MLFLOW_EXPERIMENT_NAME environment + variable is used to determine the experiment name. + If the experiment with the name already exists with MLflow, + it will be reused. If not, a new experiment will be created + with the provided name if + ``create_experiment_if_not_exists`` is set to True. + artifact_location: The location to store run artifacts. + If not provided, MLFlow picks an appropriate default. + Ignored if experiment already exists. + tracking_token: Tracking token used to authenticate with MLflow. + create_experiment_if_not_exists: Whether to create an + experiment with the provided name if it does not already + exist. Defaults to True. + + Returns: + Whether setup is successful. + """ + if tracking_token: + os.environ["MLFLOW_TRACKING_TOKEN"] = tracking_token + + self._mlflow.set_tracking_uri(tracking_uri) + self._mlflow.set_registry_uri(registry_uri) + + # First check experiment_id. + experiment_id = ( + experiment_id + if experiment_id is not None + else os.environ.get("MLFLOW_EXPERIMENT_ID") + ) + if experiment_id is not None: + from mlflow.exceptions import MlflowException + + try: + self._mlflow.get_experiment(experiment_id=experiment_id) + logger.debug( + f"Experiment with provided id {experiment_id} " + "exists. Setting that as the experiment." + ) + self.experiment_id = experiment_id + return + except MlflowException: + pass + + # Then check experiment_name. 
+ experiment_name = ( + experiment_name + if experiment_name is not None + else os.environ.get("MLFLOW_EXPERIMENT_NAME") + ) + if experiment_name is not None and self._mlflow.get_experiment_by_name( + name=experiment_name + ): + logger.debug( + f"Experiment with provided name {experiment_name} " + "exists. Setting that as the experiment." + ) + self.experiment_id = self._mlflow.get_experiment_by_name( + experiment_name + ).experiment_id + return + + # An experiment with the provided id or name does not exist. + # Create a new experiment if applicable. + if experiment_name and create_experiment_if_not_exists: + logger.debug( + "Existing experiment not found. Creating new " + f"experiment with name: {experiment_name}" + ) + self.experiment_id = self._mlflow.create_experiment( + name=experiment_name, artifact_location=artifact_location + ) + return + + if create_experiment_if_not_exists: + raise ValueError( + f"Experiment with the provided experiment_id: " + f"{experiment_id} does not exist and no " + f"experiment_name provided. At least one of " + f"these has to be provided." + ) + else: + raise ValueError( + f"Experiment with the provided experiment_id: " + f"{experiment_id} or experiment_name: " + f"{experiment_name} does not exist. Please " + f"create an MLflow experiment and provide " + f"either its id or name." + ) + + def _parse_dict(self, dict_to_log: Dict) -> Dict: + """Parses provided dict to convert all values to float. + + MLflow can only log metrics that are floats. This does not apply to + logging parameters or artifacts. + + Args: + dict_to_log: The dictionary containing the metrics to log. + + Returns: + A dictionary containing the metrics to log with all values being + converted to floats, or skipped if not able to be converted. 
+ """ + new_dict = {} + for key, value in dict_to_log.items(): + try: + value = float(value) + new_dict[key] = value + except (ValueError, TypeError): + logger.debug( + "Cannot log key {} with value {} since the " + "value cannot be converted to float.".format(key, value) + ) + continue + + return new_dict + + def start_run( + self, + run_name: Optional[str] = None, + tags: Optional[Dict] = None, + set_active: bool = False, + ) -> "Run": + """Starts a new run and possibly sets it as the active run. + + Args: + tags: Tags to set for the new run. + set_active: Whether to set the new run as the active run. + If an active run already exists, then that run is returned. + + Returns: + The newly created MLflow run. + """ + import mlflow + from mlflow.utils.mlflow_tags import MLFLOW_RUN_NAME + + if tags is None: + tags = {} + + if set_active: + return self._start_active_run(run_name=run_name, tags=tags) + + client = self._get_client() + # If `mlflow==1.30.0` and we don't use `run_name`, then MLflow might error. For + # more information, see #29749. + if version.parse(mlflow.__version__) >= version.parse("1.30.0"): + run = client.create_run( + run_name=run_name, experiment_id=self.experiment_id, tags=tags + ) + else: + tags[MLFLOW_RUN_NAME] = run_name + run = client.create_run(experiment_id=self.experiment_id, tags=tags) + + return run + + def _start_active_run( + self, run_name: Optional[str] = None, tags: Optional[Dict] = None + ) -> "Run": + """Starts a run and sets it as the active run if one does not exist. + + If an active run already exists, then returns it. 
+ """ + active_run = self._mlflow.active_run() + if active_run: + return active_run + + return self._mlflow.start_run( + run_name=run_name, experiment_id=self.experiment_id, tags=tags + ) + + def _run_exists(self, run_id: str) -> bool: + """Check if run with the provided id exists.""" + from mlflow.exceptions import MlflowException + + try: + self._mlflow.get_run(run_id=run_id) + return True + except MlflowException: + return False + + def _get_client(self) -> "MlflowClient": + """Returns an ml.tracking.MlflowClient instance to use for logging.""" + tracking_uri = self._mlflow.get_tracking_uri() + registry_uri = self._mlflow.get_registry_uri() + + from mlflow.tracking import MlflowClient + + return MlflowClient(tracking_uri=tracking_uri, registry_uri=registry_uri) + + def log_params(self, params_to_log: Dict, run_id: Optional[str] = None): + """Logs the provided parameters to the run specified by run_id. + + If no ``run_id`` is passed in, then logs to the current active run. + If there is not active run, then creates a new run and sets it as + the active run. + + Args: + params_to_log: Dictionary of parameters to log. + run_id (Optional[str]): The ID of the run to log to. + """ + params_to_log = flatten_dict(params_to_log) + + if run_id and self._run_exists(run_id): + client = self._get_client() + for key, value in params_to_log.items(): + client.log_param(run_id=run_id, key=key, value=value) + + else: + for key, value in params_to_log.items(): + self._mlflow.log_param(key=key, value=value) + + def log_metrics(self, step, metrics_to_log: Dict, run_id: Optional[str] = None): + """Logs the provided metrics to the run specified by run_id. + + + If no ``run_id`` is passed in, then logs to the current active run. + If there is not active run, then creates a new run and sets it as + the active run. + + Args: + metrics_to_log: Dictionary of metrics to log. + run_id (Optional[str]): The ID of the run to log to. 
+ """ + metrics_to_log = flatten_dict(metrics_to_log) + metrics_to_log = self._parse_dict(metrics_to_log) + + if run_id and self._run_exists(run_id): + client = self._get_client() + for key, value in metrics_to_log.items(): + client.log_metric(run_id=run_id, key=key, value=value, step=step) + + else: + for key, value in metrics_to_log.items(): + self._mlflow.log_metric(key=key, value=value, step=step) + + def save_artifacts(self, dir: str, run_id: Optional[str] = None): + """Saves directory as artifact to the run specified by run_id. + + If no ``run_id`` is passed in, then saves to the current active run. + If there is not active run, then creates a new run and sets it as + the active run. + + Args: + dir: Path to directory containing the files to save. + run_id (Optional[str]): The ID of the run to log to. + """ + if run_id and self._run_exists(run_id): + client = self._get_client() + client.log_artifacts(run_id=run_id, local_dir=dir) + else: + self._mlflow.log_artifacts(local_dir=dir) + + def end_run(self, status: Optional[str] = None, run_id=None): + """Terminates the run specified by run_id. + + If no ``run_id`` is passed in, then terminates the + active run if one exists. + + Args: + status (Optional[str]): The status to set when terminating the run. + run_id (Optional[str]): The ID of the run to terminate. 
+ + """ + if ( + run_id + and self._run_exists(run_id) + and not ( + self._mlflow.active_run() + and self._mlflow.active_run().info.run_id == run_id + ) + ): + client = self._get_client() + client.set_terminated(run_id=run_id, status=status) + else: + self._mlflow.end_run(status=status) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/tensorflow_utils.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/tensorflow_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..46b3c2d1d1b7d5a4b86ffdf46fa6cd8598cca63a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/tensorflow_utils.py @@ -0,0 +1,137 @@ +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +import numpy as np +import pyarrow +import tensorflow as tf + +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed +from ray.air.util.tensor_extensions.arrow import get_arrow_extension_tensor_types + +if TYPE_CHECKING: + from ray.data._internal.pandas_block import PandasBlockSchema + + +def convert_ndarray_to_tf_tensor( + ndarray: np.ndarray, + dtype: Optional[tf.dtypes.DType] = None, + type_spec: Optional[tf.TypeSpec] = None, +) -> tf.Tensor: + """Convert a NumPy ndarray to a TensorFlow Tensor. + + Args: + ndarray: A NumPy ndarray that we wish to convert to a TensorFlow Tensor. + dtype: A TensorFlow dtype for the created tensor; if None, the dtype will be + inferred from the NumPy ndarray data. + type_spec: A type spec that specifies the shape and dtype of the returned + tensor. If you specify ``dtype``, the dtype stored in the type spec is + ignored. + + Returns: A TensorFlow Tensor. 
+ """ + if dtype is None and type_spec is not None: + dtype = type_spec.dtype + + is_ragged = isinstance(type_spec, tf.RaggedTensorSpec) + ndarray = _unwrap_ndarray_object_type_if_needed(ndarray) + if is_ragged: + return tf.ragged.constant(ndarray, dtype=dtype) + else: + return tf.convert_to_tensor(ndarray, dtype=dtype) + + +def convert_ndarray_batch_to_tf_tensor_batch( + ndarrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtypes: Optional[Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]]] = None, +) -> Union[tf.Tensor, Dict[str, tf.Tensor]]: + """Convert a NumPy ndarray batch to a TensorFlow Tensor batch. + + Args: + ndarray: A (dict of) NumPy ndarray(s) that we wish to convert to a TensorFlow + Tensor. + dtype: A (dict of) TensorFlow dtype(s) for the created tensor; if None, the + dtype will be inferred from the NumPy ndarray data. + + Returns: A (dict of) TensorFlow Tensor(s). + """ + if isinstance(ndarrays, np.ndarray): + # Single-tensor case. + if isinstance(dtypes, dict): + if len(dtypes) != 1: + raise ValueError( + "When constructing a single-tensor batch, only a single dtype " + f"should be given, instead got: {dtypes}" + ) + dtypes = next(iter(dtypes.values())) + batch = convert_ndarray_to_tf_tensor(ndarrays, dtypes) + else: + # Multi-tensor case. 
+ batch = { + col_name: convert_ndarray_to_tf_tensor( + col_ndarray, + dtype=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + ) + for col_name, col_ndarray in ndarrays.items() + } + + return batch + + +def get_type_spec( + schema: Union["pyarrow.lib.Schema", "PandasBlockSchema"], + columns: Union[str, List[str]], +) -> Union[tf.TypeSpec, Dict[str, tf.TypeSpec]]: + import pyarrow as pa + + from ray.data.extensions import TensorDtype + + tensor_extension_types = get_arrow_extension_tensor_types() + + assert not isinstance(schema, type) + + dtypes: Dict[str, Union[np.dtype, pa.DataType]] = dict( + zip(schema.names, schema.types) + ) + + def get_dtype(dtype: Union[np.dtype, pa.DataType]) -> tf.dtypes.DType: + if isinstance(dtype, pa.ListType): + dtype = dtype.value_type + if isinstance(dtype, pa.DataType): + dtype = dtype.to_pandas_dtype() + if isinstance(dtype, TensorDtype): + dtype = dtype.element_dtype + res = tf.dtypes.as_dtype(dtype) + return res + + def get_shape(dtype: Union[np.dtype, pa.DataType]) -> Tuple[int, ...]: + shape = (None,) + if isinstance(dtype, tensor_extension_types): + dtype = dtype.to_pandas_dtype() + if isinstance(dtype, pa.ListType): + shape += (None,) + elif isinstance(dtype, TensorDtype): + shape += dtype.element_shape + return shape + + def get_tensor_spec( + dtype: Union[np.dtype, pa.DataType], *, name: str + ) -> tf.TypeSpec: + + shape, dtype = get_shape(dtype), get_dtype(dtype) + # Batch dimension is always `None`. So, if there's more than one `None`-valued + # dimension, then the tensor is ragged. 
+ is_ragged = sum(dim is None for dim in shape) > 1 + if is_ragged: + type_spec = tf.RaggedTensorSpec(shape, dtype=dtype) + else: + type_spec = tf.TensorSpec(shape, dtype=dtype, name=name) + return type_spec + + if isinstance(columns, str): + name, dtype = columns, dtypes[columns] + return get_tensor_spec(dtype, name=name) + + return { + name: get_tensor_spec(dtype, name=name) + for name, dtype in dtypes.items() + if name in columns + } diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/torch_utils.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..caeb27a20a30a4709a270f703fbaff245a040ec6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/torch_utils.py @@ -0,0 +1,294 @@ +import warnings +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import pandas as pd +import torch + +from ray.air._internal.device_manager import get_torch_device_manager_by_context +from ray.air.util.data_batch_conversion import _unwrap_ndarray_object_type_if_needed + + +def get_devices() -> List[torch.device]: + """Gets the correct torch device list configured for this process. + + Returns a list of torch accelerator (GPU, HPU, NPU...) devices allocated for + the current worker. + If no accelerators are assigned, then it returns a list with a single CPU device. + """ + return get_torch_device_manager_by_context().get_devices() + + +def convert_pandas_to_torch_tensor( + data_batch: pd.DataFrame, + columns: Optional[Union[List[str], List[List[str]]]] = None, + column_dtypes: Optional[Union[torch.dtype, List[torch.dtype]]] = None, + unsqueeze: bool = True, +) -> Union[torch.Tensor, List[torch.Tensor]]: + """Converts a Pandas dataframe to a torch Tensor or list of torch Tensors. + + The format of the return type will match the format of ``columns``. If a + list of columns is provided, the return type will be a single tensor. 
If + ``columns`` is a list of lists, then the return type will be a list of + tensors. + + Args: + data_batch: The pandas dataframe to convert to a + torch tensor. + columns: + The names of the columns in the dataframe to include in the + torch tensor. If this arg is a List[List[str]], then the return + type will be a List of tensors. This is useful for multi-input + models. If None, then use all columns in the ``data_batch``. + column_dtype: The + torch dtype to use for the tensor. If set to None, + then automatically infer the dtype. + unsqueeze: If set to True, the tensors + will be unsqueezed (reshaped to (N, 1)) before being concatenated into + the final tensor. Otherwise, they will be left as is, that is + (N, ). Defaults to True. + + Returns: + Either a torch tensor of size (N, len(columns)) where N is the + number of rows in the ``data_batch`` Dataframe, or a list of + tensors, where the size of item i is (N, len(columns[i])). + + """ + + multi_input = columns and (isinstance(columns[0], (list, tuple))) + + if not multi_input and column_dtypes and not isinstance(column_dtypes, torch.dtype): + raise TypeError( + "If `columns` is a list of strings, " + "`column_dtypes` must be None or a single `torch.dtype`." + f"Got {type(column_dtypes)} instead." + ) + + columns = columns if columns else [] + + def tensorize(vals, dtype): + """This recursive function allows to convert pyarrow List dtypes + to multi-dimensional tensors.""" + if isinstance(vals, pd.api.extensions.ExtensionArray): + # torch.as_tensor() does not yet support the __array__ protocol, so we need + # to convert extension arrays to ndarrays manually before converting to a + # Torch tensor. + # See https://github.com/pytorch/pytorch/issues/51156. + vals = vals.to_numpy() + + if vals.dtype.type is np.object_: + # Column has an object dtype which Torch can't handle, so we try to + # tensorize each column element and then stack the resulting tensors. 
+ tensors = [tensorize(x, dtype) for x in vals] + try: + return torch.stack(tensors) + except RuntimeError: + # NOTE: RuntimeError is raised when trying to stack ragged tensors. + # Try to coerce the tensor to a nested tensor, if possible. + # If this fails, the exception will be propagated up to the caller. + return torch.nested_tensor(tensors) + else: + return torch.as_tensor(vals, dtype=dtype) + + def get_tensor_for_columns(columns, dtype): + feature_tensors = [] + + if columns: + batch = data_batch[columns] + else: + batch = data_batch + + for col in batch.columns: + col_vals = batch[col].values + try: + t = tensorize(col_vals, dtype=dtype) + except Exception as e: + raise ValueError( + f"Failed to convert column {col} to a Torch Tensor of dtype " + f"{dtype}. See above exception chain for the exact failure." + ) from e + if unsqueeze: + t = t.unsqueeze(1) + feature_tensors.append(t) + + if len(feature_tensors) > 1: + feature_tensor = torch.cat(feature_tensors, dim=1) + else: + feature_tensor = feature_tensors[0] + return feature_tensor + + if multi_input: + if type(column_dtypes) not in [list, tuple]: + column_dtypes = [column_dtypes] * len(columns) + return [ + get_tensor_for_columns(columns=subcolumns, dtype=dtype) + for subcolumns, dtype in zip(columns, column_dtypes) + ] + else: + return get_tensor_for_columns(columns=columns, dtype=column_dtypes) + + +def convert_ndarray_to_torch_tensor( + ndarray: np.ndarray, + dtype: Optional[torch.dtype] = None, + device: Optional[str] = None, +) -> torch.Tensor: + """Convert a NumPy ndarray to a Torch Tensor. + + Args: + ndarray: A NumPy ndarray that we wish to convert to a Torch Tensor. + dtype: A Torch dtype for the created tensor; if None, the dtype will be + inferred from the NumPy ndarray data. + device: The device on which the tensor(s) should be placed; if None, the Torch + tensor(s) will be constructed on the CPU. + + Returns: A Torch Tensor. 
+ """ + ndarray = _unwrap_ndarray_object_type_if_needed(ndarray) + + # Object dtype cannot be converted into PyTorch Tensor. + if ndarray.dtype.type is np.object_: + raise RuntimeError( + "Numpy array of object dtype cannot be converted to a Torch Tensor. This " + "may because the numpy array is a ragged tensor--it contains items of " + "different sizes. If using `iter_torch_batches()` API, you can pass in a " + "`collate_fn` argument to specify custom logic to convert the Numpy array " + "batch to a Torch tensor batch." + ) + + # The numpy array is not always writeable as it can come from the Ray object store. + # Numpy will throw a verbose warning here, which we suppress, as we don't write + # to the tensors. We also don't want to copy the array to avoid memory overhead. + # Original warning: https://github.com/pytorch/pytorch/blob/v1.13.0/ + # torch/csrc/utils/tensor_numpy.cpp#L198-L206 + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return torch.as_tensor(ndarray, dtype=dtype, device=device) + + +def convert_ndarray_batch_to_torch_tensor_batch( + ndarrays: Union[np.ndarray, Dict[str, np.ndarray]], + dtypes: Optional[Union[torch.dtype, Dict[str, torch.dtype]]] = None, + device: Optional[str] = None, +) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + """Convert a NumPy ndarray batch to a Torch Tensor batch. + + Args: + ndarray: A (dict of) NumPy ndarray(s) that we wish to convert to a Torch Tensor. + dtype: A (dict of) Torch dtype(s) for the created tensor; if None, the dtype + will be inferred from the NumPy ndarray data. + device: The device on which the tensor(s) should be placed; if None, the Torch + tensor(s) will be constructed on the CPU. + + Returns: A (dict of) Torch Tensor(s). + """ + if isinstance(ndarrays, np.ndarray): + # Single-tensor case. 
+ if isinstance(dtypes, dict): + if len(dtypes) != 1: + raise ValueError( + "When constructing a single-tensor batch, only a single dtype " + f"should be given, instead got: {dtypes}" + ) + dtypes = next(iter(dtypes.values())) + batch = convert_ndarray_to_torch_tensor(ndarrays, dtype=dtypes, device=device) + else: + # Multi-tensor case. + batch = { + col_name: convert_ndarray_to_torch_tensor( + col_ndarray, + dtype=dtypes[col_name] if isinstance(dtypes, dict) else dtypes, + device=device, + ) + for col_name, col_ndarray in ndarrays.items() + } + + return batch + + +def load_torch_model( + saved_model: Union[torch.nn.Module, Dict], + model_definition: Optional[torch.nn.Module] = None, +) -> torch.nn.Module: + """Loads a PyTorch model from the provided ``saved_model``. + + ``model_definition`` is only used when ``saved_model`` is + a torch state dict, which will be loaded into ``model_definition``. + Otherwise, ``model_definition`` is discarded. + """ + if isinstance(saved_model, torch.nn.Module): + return saved_model + elif isinstance(saved_model, dict): + if not model_definition: + raise ValueError( + "Attempting to load torch model from a " + "state_dict, but no `model_definition` was " + "provided." + ) + model_definition.load_state_dict(saved_model) + return model_definition + else: + raise ValueError( + f"Saved model is of type {type(saved_model)}. " + f"The model saved in the checkpoint is expected " + f"to be of type `torch.nn.Module`, or a model " + f"state dict of type dict." 
+ ) + + +def contains_tensor(obj): + if isinstance(obj, torch.Tensor): + return True + elif isinstance(obj, dict): + for k, v in obj.items(): + if contains_tensor(k): + return True + if contains_tensor(v): + return True + elif isinstance(obj, (list, tuple)): + for v in obj: + if contains_tensor(v): + return True + return False + + +# Not present in torch<=1.7.0 +# Adapted from https://github.com/pytorch/pytorch/blob/\ +# c18da597e0bb1c1aecc97c77a73fed1849057fa4/torch/nn/modules/utils.py +def consume_prefix_in_state_dict_if_present_not_in_place( + state_dict: Dict[str, Any], prefix: str +) -> Dict[str, Any]: + """Strip the prefix in state_dict, if any and return a new dict. + + Adapted from https://github.com/pytorch/pytorch/blob/\ +c18da597e0bb1c1aecc97c77a73fed1849057fa4/torch/nn/modules/utils.py + The original method modified the dict in-place. + + Args: + state_dict: a state-dict to be loaded to the model. + prefix: prefix. + + """ + copied = False + + for key in state_dict: + if key.startswith(prefix): + newkey = key[len(prefix) :] + if not copied: + # We are doing shallow copies here, so the performance + # impact should be negligible anyway, but this is + # a simple optimization. 
+ state_dict = state_dict.copy() + copied = True + state_dict[newkey] = state_dict.pop(key) + + if "_metadata" in state_dict: + state_dict["_metadata"] = state_dict["_metadata"].copy() + metadata = state_dict["_metadata"] + for key in metadata: + if len(key) == 0: + continue + newkey = key[len(prefix) :] + metadata[newkey] = metadata.pop(key) + + return state_dict diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/uri_utils.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/uri_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..33c9ae7c10a29c57bac8f33a2abfd6678eb4a8de --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/uri_utils.py @@ -0,0 +1,101 @@ +import os +import urllib.parse +from pathlib import Path +from typing import Union + + +class URI: + """Represents a URI, supporting path appending and retrieving parent URIs. + + Example Usage: + + >>> s3_uri = URI("s3://bucket/a?scheme=http¶m=1") + >>> s3_uri + URI + >>> str(s3_uri / "b" / "c") + 's3://bucket/a/b/c?scheme=http¶m=1' + >>> str(s3_uri.parent) + 's3://bucket?scheme=http¶m=1' + >>> str(s3_uri) + 's3://bucket/a?scheme=http¶m=1' + >>> s3_uri.parent.name, s3_uri.name + ('bucket', 'a') + >>> local_path = URI("/tmp/local") + >>> str(local_path) + '/tmp/local' + >>> str(local_path.parent) + '/tmp' + >>> str(local_path / "b" / "c") + '/tmp/local/b/c' + + Args: + uri: The URI to represent. + Ex: s3://bucket?scheme=http&endpoint_override=localhost%3A900 + Ex: file:///a/b/c/d + """ + + def __init__(self, uri: str): + self._parsed = urllib.parse.urlparse(uri) + if not self._parsed.scheme: + # Just treat this as a regular path + self._path = Path(uri) + else: + self._path = Path(os.path.normpath(self._parsed.netloc + self._parsed.path)) + + def rstrip_subpath(self, subpath: Path) -> "URI": + """Returns a new URI that strips the given subpath from the end of this URI. 
+ + Example: + >>> uri = URI("s3://bucket/a/b/c/?param=1") + >>> str(uri.rstrip_subpath(Path("b/c"))) + 's3://bucket/a?param=1' + + >>> uri = URI("/tmp/a/b/c/") + >>> str(uri.rstrip_subpath(Path("/b/c/.//"))) + '/tmp/a' + + """ + assert str(self._path).endswith(str(subpath)), (self._path, subpath) + stripped_path = str(self._path).replace(str(subpath), "") + return URI(self._get_str_representation(self._parsed, stripped_path)) + + @property + def name(self) -> str: + return self._path.name + + @property + def parent(self) -> "URI": + assert self._path.parent != ".", f"{str(self)} has no valid parent URI" + return URI(self._get_str_representation(self._parsed, self._path.parent)) + + @property + def scheme(self) -> str: + return self._parsed.scheme + + @property + def path(self) -> str: + return str(self._path) + + def __truediv__(self, path_to_append): + assert isinstance(path_to_append, str) + return URI( + self._get_str_representation(self._parsed, self._path / path_to_append) + ) + + @classmethod + def _get_str_representation( + cls, parsed_uri: urllib.parse.ParseResult, path: Union[str, Path] + ) -> str: + if not parsed_uri.scheme: + return str(path) + return parsed_uri._replace(netloc=str(path), path="").geturl() + + def __repr__(self): + return f"URI<{str(self)}>" + + def __str__(self): + return self._get_str_representation(self._parsed, self._path) + + +def is_uri(path: str) -> bool: + return bool(urllib.parse.urlparse(path).scheme) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/usage.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/usage.py new file mode 100644 index 0000000000000000000000000000000000000000..64e41a83eb7edadd4320c67a147a93c6a5082212 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/usage.py @@ -0,0 +1,257 @@ +import collections +import json +import os +from enum import Enum +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union + +from ray._private.usage.usage_lib import TagKey, 
record_extra_usage_tag + +if TYPE_CHECKING: + from ray.train._internal.storage import StorageContext + from ray.train.trainer import BaseTrainer + from ray.tune import Callback + from ray.tune.schedulers import TrialScheduler + from ray.tune.search import BasicVariantGenerator, Searcher + + +AIR_TRAINERS = { + "HorovodTrainer", + "LightGBMTrainer", + "TensorflowTrainer", + "TorchTrainer", + "XGBoostTrainer", +} + +# searchers implemented by Ray Tune. +TUNE_SEARCHERS = { + "AxSearch", + "BayesOptSearch", + "TuneBOHB", + "HEBOSearch", + "HyperOptSearch", + "NevergradSearch", + "OptunaSearch", + "ZOOptSearch", +} + +# These are just wrappers around real searchers. +# We don't want to double tag in this case, otherwise, the real tag +# will be overwritten. +TUNE_SEARCHER_WRAPPERS = { + "ConcurrencyLimiter", + "Repeater", +} + +TUNE_SCHEDULERS = { + "FIFOScheduler", + "AsyncHyperBandScheduler", + "MedianStoppingRule", + "HyperBandScheduler", + "HyperBandForBOHB", + "PopulationBasedTraining", + "PopulationBasedTrainingReplay", + "PB2", + "ResourceChangingScheduler", +} + + +class AirEntrypoint(Enum): + TUNER = "Tuner.fit" + TRAINER = "Trainer.fit" + TUNE_RUN = "tune.run" + TUNE_RUN_EXPERIMENTS = "tune.run_experiments" + + +def _find_class_name(obj, allowed_module_path_prefix: str, whitelist: Set[str]): + """Find the class name of the object. If the object is not + under `allowed_module_path_prefix` or if its class is not in the whitelist, + return "Custom". + + Args: + obj: The object under inspection. + allowed_module_path_prefix: If the `obj`'s class is not under + the `allowed_module_path_prefix`, its class name will be anonymized. + whitelist: If the `obj`'s class is not in the `whitelist`, + it will be anonymized. + Returns: + The class name to be tagged with telemetry. 
+ """ + module_path = obj.__module__ + cls_name = obj.__class__.__name__ + if module_path.startswith(allowed_module_path_prefix) and cls_name in whitelist: + return cls_name + else: + return "Custom" + + +def tag_air_trainer(trainer: "BaseTrainer"): + from ray.train.trainer import BaseTrainer + + assert isinstance(trainer, BaseTrainer) + trainer_name = _find_class_name(trainer, "ray.train", AIR_TRAINERS) + record_extra_usage_tag(TagKey.AIR_TRAINER, trainer_name) + + +def tag_searcher(searcher: Union["BasicVariantGenerator", "Searcher"]): + from ray.tune.search import BasicVariantGenerator, Searcher + + if isinstance(searcher, BasicVariantGenerator): + # Note this could be highly inflated as all train flows are treated + # as using BasicVariantGenerator. + record_extra_usage_tag(TagKey.TUNE_SEARCHER, "BasicVariantGenerator") + elif isinstance(searcher, Searcher): + searcher_name = _find_class_name( + searcher, "ray.tune.search", TUNE_SEARCHERS.union(TUNE_SEARCHER_WRAPPERS) + ) + if searcher_name in TUNE_SEARCHER_WRAPPERS: + # ignore to avoid double tagging with wrapper name. + return + record_extra_usage_tag(TagKey.TUNE_SEARCHER, searcher_name) + else: + assert False, ( + "Not expecting a non-BasicVariantGenerator, " + "non-Searcher type passed in for `tag_searcher`." 
+ ) + + +def tag_scheduler(scheduler: "TrialScheduler"): + from ray.tune.schedulers import TrialScheduler + + assert isinstance(scheduler, TrialScheduler) + scheduler_name = _find_class_name(scheduler, "ray.tune.schedulers", TUNE_SCHEDULERS) + record_extra_usage_tag(TagKey.TUNE_SCHEDULER, scheduler_name) + + +def tag_setup_wandb(): + record_extra_usage_tag(TagKey.AIR_SETUP_WANDB_INTEGRATION_USED, "1") + + +def tag_setup_mlflow(): + record_extra_usage_tag(TagKey.AIR_SETUP_MLFLOW_INTEGRATION_USED, "1") + + +def _count_callbacks(callbacks: Optional[List["Callback"]]) -> Dict[str, int]: + """Creates a map of callback class name -> count given a list of callbacks.""" + from ray.air.integrations.comet import CometLoggerCallback + from ray.air.integrations.mlflow import MLflowLoggerCallback + from ray.air.integrations.wandb import WandbLoggerCallback + from ray.tune import Callback + from ray.tune.logger import LoggerCallback + from ray.tune.logger.aim import AimLoggerCallback + from ray.tune.utils.callback import DEFAULT_CALLBACK_CLASSES + + built_in_callbacks = ( + WandbLoggerCallback, + MLflowLoggerCallback, + CometLoggerCallback, + AimLoggerCallback, + ) + DEFAULT_CALLBACK_CLASSES + + callback_names = [callback_cls.__name__ for callback_cls in built_in_callbacks] + callback_counts = collections.defaultdict(int) + + callbacks = callbacks or [] + for callback in callbacks: + if not isinstance(callback, Callback): + # This will error later, but don't include this as custom usage. + continue + + callback_name = callback.__class__.__name__ + + if callback_name in callback_names: + callback_counts[callback_name] += 1 + elif isinstance(callback, LoggerCallback): + callback_counts["CustomLoggerCallback"] += 1 + else: + callback_counts["CustomCallback"] += 1 + + return callback_counts + + +def tag_callbacks(callbacks: Optional[List["Callback"]]) -> bool: + """Records built-in callback usage via a JSON str representing a + dictionary mapping callback class name -> counts. 
+ + User-defined callbacks will increment the count under the `CustomLoggerCallback` + or `CustomCallback` key depending on which of the provided interfaces they subclass. + NOTE: This will NOT track the name of the user-defined callback, + nor its implementation. + + This will NOT report telemetry if no callbacks are provided by the user. + + Returns: + bool: True if usage was recorded, False otherwise. + """ + if not callbacks: + # User didn't pass in any callbacks -> no usage recorded. + return False + + callback_counts = _count_callbacks(callbacks) + + if callback_counts: + callback_counts_str = json.dumps(callback_counts) + record_extra_usage_tag(TagKey.AIR_CALLBACKS, callback_counts_str) + + +def tag_storage_type(storage: "StorageContext"): + """Records the storage configuration of an experiment. + + The storage configuration is set by `RunConfig(storage_path, storage_filesystem)`. + + The possible storage types (defined by `pyarrow.fs.FileSystem.type_name`) are: + - 'local' = pyarrow.fs.LocalFileSystem. This includes NFS usage. + - 'mock' = pyarrow.fs._MockFileSystem. This is used for testing. + - ('s3', 'gcs', 'abfs', 'hdfs'): Various remote storage schemes + with default implementations in pyarrow. + - 'custom' = All other storage schemes, which includes ALL cases where a + custom `storage_filesystem` is provided. + - 'other' = catches any other cases not explicitly handled above. + """ + whitelist = {"local", "mock", "s3", "gcs", "abfs", "hdfs"} + + if storage.custom_fs_provided: + storage_config_tag = "custom" + elif storage.storage_filesystem.type_name in whitelist: + storage_config_tag = storage.storage_filesystem.type_name + else: + storage_config_tag = "other" + + record_extra_usage_tag(TagKey.AIR_STORAGE_CONFIGURATION, storage_config_tag) + + +def tag_ray_air_env_vars() -> bool: + """Records usage of environment variables exposed by the Ray AIR libraries. 
+ + NOTE: This does not track the values of the environment variables, nor + does this track environment variables not explicitly included in the + `all_ray_air_env_vars` allow-list. + + Returns: + bool: True if at least one environment var is supplied by the user. + """ + from ray.air.constants import AIR_ENV_VARS + from ray.train.constants import TRAIN_ENV_VARS + from ray.tune.constants import TUNE_ENV_VARS + + all_ray_air_env_vars = sorted( + set().union(AIR_ENV_VARS, TUNE_ENV_VARS, TRAIN_ENV_VARS) + ) + + user_supplied_env_vars = [] + + for env_var in all_ray_air_env_vars: + if env_var in os.environ: + user_supplied_env_vars.append(env_var) + + if user_supplied_env_vars: + env_vars_str = json.dumps(user_supplied_env_vars) + record_extra_usage_tag(TagKey.AIR_ENV_VARS, env_vars_str) + return True + + return False + + +def tag_air_entrypoint(entrypoint: AirEntrypoint) -> None: + """Records the entrypoint to an AIR training run.""" + assert entrypoint in AirEntrypoint + record_extra_usage_tag(TagKey.AIR_ENTRYPOINT, entrypoint.value) diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/util.py b/.venv/lib/python3.11/site-packages/ray/air/_internal/util.py new file mode 100644 index 0000000000000000000000000000000000000000..ddceba726ee46ea1a1e884e3511430af1e282870 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/_internal/util.py @@ -0,0 +1,134 @@ +import copy +import logging +import os +import queue +import socket +import threading +from contextlib import closing +from typing import Optional + +import numpy as np + +from ray.air.constants import _ERROR_REPORT_TIMEOUT + +logger = logging.getLogger(__name__) + + +def find_free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("", 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + +def is_nan(value): + return np.isnan(value) + + +def is_nan_or_inf(value): + return is_nan(value) or np.isinf(value) + + +class 
class StartTraceback(Exception):
    """These exceptions (and their tracebacks) can be skipped with `skip_exceptions`"""

    pass


class StartTracebackWithWorkerRank(StartTraceback):
    """A ``StartTraceback`` that additionally records which worker raised it."""

    def __init__(self, worker_rank: int) -> None:
        super().__init__()
        self.worker_rank = worker_rank

    def __reduce__(self):
        # Preserve ``worker_rank`` across pickling (e.g. when the exception
        # crosses a process boundary).
        return (self.__class__, (self.worker_rank,))


def skip_exceptions(exc: Optional[Exception]) -> Exception:
    """Skip all contained `StartTracebacks` to reduce traceback output.

    Returns a shallow copy of the exception with all `StartTracebacks` removed.

    If the RAY_AIR_FULL_TRACEBACKS environment variable is set,
    the original exception (not a copy) is returned.
    """
    keep_full = bool(int(os.environ.get("RAY_AIR_FULL_TRACEBACKS", "0")))
    if keep_full:
        # User explicitly asked for the untouched traceback chain.
        return exc

    if isinstance(exc, StartTraceback):
        # Drop this marker exception and continue with whatever caused it.
        return skip_exceptions(exc.__cause__)

    # Shallow copy prevents recursive __cause__/__context__ references.
    shortened = copy.copy(exc).with_traceback(exc.__traceback__)

    # Recurse so nested causes are shortened as well.
    cause = getattr(exc, "__cause__", None)
    if cause:
        shortened.__cause__ = skip_exceptions(cause)

    return shortened


def exception_cause(exc: Optional[Exception]) -> Optional[Exception]:
    """Return ``exc.__cause__``, or ``None`` when there is no exception/cause."""
    if not exc:
        return None
    return getattr(exc, "__cause__", None)


class RunnerThread(threading.Thread):
    """Supervisor thread that runs your script."""

    def __init__(self, *args, error_queue, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        # Queue used to hand exceptions back to the main thread.
        self._error_queue = error_queue
        # Return value of the target, exposed via ``join()``.
        self._ret = None

    def _propagate_exception(self, e: BaseException):
        try:
            # report the error but avoid indefinite blocking which would
            # prevent the exception from being propagated in the unlikely
            # case that something went terribly wrong
            self._error_queue.put(e, block=True, timeout=_ERROR_REPORT_TIMEOUT)
        except queue.Full:
            logger.critical(
                (
                    "Runner Thread was unable to report error to main "
                    "function runner thread. This means a previous error "
                    "was not processed. This should never happen."
                )
            )

    def run(self):
        # Execute the user target; translate "benign" exits into clean
        # termination and forward real errors to the main thread.
        try:
            self._ret = self._target(*self._args, **self._kwargs)
        except StopIteration:
            logger.debug(
                (
                    "Thread runner raised StopIteration. Interpreting it as a "
                    "signal to terminate the thread without error."
                )
            )
        except SystemExit as exit_exc:
            # Do not propagate up for graceful termination.
            if exit_exc.code == 0:
                logger.debug(
                    (
                        "Thread runner raised SystemExit with error code 0. "
                        "Interpreting it as a signal to terminate the thread "
                        "without error."
                    )
                )
            else:
                # If non-zero exit code, then raise exception to main thread.
                self._propagate_exception(exit_exc)
        except BaseException as err:
            # Propagate all other exceptions to the main thread.
            self._propagate_exception(err)

    def join(self, timeout=None):
        """Wait for the thread to finish and return the target's return value."""
        super().join(timeout)
        return self._ret

# NOTE(review): the remainder of this span is git-diff metadata from the
# original patch chunk, preserved here (commented) so the surrounding diff
# stays coherent:
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/examples/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/examples/__init__.py
# new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/__init__.cpython-311.pyc
# new file mode 100644 index 0000000000000000000000000000000000000000..2dc7cbe872fb03edb82aff479e3931718562bfc5
# Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/__init__.cpython-311.pyc differ
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/custom_trainer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/custom_trainer.cpython-311.pyc
# new file mode 100644 index 0000000000000000000000000000000000000000..44013899d4f5d2865c93e4eae9bbe69ce5145e40
# Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/examples/__pycache__/custom_trainer.cpython-311.pyc differ
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/examples/custom_trainer.py b/.venv/lib/python3.11/site-packages/ray/air/examples/custom_trainer.py
# new file mode 100644 index 0000000000000000000000000000000000000000..1ac37ec892300fdf28bd869ce12c2c4a17ccf3b6
# --- /dev/null
# +++ b/.venv/lib/python3.11/site-packages/ray/air/examples/custom_trainer.py
# @@ -0,0 +1,61 @@
# ruff: noqa
# isort: skip_file
# TODO(rliaw): Include this in the docs.
# fmt: off
# __custom_trainer_begin__
import torch

from ray import train
from ray.train.trainer import BaseTrainer


class MyPytorchTrainer(BaseTrainer):
    """Example custom Trainer: fits a one-parameter linear model with SGD."""

    def setup(self):
        # Single linear layer plus its optimizer; created once before training.
        self.model = torch.nn.Linear(1, 1)
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)

    def training_loop(self):
        # You can access any Trainer attributes directly in this method.
        # self.datasets["train"] has already been
        # preprocessed by self.preprocessor
        dataset = self.datasets["train"]

        loss_fn = torch.nn.MSELoss()

        for epoch in range(10):
            epoch_loss = 0
            batch_count = 0
            for batch in dataset.iter_torch_batches(dtypes=torch.float):
                # Compute prediction error
                features = torch.unsqueeze(batch["x"], 1)
                targets = batch["y"]
                batch_loss = loss_fn(self.model(features), targets)

                # Backpropagation
                self.optimizer.zero_grad()
                batch_loss.backward()
                self.optimizer.step()

                epoch_loss += batch_loss.item()
                batch_count += 1
            epoch_loss /= batch_count

            # Use Tune functions to report intermediate
            # results.
            train.report({"loss": epoch_loss, "epoch": epoch})


# __custom_trainer_end__
# fmt: on


# fmt: off
# __custom_trainer_usage_begin__
import ray

train_dataset = ray.data.from_items([{"x": i, "y": i} for i in range(3)])
my_trainer = MyPytorchTrainer(datasets={"train": train_dataset})
result = my_trainer.fit()
# __custom_trainer_usage_end__
# fmt: on

# NOTE(review): the remainder of this span is git-diff metadata plus the new
# execution/_internal/__init__.py from the original patch chunk, preserved so
# the surrounding diff stays coherent:
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__init__.py
# new file mode 100644 index 0000000000000000000000000000000000000000..db48d7854eec8683013753f9d54a667826ee4071
# --- /dev/null
# +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__init__.py
# @@ -0,0 +1,5 @@
from ray.air.execution._internal.actor_manager import RayActorManager
from ray.air.execution._internal.barrier import Barrier
from ray.air.execution._internal.tracked_actor import TrackedActor

__all__ = ["Barrier", "RayActorManager", "TrackedActor"]
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/__init__.cpython-311.pyc
# new file mode 100644 index 0000000000000000000000000000000000000000..6dd5b621d29a90d409ae78bdae0e9ea3030939ec
# Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/__init__.cpython-311.pyc differ
# diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/actor_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/actor_manager.cpython-311.pyc
# new file mode 100644 index 0000000000000000000000000000000000000000..f6bfb54533a59d915130215e572cd1bf334c0cb4
# Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/actor_manager.cpython-311.pyc differ
# diff --git
a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/barrier.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/barrier.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3fc47a728e254ca31d807747ab9cc74d591da9a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/barrier.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/event_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/event_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59bbb107cc3202badc6b86daff2c88d0cab4d21d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/event_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..099b3a74d045ec11f7df8fe27a4f246a32a96d77 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor_task.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor_task.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f394a8492a1c6a38d41f3cbb22fc0221c42361ca Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/__pycache__/tracked_actor_task.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/actor_manager.py 
b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/actor_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..33cf6d9ec9f28ca72197c176c5049c19a7239675 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/actor_manager.py @@ -0,0 +1,894 @@ +import logging +import random +import time +import uuid +from collections import Counter, defaultdict +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union + +import ray +from ray.air.execution._internal.event_manager import RayEventManager +from ray.air.execution._internal.tracked_actor import TrackedActor +from ray.air.execution._internal.tracked_actor_task import TrackedActorTask +from ray.air.execution.resources import ( + AcquiredResources, + ResourceManager, + ResourceRequest, +) +from ray.exceptions import RayActorError, RayTaskError + +logger = logging.getLogger(__name__) + + +class RayActorManager: + """Management class for Ray actors and actor tasks. + + This class provides an event-based management interface for actors, and + actor tasks. + + The manager can be used to start actors, stop actors, and schedule and + track task futures on these actors. + The manager will then invoke callbacks related to the tracked entities. + + For instance, when an actor is added with + :meth:`add_actor() `, + a :ref:`TrackedActor `, + an ``on_result`` callback can be specified that is invoked when the task + successfully resolves, and an ``on_error`` callback will resolve when the + task fails. + + The RayActorManager does not implement any true asynchronous processing. Control + has to be explicitly yielded to the event manager via :meth:`RayActorManager.next`. + Callbacks will only be invoked when control is with the RayActorManager, and + callbacks will always be executed sequentially in order of arriving events. + + Args: + resource_manager: Resource manager used to request resources for the actors. + + Example: + + .. 
code-block:: python + + from ray.air.execution import ResourceRequest + from ray.air.execution._internal import RayActorManager + + actor_manager = RayActorManager() + + # Request an actor + tracked_actor = actor_manager.add_actor( + ActorClass, + kwargs={}, + resource_request=ResourceRequest([{"CPU": 1}]), + on_start=actor_start_callback, + on_stop=actor_stop_callback, + on_error=actor_error_callback + ) + + # Yield control to event manager to start actor + actor_manager.next() + + # Start task on the actor (ActorClass.foo.remote()) + tracked_actor_task = actor_manager.schedule_actor_task( + tracked_actor, + method_name="foo", + on_result=task_result_callback, + on_error=task_error_callback + ) + + # Again yield control to event manager to process task futures + actor_manager.wait() + + """ + + def __init__(self, resource_manager: ResourceManager): + self._resource_manager: ResourceManager = resource_manager + + self._actor_state_events = RayEventManager() + self._actor_task_events = RayEventManager() + + # --- + # Tracked actor futures. + + # This maps TrackedActor objects to their futures. We use this to see if an + # actor has any futures scheduled and to remove them when we terminate an actor. + + # Actors to actor task futures + self._tracked_actors_to_task_futures: Dict[ + TrackedActor, Set[ray.ObjectRef] + ] = defaultdict(set) + + # Actors to actor state futures (start/terminate) + self._tracked_actors_to_state_futures: Dict[ + TrackedActor, Set[ray.ObjectRef] + ] = defaultdict(set) + + # --- + # Pending actors. + # We use three dicts for actors that are requested but not yet started. + + # This dict keeps a list of actors associated with each resource request. + # We use this to start actors in the correct order when their resources + # become available. + self._resource_request_to_pending_actors: Dict[ + ResourceRequest, List[TrackedActor] + ] = defaultdict(list) + + # This dict stores the actor class, kwargs, and resource request of + # pending actors. 
Once the resources are available, we start the remote + # actor class with its args. We need the resource request to cancel it + # if needed. + self._pending_actors_to_attrs: Dict[ + TrackedActor, Tuple[Type, Dict[str, Any], ResourceRequest] + ] = {} + + # This dict keeps track of cached actor tasks. We can't schedule actor + # tasks before the actor is actually scheduled/live. So when the caller + # tries to schedule a task, we cache it here, and schedule it once the + # actor is started. + self._pending_actors_to_enqueued_actor_tasks: Dict[ + TrackedActor, List[Tuple[TrackedActorTask, str, Tuple[Any], Dict[str, Any]]] + ] = defaultdict(list) + + # --- + # Live actors. + # We keep one dict for actors that are currently running and a set of + # actors that we should forcefully kill. + + # This dict associates the TrackedActor object with the Ray actor handle + # and the resources associated to the actor. We use it to schedule the + # actual ray tasks, and to return the resources when the actor stopped. + self._live_actors_to_ray_actors_resources: Dict[ + TrackedActor, Tuple[ray.actor.ActorHandle, AcquiredResources] + ] = {} + self._live_resource_cache: Optional[Dict[str, Any]] = None + + # This dict contains all actors that should be killed (after calling + # `remove_actor()`). Kill requests will be handled in wait(). + self._live_actors_to_kill: Set[TrackedActor] = set() + + # Track failed actors + self._failed_actor_ids: Set[int] = set() + + def next(self, timeout: Optional[Union[int, float]] = None) -> bool: + """Yield control to event manager to await the next event and invoke callbacks. + + Calling this method will wait for up to ``timeout`` seconds for the next + event to arrive. + + When events arrive, callbacks relating to the events will be + invoked. A timeout of ``None`` will block until the next event arrives. 
+ + Note: + If an actor task fails with a ``RayActorError``, this is one event, + but it may trigger _two_ `on_error` callbacks: One for the actor, + and one for the task. + + Note: + The ``timeout`` argument is used for pure waiting time for events. It does + not include time spent on processing callbacks. Depending on the processing + time of the callbacks, it can take much longer for this function to + return than the specified timeout. + + Args: + timeout: Timeout in seconds to wait for next event. + + Returns: + True if at least one event was processed. + + """ + # First issue any pending forceful actor kills + actor_killed = self._try_kill_actor() + + # We always try to start actors as this won't trigger an event callback + self._try_start_actors() + + # If an actor was killed, this was our event, and we return. + if actor_killed: + return True + + # Otherwise, collect all futures and await the next. + resource_futures = self._resource_manager.get_resource_futures() + actor_state_futures = self._actor_state_events.get_futures() + actor_task_futures = self._actor_task_events.get_futures() + + # Shuffle state futures + shuffled_state_futures = list(actor_state_futures) + random.shuffle(shuffled_state_futures) + + # Shuffle task futures + shuffled_task_futures = list(actor_task_futures) + random.shuffle(shuffled_task_futures) + + # Prioritize resource futures over actor state over task futures + all_futures = resource_futures + shuffled_state_futures + shuffled_task_futures + + start_wait = time.monotonic() + ready, _ = ray.wait(all_futures, num_returns=1, timeout=timeout) + + if not ready: + return False + + [future] = ready + + if future in actor_state_futures: + self._actor_state_events.resolve_future(future) + elif future in actor_task_futures: + self._actor_task_events.resolve_future(future) + else: + self._handle_ready_resource_future() + # Ready resource futures don't count as one event as they don't trigger + # any callbacks. 
So we repeat until we hit anything that is not a resource + # future. + time_taken = time.monotonic() - start_wait + return self.next( + timeout=max(1e-9, timeout - time_taken) if timeout is not None else None + ) + + self._try_start_actors() + return True + + def _actor_start_resolved(self, tracked_actor: TrackedActor, future: ray.ObjectRef): + """Callback to be invoked when actor started""" + self._tracked_actors_to_state_futures[tracked_actor].remove(future) + + if tracked_actor._on_start: + tracked_actor._on_start(tracked_actor) + + def _actor_stop_resolved(self, tracked_actor: TrackedActor): + """Callback to be invoked when actor stopped""" + self._cleanup_actor(tracked_actor=tracked_actor) + + if tracked_actor._on_stop: + tracked_actor._on_stop(tracked_actor) + + def _actor_start_failed(self, tracked_actor: TrackedActor, exception: Exception): + """Callback to be invoked when actor start/stop failed""" + self._failed_actor_ids.add(tracked_actor.actor_id) + + self._cleanup_actor(tracked_actor=tracked_actor) + + if tracked_actor._on_error: + tracked_actor._on_error(tracked_actor, exception) + + def _actor_task_failed( + self, tracked_actor_task: TrackedActorTask, exception: Exception + ): + """Handle an actor task future that became ready. 
+ + - On actor error, trigger actor error callback AND error task error callback + - On task error, trigger actor task error callback + - On success, trigger actor task result callback + """ + tracked_actor = tracked_actor_task._tracked_actor + + if isinstance(exception, RayActorError): + self._failed_actor_ids.add(tracked_actor.actor_id) + + # Clean up any references to the actor and its futures + self._cleanup_actor(tracked_actor=tracked_actor) + + # Handle actor state callbacks + if tracked_actor._on_error: + tracked_actor._on_error(tracked_actor, exception) + + # Then trigger actor task error callback + if tracked_actor_task._on_error: + tracked_actor_task._on_error(tracked_actor, exception) + + elif isinstance(exception, RayTaskError): + # Otherwise only the task failed. Invoke callback + if tracked_actor_task._on_error: + tracked_actor_task._on_error(tracked_actor, exception) + else: + raise RuntimeError( + f"Caught unexpected exception: {exception}" + ) from exception + + def _actor_task_resolved(self, tracked_actor_task: TrackedActorTask, result: Any): + tracked_actor = tracked_actor_task._tracked_actor + + # Trigger actor task result callback + if tracked_actor_task._on_result: + tracked_actor_task._on_result(tracked_actor, result) + + def _handle_ready_resource_future(self): + """Handle a resource future that became ready. + + - Update state of the resource manager + - Try to start one actor + """ + # Force resource manager to update internal state + self._resource_manager.update_state() + # We handle resource futures one by one, so only try to start 1 actor at a time + self._try_start_actors(max_actors=1) + + def _try_start_actors(self, max_actors: Optional[int] = None) -> int: + """Try to start up to ``max_actors`` actors. + + This function will iterate through all resource requests we collected for + pending actors. As long as a resource request can be fulfilled (resources + are available), we try to start as many actors as possible. 
+ + This will schedule a `Actor.__ray_ready__()` future which, once resolved, + will trigger the `TrackedActor.on_start` callback. + """ + started_actors = 0 + + # Iterate through all resource requests + for resource_request in self._resource_request_to_pending_actors: + if max_actors is not None and started_actors >= max_actors: + break + + # While we have resources ready and there are actors left to schedule + while ( + self._resource_manager.has_resources_ready(resource_request) + and self._resource_request_to_pending_actors[resource_request] + ): + # Acquire resources for actor + acquired_resources = self._resource_manager.acquire_resources( + resource_request + ) + assert acquired_resources + + # Get tracked actor to start + candidate_actors = self._resource_request_to_pending_actors[ + resource_request + ] + assert candidate_actors + + tracked_actor = candidate_actors.pop(0) + + # Get actor class and arguments + actor_cls, kwargs, _ = self._pending_actors_to_attrs.pop(tracked_actor) + + if not isinstance(actor_cls, ray.actor.ActorClass): + actor_cls = ray.remote(actor_cls) + + # Associate to acquired resources + [remote_actor_cls] = acquired_resources.annotate_remote_entities( + [actor_cls] + ) + + # Start Ray actor + actor = remote_actor_cls.remote(**kwargs) + + # Track + self._live_actors_to_ray_actors_resources[tracked_actor] = ( + actor, + acquired_resources, + ) + self._live_resource_cache = None + + # Schedule ready future + future = actor.__ray_ready__.remote() + + self._tracked_actors_to_state_futures[tracked_actor].add(future) + + # We need to create the callbacks in a function so tracked_actors + # are captured correctly. 
+ def create_callbacks( + tracked_actor: TrackedActor, future: ray.ObjectRef + ): + def on_actor_start(result: Any): + self._actor_start_resolved( + tracked_actor=tracked_actor, future=future + ) + + def on_error(exception: Exception): + self._actor_start_failed( + tracked_actor=tracked_actor, exception=exception + ) + + return on_actor_start, on_error + + on_actor_start, on_error = create_callbacks( + tracked_actor=tracked_actor, future=future + ) + + self._actor_state_events.track_future( + future=future, + on_result=on_actor_start, + on_error=on_error, + ) + + self._enqueue_cached_actor_tasks(tracked_actor=tracked_actor) + + started_actors += 1 + + return started_actors + + def _enqueue_cached_actor_tasks(self, tracked_actor: TrackedActor): + assert tracked_actor in self._live_actors_to_ray_actors_resources + + # Enqueue cached futures + cached_tasks = self._pending_actors_to_enqueued_actor_tasks.pop( + tracked_actor, [] + ) + for tracked_actor_task, method_name, args, kwargs in cached_tasks: + self._schedule_tracked_actor_task( + tracked_actor_task=tracked_actor_task, + method_name=method_name, + args=args, + kwargs=kwargs, + ) + + def _try_kill_actor(self) -> bool: + """Try to kill actor scheduled for termination.""" + if not self._live_actors_to_kill: + return False + + tracked_actor = self._live_actors_to_kill.pop() + + # Remove from tracked actors + ( + ray_actor, + acquired_resources, + ) = self._live_actors_to_ray_actors_resources[tracked_actor] + + # Hard kill if requested + ray.kill(ray_actor) + + self._cleanup_actor_futures(tracked_actor) + + self._actor_stop_resolved(tracked_actor) + + return True + + def _cleanup_actor(self, tracked_actor: TrackedActor): + self._cleanup_actor_futures(tracked_actor) + + # Remove from tracked actors + ( + ray_actor, + acquired_resources, + ) = self._live_actors_to_ray_actors_resources.pop(tracked_actor) + self._live_resource_cache = None + + # Return resources + 
self._resource_manager.free_resources(acquired_resource=acquired_resources) + + @property + def all_actors(self) -> List[TrackedActor]: + """Return all ``TrackedActor`` objects managed by this manager instance.""" + return self.live_actors + self.pending_actors + + @property + def live_actors(self) -> List[TrackedActor]: + """Return all ``TrackedActor`` objects that are currently alive.""" + return list(self._live_actors_to_ray_actors_resources) + + @property + def pending_actors(self) -> List[TrackedActor]: + """Return all ``TrackedActor`` objects that are currently pending.""" + return list(self._pending_actors_to_attrs) + + @property + def num_live_actors(self): + """Return number of started actors.""" + return len(self.live_actors) + + @property + def num_pending_actors(self) -> int: + """Return number of pending (not yet started) actors.""" + return len(self.pending_actors) + + @property + def num_total_actors(self): + """Return number of total actors.""" + return len(self.all_actors) + + @property + def num_actor_tasks(self): + """Return number of pending tasks""" + return self._actor_task_events.num_futures + + def get_live_actors_resources(self): + if self._live_resource_cache: + return self._live_resource_cache + + counter = Counter() + for _, acq in self._live_actors_to_ray_actors_resources.values(): + for bdl in acq.resource_request.bundles: + counter.update(bdl) + self._live_resource_cache = dict(counter) + return self._live_resource_cache + + def add_actor( + self, + cls: Union[Type, ray.actor.ActorClass], + kwargs: Dict[str, Any], + resource_request: ResourceRequest, + *, + on_start: Optional[Callable[[TrackedActor], None]] = None, + on_stop: Optional[Callable[[TrackedActor], None]] = None, + on_error: Optional[Callable[[TrackedActor, Exception], None]] = None, + ) -> TrackedActor: + """Add an actor to be tracked. + + This method will request resources to start the actor. 
Once the resources + are available, the actor will be started and the + :meth:`TrackedActor.on_start + ` callback + will be invoked. + + Args: + cls: Actor class to schedule. + kwargs: Keyword arguments to pass to actor class on construction. + resource_request: Resources required to start the actor. + on_start: Callback to invoke when the actor started. + on_stop: Callback to invoke when the actor stopped. + on_error: Callback to invoke when the actor failed. + + Returns: + Tracked actor object to reference actor in subsequent API calls. + + """ + tracked_actor = TrackedActor( + uuid.uuid4().int, on_start=on_start, on_stop=on_stop, on_error=on_error + ) + + self._pending_actors_to_attrs[tracked_actor] = cls, kwargs, resource_request + self._resource_request_to_pending_actors[resource_request].append(tracked_actor) + + self._resource_manager.request_resources(resource_request=resource_request) + + return tracked_actor + + def remove_actor( + self, + tracked_actor: TrackedActor, + kill: bool = False, + stop_future: Optional[ray.ObjectRef] = None, + ) -> bool: + """Remove a tracked actor. + + If the actor has already been started, this will stop the actor. This will + trigger the :meth:`TrackedActor.on_stop + ` + callback once the actor stopped. + + If the actor has only been requested, but not started, yet, this will cancel + the actor request. This will not trigger any callback. + + If ``kill=True``, this will use ``ray.kill()`` to forcefully terminate the + actor. Otherwise, graceful actor deconstruction will be scheduled after + all currently tracked futures are resolved. + + This method returns a boolean, indicating if a stop future is tracked and + the ``on_stop`` callback will be invoked. If the actor has been alive, + this will be ``True``. If the actor hasn't been scheduled, yet, or failed + (and triggered the ``on_error`` callback), this will be ``False``. + + Args: + tracked_actor: Tracked actor to be removed. 
+ kill: If set, will forcefully terminate the actor instead of gracefully + scheduling termination. + stop_future: If set, use this future to track actor termination. + Otherwise, schedule a ``__ray_terminate__`` future. + + Returns: + Boolean indicating if the actor was previously alive, and thus whether + a callback will be invoked once it is terminated. + + """ + if tracked_actor.actor_id in self._failed_actor_ids: + logger.debug( + f"Tracked actor already failed, no need to remove: {tracked_actor}" + ) + return False + elif tracked_actor in self._live_actors_to_ray_actors_resources: + # Ray actor is running. + + if not kill: + # Schedule __ray_terminate__ future + ray_actor, _ = self._live_actors_to_ray_actors_resources[tracked_actor] + + # Clear state futures here to avoid resolving __ray_ready__ futures + for future in list( + self._tracked_actors_to_state_futures[tracked_actor] + ): + self._actor_state_events.discard_future(future) + self._tracked_actors_to_state_futures[tracked_actor].remove(future) + + # If the __ray_ready__ future hasn't resolved yet, but we already + # scheduled the actor via Actor.remote(), we just want to stop + # it but not trigger any callbacks. This is in accordance with + # the contract defined in the docstring. + tracked_actor._on_start = None + tracked_actor._on_stop = None + tracked_actor._on_error = None + + def on_actor_stop(*args, **kwargs): + self._actor_stop_resolved(tracked_actor=tracked_actor) + + if stop_future: + # If the stop future was schedule via the actor manager, + # discard (track it as state future instead). 
+ self._actor_task_events.discard_future(stop_future) + else: + stop_future = ray_actor.__ray_terminate__.remote() + + self._actor_state_events.track_future( + future=stop_future, + on_result=on_actor_stop, + on_error=on_actor_stop, + ) + + self._tracked_actors_to_state_futures[tracked_actor].add(stop_future) + else: + # kill = True + self._live_actors_to_kill.add(tracked_actor) + + return True + + elif tracked_actor in self._pending_actors_to_attrs: + # Actor is pending, stop + _, _, resource_request = self._pending_actors_to_attrs.pop(tracked_actor) + self._resource_request_to_pending_actors[resource_request].remove( + tracked_actor + ) + self._resource_manager.cancel_resource_request( + resource_request=resource_request + ) + return False + else: + raise ValueError(f"Unknown tracked actor: {tracked_actor}") + + def is_actor_started(self, tracked_actor: TrackedActor) -> bool: + """Returns True if the actor has been started. + + Args: + tracked_actor: Tracked actor object. + """ + return ( + tracked_actor in self._live_actors_to_ray_actors_resources + and tracked_actor.actor_id not in self._failed_actor_ids + ) + + def is_actor_failed(self, tracked_actor: TrackedActor) -> bool: + return tracked_actor.actor_id in self._failed_actor_ids + + def get_actor_resources( + self, tracked_actor: TrackedActor + ) -> Optional[AcquiredResources]: + """Returns the acquired resources of an actor that has been started. + + This will return ``None`` if the actor has not been started, yet. + + Args: + tracked_actor: Tracked actor object. 
+ """ + if not self.is_actor_started(tracked_actor): + return None + + return self._live_actors_to_ray_actors_resources[tracked_actor][1] + + def schedule_actor_task( + self, + tracked_actor: TrackedActor, + method_name: str, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + on_result: Optional[Callable[[TrackedActor, Any], None]] = None, + on_error: Optional[Callable[[TrackedActor, Exception], None]] = None, + _return_future: bool = False, + ) -> Optional[ray.ObjectRef]: + """Schedule and track a task on an actor. + + This method will schedule a remote task ``method_name`` on the + ``tracked_actor``. + + This method accepts two optional callbacks that will be invoked when + their respective events are triggered. + + The ``on_result`` callback is triggered when a task resolves successfully. + It should accept two arguments: The actor for which the + task resolved, and the result received from the remote call. + + The ``on_error`` callback is triggered when a task fails. + It should accept two arguments: The actor for which the + task threw an error, and the exception. + + Args: + tracked_actor: Actor to schedule task on. + method_name: Remote method name to invoke on the actor. If this is + e.g. ``foo``, then ``actor.foo.remote(*args, **kwargs)`` will be + scheduled. + args: Arguments to pass to the task. + kwargs: Keyword arguments to pass to the task. + on_result: Callback to invoke when the task resolves. + on_error: Callback to invoke when the task fails. + + Raises: + ValueError: If the ``tracked_actor`` is not managed by this event manager. 
+ + """ + args = args or tuple() + kwargs = kwargs or {} + + if tracked_actor.actor_id in self._failed_actor_ids: + return + + tracked_actor_task = TrackedActorTask( + tracked_actor=tracked_actor, on_result=on_result, on_error=on_error + ) + + if tracked_actor not in self._live_actors_to_ray_actors_resources: + # Actor is not started, yet + if tracked_actor not in self._pending_actors_to_attrs: + raise ValueError( + f"Tracked actor is not managed by this event manager: " + f"{tracked_actor}" + ) + + # Cache tasks for future execution + self._pending_actors_to_enqueued_actor_tasks[tracked_actor].append( + (tracked_actor_task, method_name, args, kwargs) + ) + else: + res = self._schedule_tracked_actor_task( + tracked_actor_task=tracked_actor_task, + method_name=method_name, + args=args, + kwargs=kwargs, + _return_future=_return_future, + ) + if _return_future: + return res[1] + + def _schedule_tracked_actor_task( + self, + tracked_actor_task: TrackedActorTask, + method_name: str, + *, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + _return_future: bool = False, + ) -> Union[TrackedActorTask, Tuple[TrackedActorTask, ray.ObjectRef]]: + tracked_actor = tracked_actor_task._tracked_actor + ray_actor, _ = self._live_actors_to_ray_actors_resources[tracked_actor] + + try: + remote_fn = getattr(ray_actor, method_name) + except AttributeError as e: + raise AttributeError( + f"Remote function `{method_name}()` does not exist for this actor." 
+ ) from e + + def on_result(result: Any): + self._actor_task_resolved( + tracked_actor_task=tracked_actor_task, result=result + ) + + def on_error(exception: Exception): + self._actor_task_failed( + tracked_actor_task=tracked_actor_task, exception=exception + ) + + future = remote_fn.remote(*args, **kwargs) + + self._actor_task_events.track_future( + future=future, on_result=on_result, on_error=on_error + ) + + self._tracked_actors_to_task_futures[tracked_actor].add(future) + + if _return_future: + return tracked_actor_task, future + + return tracked_actor_task + + def schedule_actor_tasks( + self, + tracked_actors: List[TrackedActor], + method_name: str, + *, + args: Optional[Union[Tuple, List[Tuple]]] = None, + kwargs: Optional[Union[Dict, List[Dict]]] = None, + on_result: Optional[Callable[[TrackedActor, Any], None]] = None, + on_error: Optional[Callable[[TrackedActor, Exception], None]] = None, + ) -> None: + """Schedule and track tasks on a list of actors. + + This method will schedule a remote task ``method_name`` on all + ``tracked_actors``. + + ``args`` and ``kwargs`` can be a single tuple/dict, in which case the same + (keyword) arguments are passed to all actors. If a list is passed instead, + they are mapped to the respective actors. In that case, the list of + (keyword) arguments must be the same length as the list of actors. + + This method accepts two optional callbacks that will be invoked when + their respective events are triggered. + + The ``on_result`` callback is triggered when a task resolves successfully. + It should accept two arguments: The actor for which the + task resolved, and the result received from the remote call. + + The ``on_error`` callback is triggered when a task fails. + It should accept two arguments: The actor for which the + task threw an error, and the exception. + + Args: + tracked_actors: List of actors to schedule tasks on. + method_name: Remote actor method to invoke on the actors. If this is + e.g. 
``foo``, then ``actor.foo.remote(*args, **kwargs)`` will be + scheduled on all actors. + args: Arguments to pass to the task. + kwargs: Keyword arguments to pass to the task. + on_result: Callback to invoke when the task resolves. + on_error: Callback to invoke when the task fails. + + """ + if not isinstance(args, List): + args_list = [args] * len(tracked_actors) + else: + if len(tracked_actors) != len(args): + raise ValueError( + f"Length of args must be the same as tracked_actors " + f"list. Got `len(kwargs)={len(kwargs)}` and " + f"`len(tracked_actors)={len(tracked_actors)}" + ) + args_list = args + + if not isinstance(kwargs, List): + kwargs_list = [kwargs] * len(tracked_actors) + else: + if len(tracked_actors) != len(kwargs): + raise ValueError( + f"Length of kwargs must be the same as tracked_actors " + f"list. Got `len(args)={len(args)}` and " + f"`len(tracked_actors)={len(tracked_actors)}" + ) + kwargs_list = kwargs + + for tracked_actor, args, kwargs in zip(tracked_actors, args_list, kwargs_list): + self.schedule_actor_task( + tracked_actor=tracked_actor, + method_name=method_name, + args=args, + kwargs=kwargs, + on_result=on_result, + on_error=on_error, + ) + + def clear_actor_task_futures(self, tracked_actor: TrackedActor): + """Discard all actor task futures from a tracked actor.""" + futures = self._tracked_actors_to_task_futures.pop(tracked_actor, []) + for future in futures: + self._actor_task_events.discard_future(future) + + def _cleanup_actor_futures(self, tracked_actor: TrackedActor): + # Remove all actor task futures + self.clear_actor_task_futures(tracked_actor=tracked_actor) + + # Remove all actor state futures + futures = self._tracked_actors_to_state_futures.pop(tracked_actor, []) + for future in futures: + self._actor_state_events.discard_future(future) + + def cleanup(self): + for ( + actor, + acquired_resources, + ) in self._live_actors_to_ray_actors_resources.values(): + ray.kill(actor) + 
self._resource_manager.free_resources(acquired_resources) + + for ( + resource_request, + pending_actors, + ) in self._resource_request_to_pending_actors.items(): + for i in range(len(pending_actors)): + self._resource_manager.cancel_resource_request(resource_request) + + self._resource_manager.clear() + + self.__init__(resource_manager=self._resource_manager) diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/barrier.py b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/barrier.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb0c5a2bf55242281090a1cc6bcac9009face52 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/barrier.py @@ -0,0 +1,93 @@ +from typing import Any, Callable, List, Optional, Tuple + + +class Barrier: + """Barrier to collect results and process them in bulk. + + A barrier can be used to collect multiple results and process them in bulk once + a certain count or a timeout is reached. + + For instance, if ``max_results=N``, the ``on_completion`` callback will be + invoked once :meth:`arrive` has been called ``N`` times. + + The completion callback will only be invoked once, even if more results + arrive after completion. The collected results can be resetted + with :meth:`reset`, after which the callback may be invoked again. + + The completion callback should expect one argument, which is the barrier + object that completed. + + Args: + max_results: Maximum number of results to collect before a call to + :meth:`wait` resolves or the :meth:`on_completion` callback is invoked. + on_completion: Callback to invoke when ``max_results`` results + arrived at the barrier. 
+ + """ + + def __init__( + self, + max_results: int, + *, + on_completion: Optional[Callable[["Barrier"], None]] = None, + ): + self._max_results = max_results + + # on_completion callback + self._completed = False + self._on_completion = on_completion + + # Collect received results + self._results: List[Tuple[Any]] = [] + + def arrive(self, *data): + """Notify barrier that a result successfully arrived. + + This will count against the ``max_results`` limit. The received result + will be included in a call to :meth:`get_results`. + + Args: + *data: Result data to be cached. Can be obtained via :meth:`get_results`. + + """ + if len(data) == 1: + data = data[0] + + self._results.append(data) + self._check_completion() + + def _check_completion(self): + if self._completed: + # Already fired completion callback + return + + if self.num_results >= self._max_results: + # Barrier is complete + self._completed = True + + if self._on_completion: + self._on_completion(self) + + @property + def completed(self) -> bool: + """Returns True if the barrier is completed.""" + return self._completed + + @property + def num_results(self) -> int: + """Number of received (successful) results.""" + return len(self._results) + + def get_results(self) -> List[Tuple[Any]]: + """Return list of received results.""" + return self._results + + def reset(self) -> None: + """Reset barrier, removing all received results. + + Resetting the barrier will reset the completion status. When ``max_results`` + is set and enough new events arrive after resetting, the + :meth:`on_completion` callback will be invoked again. 
+ """ + self._completed = False + self._results = [] diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..fbea6f72d2925b5435931e58203d206bd0009709 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py @@ -0,0 +1,148 @@ +import random +from typing import Any, Callable, Dict, Iterable, Optional, Set, Tuple, Union + +import ray + +_ResultCallback = Callable[[Any], None] +_ErrorCallback = Callable[[Exception], None] + + +class RayEventManager: + """Event manager for Ray futures. + + The event manager can be used to track futures and invoke callbacks when + they resolve. + + Futures are tracked with :meth:`track_future`. Future can then be awaited with + :meth:`wait`. When futures successfully resolve, they trigger an optional + ``on_result`` callback that can be passed to :meth:`track_future`. If they + fail, they trigger an optional ``on_error`` callback. + + Args: + shuffle_futures: If True, futures will be shuffled before awaited. This + will avoid implicit prioritization of futures within Ray. + """ + + def __init__(self, shuffle_futures: bool = True): + self._shuffle_futures = shuffle_futures + + # Map of futures to callbacks (result, error) + self._tracked_futures: Dict[ + ray.ObjectRef, Tuple[Optional[_ResultCallback], Optional[_ErrorCallback]] + ] = {} + + def track_future( + self, + future: ray.ObjectRef, + on_result: Optional[_ResultCallback] = None, + on_error: Optional[_ErrorCallback] = None, + ): + """Track a single future and invoke callbacks on resolution. + + Control has to be yielded to the event manager for the callbacks to + be invoked, either via :meth:`wait` or via :meth:`resolve_future`. + + Args: + future: Ray future to await. + on_result: Callback to invoke when the future resolves successfully. 
+ on_error: Callback to invoke when the future fails. + + """ + self._tracked_futures[future] = (on_result, on_error) + + def track_futures( + self, + futures: Iterable[ray.ObjectRef], + on_result: Optional[_ResultCallback] = None, + on_error: Optional[_ErrorCallback] = None, + ): + """Track multiple futures and invoke callbacks on resolution. + + Control has to be yielded to the event manager for the callbacks to + be invoked, either via :meth:`wait` or via :meth:`resolve_future`. + + Args: + futures: Ray futures to await. + on_result: Callback to invoke when the future resolves successfully. + on_error: Callback to invoke when the future fails. + + """ + for future in futures: + self.track_future(future, on_result=on_result, on_error=on_error) + + def discard_future(self, future: ray.ObjectRef): + """Remove future from tracking. + + The future will not be awaited anymore, and it will not trigger any callbacks. + + Args: + future: Ray futures to discard. + """ + self._tracked_futures.pop(future, None) + + def get_futures(self) -> Set[ray.ObjectRef]: + """Get futures tracked by the event manager.""" + return set(self._tracked_futures) + + @property + def num_futures(self) -> int: + return len(self._tracked_futures) + + def resolve_future(self, future: ray.ObjectRef): + """Resolve a single future. + + This method will block until the future is available. It will then + trigger the callback associated to the future and the event (success + or error), if specified. + + Args: + future: Ray future to resolve. 
+ + """ + try: + on_result, on_error = self._tracked_futures.pop(future) + except KeyError as e: + raise ValueError( + f"Future {future} is not tracked by this RayEventManager" + ) from e + + try: + result = ray.get(future) + except Exception as e: + if on_error: + on_error(e) + else: + raise e + else: + if on_result: + on_result(result) + + def wait( + self, + timeout: Optional[Union[float, int]] = None, + num_results: Optional[int] = 1, + ): + """Wait up to ``timeout`` seconds for ``num_results`` futures to resolve. + + If ``timeout=None``, this method will block until all `num_results`` futures + resolve. If ``num_results=None``, this method will await all tracked futures. + + For every future that resolves, the respective associated callbacks will be + invoked. + + Args: + timeout: Timeout in second to wait for futures to resolve. + num_results: Number of futures to await. If ``None``, will wait for + all tracked futures to resolve. + + """ + futures = list(self.get_futures()) + + if self._shuffle_futures: + random.shuffle(futures) + + num_results = num_results or len(futures) + + ready, _ = ray.wait(list(futures), timeout=timeout, num_returns=num_results) + for future in ready: + self.resolve_future(future) diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor.py b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..91abd0556e40b197ea82ac8ac55b53613ef635cc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor.py @@ -0,0 +1,54 @@ +from typing import Callable, Optional + + +class TrackedActor: + """Actor tracked by an actor manager. + + This object is used to reference a Ray actor on an actor manager + + Existence of this object does not mean that the Ray actor has already been started. + Actor state can be inquired from the actor manager tracking the Ray actor. 
+ + Note: + Objects of this class are returned by the :class:`RayActorManager`. + This class should not be instantiated manually. + + Attributes: + actor_id: ID for identification of the actor within the actor manager. This + ID is not related to the Ray actor ID. + + """ + + def __init__( + self, + actor_id: int, + on_start: Optional[Callable[["TrackedActor"], None]] = None, + on_stop: Optional[Callable[["TrackedActor"], None]] = None, + on_error: Optional[Callable[["TrackedActor", Exception], None]] = None, + ): + self.actor_id = actor_id + self._on_start = on_start + self._on_stop = on_stop + self._on_error = on_error + + def set_on_start(self, on_start: Optional[Callable[["TrackedActor"], None]]): + self._on_start = on_start + + def set_on_stop(self, on_stop: Optional[Callable[["TrackedActor"], None]]): + self._on_stop = on_stop + + def set_on_error( + self, on_error: Optional[Callable[["TrackedActor", Exception], None]] + ): + self._on_error = on_error + + def __repr__(self): + return f"" + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return False + return self.actor_id == other.actor_id + + def __hash__(self): + return hash(self.actor_id) diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor_task.py b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor_task.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb21cbfb514924d8d38267c4e63321a27f9cb41 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/execution/_internal/tracked_actor_task.py @@ -0,0 +1,42 @@ +from typing import Any, Callable, Optional + +from ray.air.execution._internal.tracked_actor import TrackedActor + + +class TrackedActorTask: + """Actor task tracked by a Ray event manager. + + This container class is used to define callbacks to be invoked when + the task resolves, errors, or times out. + + Note: + Objects of this class are returned by the :class:`RayActorManager`. 
+ This class should not be instantiated manually. + + Args: + tracked_actor: Tracked actor object this task is scheduled on. + on_result: Callback to invoke when the task resolves. + on_error: Callback to invoke when the task fails. + + Example: + + .. code-block:: python + + tracked_futures = actor_manager.schedule_actor_tasks( + actor_manager.live_actors, + "foo", + on_result=lambda actor, result: print(result) + ) + + """ + + def __init__( + self, + tracked_actor: TrackedActor, + on_result: Optional[Callable[[TrackedActor, Any], None]] = None, + on_error: Optional[Callable[[TrackedActor, Exception], None]] = None, + ): + self._tracked_actor = tracked_actor + + self._on_result = on_result + self._on_error = on_error diff --git a/.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c7d19fa7e99717c8266007283cab61e6451247f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/execution/resources/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/integrations/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..507e8fddcef9ea6d84b614f984dc10e705870ed8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/comet.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/comet.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04097100733b87d5d71e6be7a3352c3d94876c81 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/comet.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/keras.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/keras.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46e40396d1dbc6b2bea6b807a3cfcf77669ddee3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/keras.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/mlflow.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/mlflow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df86b9b7eba23af99ffefaba6f98c960f4b953e3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/mlflow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/wandb.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/wandb.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8582e379cc95b9cbe9bf4af6023bafa4219ed67d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/integrations/__pycache__/wandb.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/keras.py b/.venv/lib/python3.11/site-packages/ray/air/integrations/keras.py new file mode 100644 index 0000000000000000000000000000000000000000..677213e73dedf7fc16a6199c7ac133b35c4b3827 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/integrations/keras.py @@ -0,0 +1,185 @@ +import shutil 
+from typing import Dict, List, Optional, Union + +from tensorflow.keras.callbacks import Callback as KerasCallback + +import ray +from ray.train.tensorflow import TensorflowCheckpoint +from ray.util.annotations import PublicAPI + + +class _Callback(KerasCallback): + """Base class for Air's Keras callbacks.""" + + _allowed = [ + "epoch_begin", + "epoch_end", + "train_batch_begin", + "train_batch_end", + "test_batch_begin", + "test_batch_end", + "predict_batch_begin", + "predict_batch_end", + "train_begin", + "train_end", + "test_begin", + "test_end", + "predict_begin", + "predict_end", + ] + + def __init__(self, on: Union[str, List[str]] = "validation_end"): + super(_Callback, self).__init__() + + if not isinstance(on, list): + on = [on] + if any(w not in self._allowed for w in on): + raise ValueError( + "Invalid trigger time selected: {}. Must be one of {}".format( + on, self._allowed + ) + ) + self._on = on + + def _handle(self, logs: Dict, when: str): + raise NotImplementedError + + def on_epoch_begin(self, epoch, logs=None): + if "epoch_begin" in self._on: + self._handle(logs, "epoch_begin") + + def on_epoch_end(self, epoch, logs=None): + if "epoch_end" in self._on: + self._handle(logs, "epoch_end") + + def on_train_batch_begin(self, batch, logs=None): + if "train_batch_begin" in self._on: + self._handle(logs, "train_batch_begin") + + def on_train_batch_end(self, batch, logs=None): + if "train_batch_end" in self._on: + self._handle(logs, "train_batch_end") + + def on_test_batch_begin(self, batch, logs=None): + if "test_batch_begin" in self._on: + self._handle(logs, "test_batch_begin") + + def on_test_batch_end(self, batch, logs=None): + if "test_batch_end" in self._on: + self._handle(logs, "test_batch_end") + + def on_predict_batch_begin(self, batch, logs=None): + if "predict_batch_begin" in self._on: + self._handle(logs, "predict_batch_begin") + + def on_predict_batch_end(self, batch, logs=None): + if "predict_batch_end" in self._on: + self._handle(logs, 
"predict_batch_end") + + def on_train_begin(self, logs=None): + if "train_begin" in self._on: + self._handle(logs, "train_begin") + + def on_train_end(self, logs=None): + if "train_end" in self._on: + self._handle(logs, "train_end") + + def on_test_begin(self, logs=None): + if "test_begin" in self._on: + self._handle(logs, "test_begin") + + def on_test_end(self, logs=None): + if "test_end" in self._on: + self._handle(logs, "test_end") + + def on_predict_begin(self, logs=None): + if "predict_begin" in self._on: + self._handle(logs, "predict_begin") + + def on_predict_end(self, logs=None): + if "predict_end" in self._on: + self._handle(logs, "predict_end") + + +@PublicAPI(stability="alpha") +class ReportCheckpointCallback(_Callback): + """Keras callback for Ray Train reporting and checkpointing. + + .. note:: + Metrics are always reported with checkpoints, even if the event isn't specified + in ``report_metrics_on``. + + Example: + .. code-block:: python + + ############# Using it in TrainSession ############### + from ray.air.integrations.keras import ReportCheckpointCallback + def train_loop_per_worker(): + strategy = tf.distribute.MultiWorkerMirroredStrategy() + with strategy.scope(): + model = build_model() + + model.fit(dataset_shard, callbacks=[ReportCheckpointCallback()]) + + Args: + metrics: Metrics to report. If this is a list, each item describes + the metric key reported to Keras, and it's reported under the + same name. If this is a dict, each key is the name reported + and the respective value is the metric key reported to Keras. + If this is None, all Keras logs are reported. + report_metrics_on: When to report metrics. Must be one of + the Keras event hooks (less the ``on_``), e.g. + "train_start" or "predict_end". Defaults to "epoch_end". + checkpoint_on: When to save checkpoints. Must be one of the Keras event hooks + (less the ``on_``), e.g. "train_start" or "predict_end". Defaults to + "epoch_end". 
+ """ + + def __init__( + self, + checkpoint_on: Union[str, List[str]] = "epoch_end", + report_metrics_on: Union[str, List[str]] = "epoch_end", + metrics: Optional[Union[str, List[str], Dict[str, str]]] = None, + ): + if isinstance(checkpoint_on, str): + checkpoint_on = [checkpoint_on] + if isinstance(report_metrics_on, str): + report_metrics_on = [report_metrics_on] + + on = list(set(checkpoint_on + report_metrics_on)) + super().__init__(on=on) + + self._checkpoint_on: List[str] = checkpoint_on + self._report_metrics_on: List[str] = report_metrics_on + self._metrics = metrics + + def _handle(self, logs: Dict, when: str): + assert when in self._checkpoint_on or when in self._report_metrics_on + + metrics = self._get_reported_metrics(logs) + + should_checkpoint = when in self._checkpoint_on + if should_checkpoint: + checkpoint = TensorflowCheckpoint.from_model(self.model) + ray.train.report(metrics, checkpoint=checkpoint) + # Clean up temporary checkpoint + shutil.rmtree(checkpoint.path, ignore_errors=True) + else: + ray.train.report(metrics, checkpoint=None) + + def _get_reported_metrics(self, logs: Dict) -> Dict: + assert isinstance(self._metrics, (type(None), str, list, dict)) + + if self._metrics is None: + reported_metrics = logs + elif isinstance(self._metrics, str): + reported_metrics = {self._metrics: logs[self._metrics]} + elif isinstance(self._metrics, list): + reported_metrics = {metric: logs[metric] for metric in self._metrics} + elif isinstance(self._metrics, dict): + reported_metrics = { + key: logs[metric] for key, metric in self._metrics.items() + } + + assert isinstance(reported_metrics, dict) + return reported_metrics diff --git a/.venv/lib/python3.11/site-packages/ray/air/integrations/mlflow.py b/.venv/lib/python3.11/site-packages/ray/air/integrations/mlflow.py new file mode 100644 index 0000000000000000000000000000000000000000..21bface0c910ad6ee03f078ec3010c9e0a433726 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/air/integrations/mlflow.py @@ -0,0 +1,325 @@ +import logging +from types import ModuleType +from typing import Dict, Optional, Union + +import ray +from ray.air._internal import usage as air_usage +from ray.air._internal.mlflow import _MLflowLoggerUtil +from ray.air.constants import TRAINING_ITERATION +from ray.tune.experiment import Trial +from ray.tune.logger import LoggerCallback +from ray.tune.result import TIMESTEPS_TOTAL +from ray.util.annotations import PublicAPI + +try: + import mlflow +except ImportError: + mlflow = None + + +logger = logging.getLogger(__name__) + + +class _NoopModule: + def __getattr__(self, item): + return _NoopModule() + + def __call__(self, *args, **kwargs): + return None + + +@PublicAPI(stability="alpha") +def setup_mlflow( + config: Optional[Dict] = None, + tracking_uri: Optional[str] = None, + registry_uri: Optional[str] = None, + experiment_id: Optional[str] = None, + experiment_name: Optional[str] = None, + tracking_token: Optional[str] = None, + artifact_location: Optional[str] = None, + run_name: Optional[str] = None, + create_experiment_if_not_exists: bool = False, + tags: Optional[Dict] = None, + rank_zero_only: bool = True, +) -> Union[ModuleType, _NoopModule]: + """Set up a MLflow session. + + This function can be used to initialize an MLflow session in a + (distributed) training or tuning run. The session will be created on the trainable. + + By default, the MLflow experiment ID is the Ray trial ID and the + MLlflow experiment name is the Ray trial name. These settings can be overwritten by + passing the respective keyword arguments. + + The ``config`` dict is automatically logged as the run parameters (excluding the + mlflow settings). + + In distributed training with Ray Train, only the zero-rank worker will initialize + mlflow. All other workers will return a noop client, so that logging is not + duplicated in a distributed run. 
This can be disabled by passing + ``rank_zero_only=False``, which will then initialize mlflow in every training + worker. + + This function will return the ``mlflow`` module or a noop module for + non-rank zero workers ``if rank_zero_only=True``. By using + ``mlflow = setup_mlflow(config)`` you can ensure that only the rank zero worker + calls the mlflow API. + + Args: + config: Configuration dict to be logged to mlflow as parameters. + tracking_uri: The tracking URI for MLflow tracking. If using + Tune in a multi-node setting, make sure to use a remote server for + tracking. + registry_uri: The registry URI for the MLflow model registry. + experiment_id: The id of an already created MLflow experiment. + All logs from all trials in ``tune.Tuner()`` will be reported to this + experiment. If this is not provided or the experiment with this + id does not exist, you must provide an``experiment_name``. This + parameter takes precedence over ``experiment_name``. + experiment_name: The name of an already existing MLflow + experiment. All logs from all trials in ``tune.Tuner()`` will be + reported to this experiment. If this is not provided, you must + provide a valid ``experiment_id``. + tracking_token: A token to use for HTTP authentication when + logging to a remote tracking server. This is useful when you + want to log to a Databricks server, for example. This value will + be used to set the MLFLOW_TRACKING_TOKEN environment variable on + all the remote training processes. + artifact_location: The location to store run artifacts. + If not provided, MLFlow picks an appropriate default. + Ignored if experiment already exists. + run_name: Name of the new MLflow run that will be created. + If not set, will default to the ``experiment_name``. + create_experiment_if_not_exists: Whether to create an + experiment with the provided name if it does not already + exist. Defaults to False. + tags: Tags to set for the new run. 
+ rank_zero_only: If True, will return an initialized session only for the + rank 0 worker in distributed training. If False, will initialize a + session for all workers. Defaults to True. + + Example: + + Per default, you can just call ``setup_mlflow`` and continue to use + MLflow like you would normally do: + + .. code-block:: python + + from ray.air.integrations.mlflow import setup_mlflow + + def training_loop(config): + mlflow = setup_mlflow(config) + # ... + mlflow.log_metric(key="loss", val=0.123, step=0) + + In distributed data parallel training, you can utilize the return value of + ``setup_mlflow``. This will make sure it is only invoked on the first worker + in distributed training runs. + + .. code-block:: python + + from ray.air.integrations.mlflow import setup_mlflow + + def training_loop(config): + mlflow = setup_mlflow(config) + # ... + mlflow.log_metric(key="loss", val=0.123, step=0) + + + You can also use MlFlow's autologging feature if using a training + framework like Pytorch Lightning, XGBoost, etc. More information can be + found here + (https://mlflow.org/docs/latest/tracking.html#automatic-logging). + + .. code-block:: python + + from ray.air.integrations.mlflow import setup_mlflow + + def train_fn(config): + mlflow = setup_mlflow(config) + mlflow.autolog() + xgboost_results = xgb.train(config, ...) 
class MLflowLoggerCallback(LoggerCallback):
    """MLflow Logger to automatically log Tune results and config to MLflow.

    MLflow (https://mlflow.org) Tracking is an open source library for
    recording and querying experiments. This Ray Tune ``LoggerCallback``
    sends information (config parameters, training results & metrics,
    and artifacts) to MLflow for automatic experiment tracking.

    Keep in mind that the callback will open an MLflow session on the driver and
    not on the trainable. Therefore, it is not possible to call MLflow functions
    like ``mlflow.log_figure()`` inside the trainable as there is no MLflow session
    on the trainable. For more fine grained control, use
    :func:`ray.air.integrations.mlflow.setup_mlflow`.

    Args:
        tracking_uri: The tracking URI for where to manage experiments
            and runs. This can either be a local file path or a remote server.
            This arg gets passed directly to mlflow
            initialization. When using Tune in a multi-node setting, make sure
            to set this to a remote server and not a local file path.
        registry_uri: The registry URI that gets passed directly to
            mlflow initialization.
        experiment_name: The experiment name to use for this Tune run.
            If the experiment with the name already exists with MLflow,
            it will be reused. If not, a new experiment will be created with
            that name.
        tags: An optional dictionary of string keys and values to set
            as tags on the run
        tracking_token: Tracking token used to authenticate with MLflow.
        save_artifact: If set to True, automatically save the entire
            contents of the Tune local_dir as an artifact to the
            corresponding run in MLflow.

    Example:

    .. code-block:: python

        from ray.air.integrations.mlflow import MLflowLoggerCallback

        tags = { "user_name" : "John",
                 "git_commit_hash" : "abc123"}

        tune.run(
            train_fn,
            config={
                # define search space here
                "parameter_1": tune.choice([1, 2, 3]),
                "parameter_2": tune.choice([4, 5, 6]),
            },
            callbacks=[MLflowLoggerCallback(
                experiment_name="experiment1",
                tags=tags,
                save_artifact=True)])

    """

    def __init__(
        self,
        tracking_uri: Optional[str] = None,
        *,
        registry_uri: Optional[str] = None,
        experiment_name: Optional[str] = None,
        tags: Optional[Dict] = None,
        tracking_token: Optional[str] = None,
        save_artifact: bool = False,
    ):
        # Connection/identification settings are stored as-is; the actual
        # MLflow client is only configured later, in `setup()`.
        self.tracking_uri = tracking_uri
        self.registry_uri = registry_uri
        self.experiment_name = experiment_name
        self.tags = tags
        self.tracking_token = tracking_token
        self.should_save_artifact = save_artifact

        # Thin wrapper around the mlflow client; shared by all trials.
        self.mlflow_util = _MLflowLoggerUtil()

        # A file-backed tracking store on the client machine would not be
        # visible to a remote Ray cluster, hence the warning.
        if ray.util.client.ray.is_connected():
            logger.warning(
                "When using MLflowLoggerCallback with Ray Client, "
                "it is recommended to use a remote tracking "
                "server. If you are using a MLflow tracking server "
                "backed by the local filesystem, then it must be "
                "setup on the server side and not on the client "
                "side."
            )

    def setup(self, *args, **kwargs):
        # Setup the mlflow logging util.
        self.mlflow_util.setup_mlflow(
            tracking_uri=self.tracking_uri,
            registry_uri=self.registry_uri,
            experiment_name=self.experiment_name,
            tracking_token=self.tracking_token,
        )

        if self.tags is None:
            # Create empty dictionary for tags if not given explicitly
            self.tags = {}

        # Maps each Trial to its MLflow run id (populated in
        # `log_trial_start`).
        self._trial_runs = {}

    def log_trial_start(self, trial: "Trial"):
        # Create run if not already exists. A trial may be restarted, in which
        # case the existing run is reused.
        if trial not in self._trial_runs:

            # Set trial name in tags
            tags = self.tags.copy()
            tags["trial_name"] = str(trial)

            run = self.mlflow_util.start_run(tags=tags, run_name=str(trial))
            self._trial_runs[trial] = run.info.run_id

        run_id = self._trial_runs[trial]

        # Log the config parameters.
        config = trial.config
        self.mlflow_util.log_params(run_id=run_id, params_to_log=config)

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        # Prefer the timesteps counter as the step axis; fall back to the
        # training iteration when it is absent or falsy.
        step = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]
        run_id = self._trial_runs[trial]
        self.mlflow_util.log_metrics(run_id=run_id, metrics_to_log=result, step=step)

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        run_id = self._trial_runs[trial]

        # Log the artifact if set_artifact is set to True.
        if self.should_save_artifact:
            self.mlflow_util.save_artifacts(run_id=run_id, dir=trial.local_path)

        # Stop the run once trial finishes.
        status = "FINISHED" if not failed else "FAILED"
        self.mlflow_util.end_run(run_id=run_id, status=status)
DEFAULT_SYNC_TIMEOUT +from ray.tune.experiment import Trial +from ray.tune.logger import LoggerCallback +from ray.tune.utils import flatten_dict +from ray.util import PublicAPI +from ray.util.queue import Queue + +try: + import wandb + from wandb.sdk.data_types.base_types.wb_value import WBValue + from wandb.sdk.data_types.image import Image + from wandb.sdk.data_types.video import Video + from wandb.sdk.lib.disabled import RunDisabled + from wandb.util import json_dumps_safer + from wandb.wandb_run import Run +except ImportError: + wandb = json_dumps_safer = Run = RunDisabled = WBValue = None + + +WANDB_ENV_VAR = "WANDB_API_KEY" +WANDB_PROJECT_ENV_VAR = "WANDB_PROJECT_NAME" +WANDB_GROUP_ENV_VAR = "WANDB_GROUP_NAME" +WANDB_MODE_ENV_VAR = "WANDB_MODE" +# Hook that is invoked before wandb.init in the setup method of WandbLoggerCallback +# to populate the API key if it isn't already set when initializing the callback. +# It doesn't take in any arguments and returns the W&B API key. +# Example: "your.module.wandb_setup_api_key_hook". +WANDB_SETUP_API_KEY_HOOK = "WANDB_SETUP_API_KEY_HOOK" +# Hook that is invoked before wandb.init in the setup method of WandbLoggerCallback +# to populate environment variables to specify the location +# (project and group) of the W&B run. +# It doesn't take in any arguments and doesn't return anything, but it does populate +# WANDB_PROJECT_NAME and WANDB_GROUP_NAME. +# Example: "your.module.wandb_populate_run_location_hook". +WANDB_POPULATE_RUN_LOCATION_HOOK = "WANDB_POPULATE_RUN_LOCATION_HOOK" +# Hook that is invoked after running wandb.init in WandbLoggerCallback +# to process information about the W&B run. +# It takes in a W&B run object and doesn't return anything. +# Example: "your.module.wandb_process_run_info_hook". 
WANDB_PROCESS_RUN_INFO_HOOK = "WANDB_PROCESS_RUN_INFO_HOOK"


@PublicAPI(stability="alpha")
def setup_wandb(
    config: Optional[Dict] = None,
    api_key: Optional[str] = None,
    api_key_file: Optional[str] = None,
    rank_zero_only: bool = True,
    **kwargs,
) -> Union[Run, RunDisabled]:
    """Set up a Weights & Biases session.

    This function can be used to initialize a Weights & Biases session in a
    (distributed) training or tuning run.

    By default, the run ID is the trial ID, the run name is the trial name, and
    the run group is the experiment name. These settings can be overwritten by
    passing the respective arguments as ``kwargs``, which will be passed to
    ``wandb.init()``.

    In distributed training with Ray Train, only the zero-rank worker will initialize
    wandb. All other workers will return a disabled run object, so that logging is not
    duplicated in a distributed run. This can be disabled by passing
    ``rank_zero_only=False``, which will then initialize wandb in every training
    worker.

    The ``config`` argument will be passed to Weights and Biases and will be logged
    as the run configuration.

    If no API key or key file are passed, wandb will try to authenticate
    using locally stored credentials, created for instance by running ``wandb login``.

    Keyword arguments passed to ``setup_wandb()`` will be passed to
    ``wandb.init()`` and take precedence over any potential default settings.

    Args:
        config: Configuration dict to be logged to Weights and Biases. Can contain
            arguments for ``wandb.init()`` as well as authentication information.
        api_key: API key to use for authentication with Weights and Biases.
        api_key_file: File pointing to API key for with Weights and Biases.
        rank_zero_only: If True, will return an initialized session only for the
            rank 0 worker in distributed training. If False, will initialize a
            session for all workers.
        kwargs: Passed to ``wandb.init()``.

    Example:

    .. code-block:: python

        from ray.air.integrations.wandb import setup_wandb

        def training_loop(config):
            wandb = setup_wandb(config)
            # ...
            wandb.log({"loss": 0.123})

    """
    if not wandb:
        raise RuntimeError(
            "Wandb was not found - please install with `pip install wandb`"
        )

    default_trial_id = None
    default_trial_name = None
    default_experiment_name = None

    # Do a try-catch here if we are not in a train session
    session = get_session()
    # BUGFIX: only *non*-zero ranks get a disabled run. `world_rank is None`
    # means a non-distributed (Tune-only) session, which must also initialize
    # wandb. The previous condition (`in (None, 0)`) was inverted and disabled
    # logging on exactly the worker that was supposed to log.
    if session and rank_zero_only and session.world_rank not in (None, 0):
        return RunDisabled()

    if session:
        default_trial_id = session.trial_id
        default_trial_name = session.trial_name
        default_experiment_name = session.experiment_name

    # Default init kwargs
    wandb_init_kwargs = {
        "trial_id": kwargs.get("trial_id") or default_trial_id,
        "trial_name": kwargs.get("trial_name") or default_trial_name,
        "group": kwargs.get("group") or default_experiment_name,
    }
    # Passed kwargs take precedence over default kwargs
    wandb_init_kwargs.update(kwargs)

    return _setup_wandb(
        config=config, api_key=api_key, api_key_file=api_key_file, **wandb_init_kwargs
    )
set any other parameters in the call to wandb.init) + wandb_init_kwargs.update(**kwargs) + + # On windows, we can't fork + if os.name == "nt": + os.environ["WANDB_START_METHOD"] = "thread" + else: + os.environ["WANDB_START_METHOD"] = "fork" + + _wandb = _wandb or wandb + + run = _wandb.init(**wandb_init_kwargs) + _run_wandb_process_run_info_hook(run) + + # Record `setup_wandb` usage when everything has setup successfully. + air_usage.tag_setup_wandb() + + return run + + +def _is_allowed_type(obj): + """Return True if type is allowed for logging to wandb""" + if isinstance(obj, np.ndarray) and obj.size == 1: + return isinstance(obj.item(), Number) + if isinstance(obj, Sequence) and len(obj) > 0: + return isinstance(obj[0], (Image, Video, WBValue)) + return isinstance(obj, (Number, WBValue)) + + +def _clean_log(obj: Any): + # Fixes https://github.com/ray-project/ray/issues/10631 + if isinstance(obj, dict): + return {k: _clean_log(v) for k, v in obj.items()} + elif isinstance(obj, (list, set)): + return [_clean_log(v) for v in obj] + elif isinstance(obj, tuple): + return tuple(_clean_log(v) for v in obj) + elif isinstance(obj, np.ndarray) and obj.ndim == 3: + # Must be single image (H, W, C). + return Image(obj) + elif isinstance(obj, np.ndarray) and obj.ndim == 4: + # Must be batch of images (N >= 1, H, W, C). + return ( + _clean_log([Image(v) for v in obj]) if obj.shape[0] > 1 else Image(obj[0]) + ) + elif isinstance(obj, np.ndarray) and obj.ndim == 5: + # Must be batch of videos (N >= 1, T, C, W, H). + return ( + _clean_log([Video(v) for v in obj]) if obj.shape[0] > 1 else Video(obj[0]) + ) + elif _is_allowed_type(obj): + return obj + + # Else + + try: + # This is what wandb uses internally. If we cannot dump + # an object using this method, wandb will raise an exception. + json_dumps_safer(obj) + + # This is probably unnecessary, but left here to be extra sure. 
def _get_wandb_project(project: Optional[str] = None) -> Optional[str]:
    """Resolve the W&B project name.

    Resolution order: the explicit ``project`` argument, then the
    ``WANDB_PROJECT_NAME`` environment variable — optionally populated on
    demand by the external ``WANDB_POPULATE_RUN_LOCATION_HOOK`` hook.
    """
    if project:
        return project

    populate_hook = os.environ.get(WANDB_POPULATE_RUN_LOCATION_HOOK)
    if not os.environ.get(WANDB_PROJECT_ENV_VAR) and populate_hook:
        # No project anywhere yet: let the external hook populate
        # WANDB_PROJECT_NAME / WANDB_GROUP_NAME. Hook failures are logged
        # but never fatal.
        try:
            _load_class(populate_hook)()
        except Exception as e:
            logger.exception(
                f"Error executing {WANDB_POPULATE_RUN_LOCATION_HOOK} to "
                f"populate {WANDB_PROJECT_ENV_VAR} and {WANDB_GROUP_ENV_VAR}: {e}",
                exc_info=e,
            )

    # Re-read after the hook had a chance to run; fall back to the original
    # (falsy) argument so e.g. an empty string passes through unchanged.
    env_project = os.environ.get(WANDB_PROJECT_ENV_VAR)
    return env_project if env_project else project
def _set_api_key(api_key_file: Optional[str] = None, api_key: Optional[str] = None):
    """Resolve a W&B API key and export it via the ``WANDB_API_KEY`` env var.

    The order of fetching the API key is:
        1) From `api_key` or `api_key_file` arguments
        2) From WANDB_API_KEY environment variables
        3) User already logged in to W&B (wandb.api.api_key set)
        4) From external hook WANDB_SETUP_API_KEY_HOOK

    Raises:
        ValueError: If both ``api_key_file`` and ``api_key`` are given, or if
            no key could be resolved from any of the sources above.
    """
    # Offline/disabled mode needs no credentials at all.
    if os.environ.get(WANDB_MODE_ENV_VAR) in {"offline", "disabled"}:
        return

    if api_key_file:
        if api_key:
            raise ValueError("Both WandB `api_key_file` and `api_key` set.")
        # Only the first line of the key file is used.
        with open(api_key_file, "rt") as fp:
            api_key = fp.readline().strip()

    if not api_key and not os.environ.get(WANDB_ENV_VAR):
        # Check if user is already logged into wandb.
        try:
            # NOTE(review): `ensure_configured` is a wandb-internal API that
            # newer wandb versions may not expose — the AttributeError
            # fallthrough below covers that case.
            wandb.ensure_configured()
            if wandb.api.api_key:
                logger.info("Already logged into W&B.")
                return
        except AttributeError:
            pass
        # Try to get API key from external hook
        if WANDB_SETUP_API_KEY_HOOK in os.environ:
            try:
                api_key = _load_class(os.environ[WANDB_SETUP_API_KEY_HOOK])()
            except Exception as e:
                logger.exception(
                    f"Error executing {WANDB_SETUP_API_KEY_HOOK} to setup API key: {e}",
                    exc_info=e,
                )
    # Export the resolved key for wandb (and any child processes) to pick up;
    # fail loudly if nothing was found anywhere.
    if api_key:
        os.environ[WANDB_ENV_VAR] = api_key
    elif not os.environ.get(WANDB_ENV_VAR):
        raise ValueError(
            "No WandB API key found. Either set the {} environment "
            "variable, pass `api_key` or `api_key_file` to the"
            "`WandbLoggerCallback` class as arguments, "
            "or run `wandb login` from the command line".format(WANDB_ENV_VAR)
        )
class _WandbLoggingActor:
    """
    Wandb assumes that each trial's information should be logged from a
    separate process. We use Ray actors as forking multiprocessing
    processes is not supported by Ray and spawn processes run into pickling
    problems.

    We use a queue for the driver to communicate with the logging process.
    The queue accepts the following items:

    - If it's a dict, it is assumed to be a result and will be logged using
      ``wandb.log()``
    - If it's a checkpoint object, it will be saved using ``wandb.log_artifact()``.
    """

    def __init__(
        self,
        logdir: str,
        queue: Queue,
        exclude: List[str],
        to_config: List[str],
        *args,
        **kwargs,
    ):
        # Import inside the actor process so the module is loaded where it
        # will actually be used.
        import wandb

        self._wandb = wandb

        # wandb writes its run files relative to the cwd; pin it to the
        # trial's log directory.
        os.chdir(logdir)
        self.queue = queue
        self._exclude = set(exclude)
        self._to_config = set(to_config)
        # args/kwargs are forwarded verbatim to `wandb.init()` in `run()`.
        self.args = args
        self.kwargs = kwargs

        self._trial_name = self.kwargs.get("name", "unknown")
        self._logdir = logdir

    def run(self):
        # Since we're running in a separate process already, use threads.
        os.environ["WANDB_START_METHOD"] = "thread"
        run = self._wandb.init(*self.args, **self.kwargs)
        run.config.trial_log_path = self._logdir

        _run_wandb_process_run_info_hook(run)

        # Drain the queue until the driver signals END; each item is either
        # a result dict or a checkpoint path.
        while True:
            item_type, item_content = self.queue.get()
            if item_type == _QueueItem.END:
                break

            if item_type == _QueueItem.CHECKPOINT:
                self._handle_checkpoint(item_content)
                continue

            assert item_type == _QueueItem.RESULT
            log, config_update = self._handle_result(item_content)
            try:
                self._wandb.config.update(config_update, allow_val_change=True)
                self._wandb.log(log, step=log.get(TRAINING_ITERATION))
            except urllib.error.HTTPError as e:
                # Ignore HTTPError. Missing a few data points is not a
                # big issue, as long as things eventually recover.
                logger.warning("Failed to log result to w&b: {}".format(str(e)))
        # Flush and close the wandb run once the driver signalled END.
        self._wandb.finish()

    def _handle_checkpoint(self, checkpoint_path: str):
        # Upload the whole checkpoint directory as a single model artifact,
        # named after the trial.
        artifact = self._wandb.Artifact(
            name=f"checkpoint_{self._trial_name}", type="model"
        )
        artifact.add_dir(checkpoint_path)
        self._wandb.log_artifact(artifact)

    def _handle_result(self, result: Dict) -> Tuple[Dict, Dict]:
        """Split a flattened result dict into (metrics to log, config updates).

        Keys matching `_exclude` are dropped; keys matching `_to_config` go to
        `wandb.config`; everything else is logged if its type is loggable.
        """
        config_update = result.get("config", {}).copy()
        log = {}
        flat_result = flatten_dict(result, delimiter="/")

        for k, v in flat_result.items():
            # Match either the exact key or any key nested under it ("item/…").
            if any(k.startswith(item + "/") or k == item for item in self._exclude):
                continue
            elif any(k.startswith(item + "/") or k == item for item in self._to_config):
                config_update[k] = v
            elif not _is_allowed_type(v):
                continue
            else:
                log[k] = v

        config_update.pop("callbacks", None)  # Remove callbacks
        return log, config_update
@PublicAPI(stability="alpha")
class WandbLoggerCallback(LoggerCallback):
    """WandbLoggerCallback

    Weights and biases (https://www.wandb.ai/) is a tool for experiment
    tracking, model optimization, and dataset versioning. This Ray Tune
    ``LoggerCallback`` sends metrics to Wandb for automatic tracking and
    visualization.

    Example:

        .. testcode::

            import random

            from ray import train, tune
            from ray.train import RunConfig
            from ray.air.integrations.wandb import WandbLoggerCallback


            def train_func(config):
                offset = random.random() / 5
                for epoch in range(2, config["epochs"]):
                    acc = 1 - (2 + config["lr"]) ** -epoch - random.random() / epoch - offset
                    loss = (2 + config["lr"]) ** -epoch + random.random() / epoch + offset
                    train.report({"acc": acc, "loss": loss})


            tuner = tune.Tuner(
                train_func,
                param_space={
                    "lr": tune.grid_search([0.001, 0.01, 0.1, 1.0]),
                    "epochs": 10,
                },
                run_config=RunConfig(
                    callbacks=[WandbLoggerCallback(project="Optimization_Project")]
                ),
            )
            results = tuner.fit()

        .. testoutput::
            :hide:

            ...

    Args:
        project: Name of the Wandb project. Mandatory.
        group: Name of the Wandb group. Defaults to the trainable
            name.
        api_key_file: Path to file containing the Wandb API KEY. This
            file only needs to be present on the node running the Tune script
            if using the WandbLogger.
        api_key: Wandb API Key. Alternative to setting ``api_key_file``.
        excludes: List of metrics and config that should be excluded from
            the log.
        log_config: Boolean indicating if the ``config`` parameter of
            the ``results`` dict should be logged. This makes sense if
            parameters will change during training, e.g. with
            PopulationBasedTraining. Defaults to False.
        upload_checkpoints: If ``True``, model checkpoints will be uploaded to
            Wandb as artifacts. Defaults to ``False``.
        **kwargs: The keyword arguments will be passed to ``wandb.init()``.

    Wandb's ``group``, ``run_id`` and ``run_name`` are automatically selected
    by Tune, but can be overwritten by filling out the respective configuration
    values.

    Please see here for all other valid configuration settings:
    https://docs.wandb.ai/library/init
    """  # noqa: E501

    # Do not log these result keys
    _exclude_results = ["done", "should_checkpoint"]

    AUTO_CONFIG_KEYS = [
        "trial_id",
        "experiment_tag",
        "node_ip",
        "experiment_id",
        "hostname",
        "pid",
        "date",
    ]
    """Results that are saved with `wandb.config` instead of `wandb.log`."""

    # Actor class used for per-trial logging; overridable in subclasses/tests.
    _logger_actor_cls = _WandbLoggingActor

    def __init__(
        self,
        project: Optional[str] = None,
        group: Optional[str] = None,
        api_key_file: Optional[str] = None,
        api_key: Optional[str] = None,
        excludes: Optional[List[str]] = None,
        log_config: bool = False,
        upload_checkpoints: bool = False,
        save_checkpoints: bool = False,
        upload_timeout: int = DEFAULT_SYNC_TIMEOUT,
        **kwargs,
    ):
        if not wandb:
            raise RuntimeError(
                "Wandb was not found - please install with `pip install wandb`"
            )

        # Deprecated alias: `save_checkpoints` maps onto `upload_checkpoints`.
        if save_checkpoints:
            warnings.warn(
                "`save_checkpoints` is deprecated. Use `upload_checkpoints` instead.",
                DeprecationWarning,
            )
            upload_checkpoints = save_checkpoints

        self.project = project
        self.group = group
        self.api_key_path = api_key_file
        self.api_key = api_key
        self.excludes = excludes or []
        self.log_config = log_config
        self.upload_checkpoints = upload_checkpoints
        self._upload_timeout = upload_timeout
        # Extra kwargs are forwarded to `wandb.init()` per trial.
        self.kwargs = kwargs

        # Lazily-built `ray.remote`-wrapped actor class (see
        # `_start_logging_actor`).
        self._remote_logger_class = None

        # Per-trial bookkeeping: logging actor, its `run()` future, and the
        # queue used to ship results/checkpoints to it.
        self._trial_logging_actors: Dict[
            "Trial", ray.actor.ActorHandle[_WandbLoggingActor]
        ] = {}
        self._trial_logging_futures: Dict["Trial", ray.ObjectRef] = {}
        self._logging_future_to_trial: Dict[ray.ObjectRef, "Trial"] = {}
        self._trial_queues: Dict["Trial", Queue] = {}

    def setup(self, *args, **kwargs):
        # Resolve credentials and project/group before any trial starts.
        self.api_key_file = (
            os.path.expanduser(self.api_key_path) if self.api_key_path else None
        )
        _set_api_key(self.api_key_file, self.api_key)

        self.project = _get_wandb_project(self.project)
        if not self.project:
            raise ValueError(
                "Please pass the project name as argument or through "
                f"the {WANDB_PROJECT_ENV_VAR} environment variable."
            )
        if not self.group and os.environ.get(WANDB_GROUP_ENV_VAR):
            self.group = os.environ.get(WANDB_GROUP_ENV_VAR)

    def log_trial_start(self, trial: "Trial"):
        config = trial.config.copy()

        config.pop("callbacks", None)  # Remove callbacks

        exclude_results = self._exclude_results.copy()

        # Additional excludes
        exclude_results += self.excludes

        # Log config keys on each result?
        if not self.log_config:
            exclude_results += ["config"]

        # Fill trial ID and name
        trial_id = trial.trial_id if trial else None
        trial_name = str(trial) if trial else None

        # Project name for Wandb
        wandb_project = self.project

        # Grouping
        wandb_group = self.group or trial.experiment_dir_name if trial else None

        # remove unpickleable items!
        config = _clean_log(config)
        config = {
            key: value for key, value in config.items() if key not in self.excludes
        }

        wandb_init_kwargs = dict(
            id=trial_id,
            name=trial_name,
            resume=False,
            reinit=True,
            allow_val_change=True,
            group=wandb_group,
            project=wandb_project,
            config=config,
        )
        # User-supplied kwargs take precedence over the defaults above.
        wandb_init_kwargs.update(self.kwargs)

        self._start_logging_actor(trial, exclude_results, **wandb_init_kwargs)

    def _start_logging_actor(
        self, trial: "Trial", exclude_results: List[str], **wandb_init_kwargs
    ):
        # Reuse actor if one already exists.
        # This can happen if the trial is restarted.
        if trial in self._trial_logging_futures:
            return

        if not self._remote_logger_class:
            env_vars = {}
            # API key env variable is not set if authenticating through `wandb login`
            if WANDB_ENV_VAR in os.environ:
                env_vars[WANDB_ENV_VAR] = os.environ[WANDB_ENV_VAR]
            # Pin the actor to the driver node and let Ray restart it
            # indefinitely if it crashes.
            self._remote_logger_class = ray.remote(
                num_cpus=0,
                **_force_on_current_node(),
                runtime_env={"env_vars": env_vars},
                max_restarts=-1,
                max_task_retries=-1,
            )(self._logger_actor_cls)

        self._trial_queues[trial] = Queue(
            actor_options={
                "num_cpus": 0,
                **_force_on_current_node(),
                "max_restarts": -1,
                "max_task_retries": -1,
            }
        )
        self._trial_logging_actors[trial] = self._remote_logger_class.remote(
            logdir=trial.local_path,
            queue=self._trial_queues[trial],
            exclude=exclude_results,
            to_config=self.AUTO_CONFIG_KEYS,
            **wandb_init_kwargs,
        )
        # The future resolves when the actor's queue loop exits (END item).
        logging_future = self._trial_logging_actors[trial].run.remote()
        self._trial_logging_futures[trial] = logging_future
        self._logging_future_to_trial[logging_future] = trial

    def _signal_logging_actor_stop(self, trial: "Trial"):
        self._trial_queues[trial].put((_QueueItem.END, None))

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        if trial not in self._trial_logging_actors:
            self.log_trial_start(trial)

        result = _clean_log(result)
        self._trial_queues[trial].put((_QueueItem.RESULT, result))

    def log_trial_save(self, trial: "Trial"):
        if self.upload_checkpoints and trial.checkpoint:
            checkpoint_root = None
            # Only local-filesystem checkpoints can be uploaded as artifacts.
            if isinstance(trial.checkpoint.filesystem, pyarrow.fs.LocalFileSystem):
                checkpoint_root = trial.checkpoint.path

            if checkpoint_root:
                self._trial_queues[trial].put((_QueueItem.CHECKPOINT, checkpoint_root))

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        self._signal_logging_actor_stop(trial=trial)
        self._cleanup_logging_actors()

    def _cleanup_logging_actor(self, trial: "Trial"):
        # Drop all bookkeeping for the trial and kill its logging actor.
        del self._trial_queues[trial]
        del self._trial_logging_futures[trial]
        ray.kill(self._trial_logging_actors[trial])
        del self._trial_logging_actors[trial]

    def _cleanup_logging_actors(self, timeout: int = 0, kill_on_timeout: bool = False):
        """Clean up logging actors that have finished uploading to wandb.
        Waits for `timeout` seconds to collect finished logging actors.

        Args:
            timeout: The number of seconds to wait. Defaults to 0 to clean up
                any immediate logging actors during the run.
                This is set to a timeout threshold to wait for pending uploads
                on experiment end.
            kill_on_timeout: Whether or not to kill and cleanup the logging actor if
                it hasn't finished within the timeout.
        """

        futures = list(self._trial_logging_futures.values())
        done, remaining = ray.wait(futures, num_returns=len(futures), timeout=timeout)
        for ready_future in done:
            finished_trial = self._logging_future_to_trial.pop(ready_future)
            self._cleanup_logging_actor(finished_trial)

        if kill_on_timeout:
            # Force-clean actors that did not finish uploading in time.
            for remaining_future in remaining:
                trial = self._logging_future_to_trial.pop(remaining_future)
                self._cleanup_logging_actor(trial)

    def on_experiment_end(self, trials: List["Trial"], **info):
        """Wait for the actors to finish their call to `wandb.finish`.
        This includes uploading all logs + artifacts to wandb."""
        self._cleanup_logging_actors(timeout=self._upload_timeout, kill_on_timeout=True)

    def __del__(self):
        # Best-effort shutdown: signal and reap any remaining actors, then
        # drop all references so Ray can garbage-collect them.
        if ray.is_initialized():
            for trial in list(self._trial_logging_actors):
                self._signal_logging_actor_stop(trial=trial)

            self._cleanup_logging_actors(timeout=2, kill_on_timeout=True)

        self._trial_logging_actors = {}
        self._trial_logging_futures = {}
        self._logging_future_to_trial = {}
        self._trial_queues = {}
0000000000000000000000000000000000000000..fc2fd249edb52d95064eb53d94ee88391800a9df Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/node.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/torch_dist.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/torch_dist.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e097adb5d1d0365a54d74232c893eb7256d3bfe9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/torch_dist.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/transform_pyarrow.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/transform_pyarrow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ebd6add7f93acc364b04a4bd86256167599bf15 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/__pycache__/transform_pyarrow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/check_ingest.py b/.venv/lib/python3.11/site-packages/ray/air/util/check_ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..8f43ee4cf6abbba124493f5efea604f3e710d77b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/check_ingest.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python + +import sys +import time +from typing import Optional + +import numpy as np + +import ray +from ray import train +from ray.air.config import DatasetConfig, ScalingConfig +from ray.data import DataIterator, Dataset, Preprocessor +from ray.train import DataConfig +from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.util.annotations import Deprecated, DeveloperAPI + +MAKE_LOCAL_DATA_ITERATOR_DEPRECATION_MSG = """ +make_local_dataset_iterator is deprecated. 
@DeveloperAPI
class DummyTrainer(DataParallelTrainer):
    """A Trainer that does nothing except read the data for a given number of epochs.

    It prints out as much debugging statistics as possible.

    This is useful for debugging data ingest problem. This trainer supports normal
    scaling options same as any other Trainer (e.g., num_workers, use_gpu).

    Args:
        scaling_config: Configuration for how to scale training. This is the same
            as for :class:`~ray.train.base_trainer.BaseTrainer`.
        num_epochs: How many many times to iterate through the datasets for.
        prefetch_batches: The number of batches to prefetch ahead of the
            current block during the scan. This is the same as
            :meth:`~ray.data.Dataset.iter_batches`
    """

    def __init__(
        self,
        *args,
        scaling_config: Optional[ScalingConfig] = None,
        num_epochs: int = 1,
        prefetch_batches: int = 1,
        batch_size: Optional[int] = 4096,
        **kwargs,
    ):
        # Default to a single worker when no scaling config is given.
        if not scaling_config:
            scaling_config = ScalingConfig(num_workers=1)
        super().__init__(
            train_loop_per_worker=DummyTrainer.make_train_loop(
                num_epochs, prefetch_batches, batch_size
            ),
            *args,
            scaling_config=scaling_config,
            **kwargs,
        )

    @staticmethod
    def make_train_loop(
        num_epochs: int,
        prefetch_batches: int,
        batch_size: Optional[int],
    ):
        """Make a debug train loop that runs for the given amount of epochs."""

        def train_loop_per_worker():
            import pandas as pd

            rank = train.get_context().get_world_rank()
            data_shard = train.get_dataset_shard("train")
            start = time.perf_counter()
            epochs_read, batches_read, bytes_read = 0, 0, 0
            # Per-batch wait times; summarized as P50/P95/Max at the end.
            batch_delays = []

            print("Starting train loop on worker", rank)
            for epoch in range(num_epochs):
                epochs_read += 1
                batch_start = time.perf_counter()
                for batch in data_shard.iter_batches(
                    prefetch_batches=prefetch_batches,
                    batch_size=batch_size,
                ):
                    batch_delay = time.perf_counter() - batch_start
                    batch_delays.append(batch_delay)
                    batches_read += 1
                    # Estimate the batch's byte size per batch format.
                    if isinstance(batch, pd.DataFrame):
                        bytes_read += int(
                            batch.memory_usage(index=True, deep=True).sum()
                        )
                    elif isinstance(batch, np.ndarray):
                        bytes_read += batch.nbytes
                    elif isinstance(batch, dict):
                        for arr in batch.values():
                            bytes_read += arr.nbytes
                    else:
                        # NOTE: This isn't recursive and will just return the size of
                        # the object pointers if list of non-primitive types.
                        bytes_read += sys.getsizeof(batch)
                    train.report(
                        dict(
                            bytes_read=bytes_read,
                            batches_read=batches_read,
                            epochs_read=epochs_read,
                            batch_delay=batch_delay,
                        )
                    )
                    batch_start = time.perf_counter()
            delta = time.perf_counter() - start

            print("Time to read all data", delta, "seconds")
            print(
                "P50/P95/Max batch delay (s)",
                np.quantile(batch_delays, 0.5),
                np.quantile(batch_delays, 0.95),
                np.max(batch_delays),
            )
            print("Num epochs read", epochs_read)
            print("Num batches read", batches_read)
            print("Num bytes read", round(bytes_read / (1024 * 1024), 2), "MiB")
            print(
                "Mean throughput", round(bytes_read / (1024 * 1024) / delta, 2), "MiB/s"
            )

            # Only rank 0 prints the (shard-local) ingest statistics.
            if rank == 0:
                print("Ingest stats from rank=0:\n\n{}".format(data_shard.stats()))

        return train_loop_per_worker
+ """ + raise DeprecationWarning(MAKE_LOCAL_DATA_ITERATOR_DEPRECATION_MSG) + + +if __name__ == "__main__": + + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--num-epochs", "-e", type=int, default=1, help="Number of epochs to read." + ) + parser.add_argument( + "--prefetch-batches", + "-b", + type=int, + default=1, + help="Number of batches to prefetch when reading data.", + ) + + args = parser.parse_args() + + # Generate a synthetic dataset of ~10GiB of float64 data. The dataset is sharded + # into 100 blocks (override_num_blocks=100). + ds = ray.data.range_tensor(50000, shape=(80, 80, 4), override_num_blocks=100) + + # An example preprocessing chain that just scales all values by 4.0 in two stages. + ds = ds.map_batches(lambda df: df * 2, batch_format="pandas") + ds = ds.map_batches(lambda df: df * 2, batch_format="pandas") + + # Setup the dummy trainer that prints ingest stats. + # Run and print ingest stats. + trainer = DummyTrainer( + scaling_config=ScalingConfig(num_workers=1, use_gpu=False), + datasets={"train": ds}, + num_epochs=args.num_epochs, + prefetch_batches=args.prefetch_batches, + dataset_config=DataConfig(), + batch_size=None, + ) + print("Dataset config", trainer.get_dataset_config()) + trainer.fit() + + # Print memory stats (you can also use "ray memory --stats-only" to monitor this + # during the middle of the run. 
+ try: + print( + "Memory stats at end of ingest:\n\n{}".format( + ray._private.internal_api.memory_summary(stats_only=True) + ) + ) + except Exception: + print("Error getting Ray memory stats") diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/data_batch_conversion.py b/.venv/lib/python3.11/site-packages/ray/air/util/data_batch_conversion.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf69b4b93989f8d341d2d5a3fa9423435602aa5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/data_batch_conversion.py @@ -0,0 +1,353 @@ +import warnings +from enum import Enum +from typing import TYPE_CHECKING, Dict, List, Union + +import numpy as np + +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.data_batch_type import DataBatchType +from ray.util.annotations import Deprecated, DeveloperAPI + +if TYPE_CHECKING: + import pandas as pd + +# TODO: Consolidate data conversion edges for arrow bug workaround. +try: + import pyarrow +except ImportError: + pyarrow = None + +# Lazy import to avoid ray init failures without pandas installed and allow +# dataset to import modules in this file. +_pandas = None + + +def _lazy_import_pandas(): + global _pandas + if _pandas is None: + import pandas + + _pandas = pandas + return _pandas + + +@DeveloperAPI +class BatchFormat(str, Enum): + PANDAS = "pandas" + # TODO: Remove once Arrow is deprecated as user facing batch format + ARROW = "arrow" + NUMPY = "numpy" # Either a single numpy array or a Dict of numpy arrays. + + +@DeveloperAPI +class BlockFormat(str, Enum): + """Internal Dataset block format enum.""" + + PANDAS = "pandas" + ARROW = "arrow" + SIMPLE = "simple" + + +def _convert_batch_type_to_pandas( + data: DataBatchType, + cast_tensor_columns: bool = False, +) -> "pd.DataFrame": + """Convert the provided data to a Pandas DataFrame. + + Args: + data: Data of type DataBatchType + cast_tensor_columns: Whether tensor columns should be cast to NumPy ndarrays. 
+ + Returns: + A pandas Dataframe representation of the input data. + + """ + pd = _lazy_import_pandas() + + if isinstance(data, np.ndarray): + data = pd.DataFrame({TENSOR_COLUMN_NAME: _ndarray_to_column(data)}) + elif isinstance(data, dict): + tensor_dict = {} + for col_name, col in data.items(): + if not isinstance(col, np.ndarray): + raise ValueError( + "All values in the provided dict must be of type " + f"np.ndarray. Found type {type(col)} for key {col_name} " + f"instead." + ) + tensor_dict[col_name] = _ndarray_to_column(col) + data = pd.DataFrame(tensor_dict) + elif pyarrow is not None and isinstance(data, pyarrow.Table): + data = data.to_pandas() + elif not isinstance(data, pd.DataFrame): + raise ValueError( + f"Received data of type: {type(data)}, but expected it to be one " + f"of {DataBatchType}" + ) + if cast_tensor_columns: + data = _cast_tensor_columns_to_ndarrays(data) + return data + + +def _convert_pandas_to_batch_type( + data: "pd.DataFrame", + type: BatchFormat, + cast_tensor_columns: bool = False, +) -> DataBatchType: + """Convert the provided Pandas dataframe to the provided ``type``. + + Args: + data: A Pandas DataFrame + type: The specific ``BatchFormat`` to convert to. + cast_tensor_columns: Whether tensor columns should be cast to our tensor + extension type. + + Returns: + The input data represented with the provided type. + """ + if cast_tensor_columns: + data = _cast_ndarray_columns_to_tensor_extension(data) + if type == BatchFormat.PANDAS: + return data + + elif type == BatchFormat.NUMPY: + if len(data.columns) == 1: + # If just a single column, return as a single numpy array. + return data.iloc[:, 0].to_numpy() + else: + # Else return as a dict of numpy arrays. + output_dict = {} + for column in data: + output_dict[column] = data[column].to_numpy() + return output_dict + + elif type == BatchFormat.ARROW: + if not pyarrow: + raise ValueError( + "Attempted to convert data to Pyarrow Table but Pyarrow " + "is not installed. 
Please do `pip install pyarrow` to " + "install Pyarrow." + ) + return pyarrow.Table.from_pandas(data) + + else: + raise ValueError( + f"Received type {type}, but expected it to be one of {DataBatchType}" + ) + + +@Deprecated +def convert_batch_type_to_pandas( + data: DataBatchType, + cast_tensor_columns: bool = False, +): + """Convert the provided data to a Pandas DataFrame. + + This API is deprecated from Ray 2.4. + + Args: + data: Data of type DataBatchType + cast_tensor_columns: Whether tensor columns should be cast to NumPy ndarrays. + + Returns: + A pandas Dataframe representation of the input data. + + """ + warnings.warn( + "`convert_batch_type_to_pandas` is deprecated as a developer API " + "starting from Ray 2.4. All batch format conversions should be " + "done manually instead of relying on this API.", + PendingDeprecationWarning, + ) + return _convert_batch_type_to_pandas( + data=data, cast_tensor_columns=cast_tensor_columns + ) + + +@Deprecated +def convert_pandas_to_batch_type( + data: "pd.DataFrame", + type: BatchFormat, + cast_tensor_columns: bool = False, +): + """Convert the provided Pandas dataframe to the provided ``type``. + + Args: + data: A Pandas DataFrame + type: The specific ``BatchFormat`` to convert to. + cast_tensor_columns: Whether tensor columns should be cast to our tensor + extension type. + + Returns: + The input data represented with the provided type. + """ + warnings.warn( + "`convert_pandas_to_batch_type` is deprecated as a developer API " + "starting from Ray 2.4. All batch format conversions should be " + "done manually instead of relying on this API.", + PendingDeprecationWarning, + ) + return _convert_pandas_to_batch_type( + data=data, type=type, cast_tensor_columns=cast_tensor_columns + ) + + +def _convert_batch_type_to_numpy( + data: DataBatchType, +) -> Union[np.ndarray, Dict[str, np.ndarray]]: + """Convert the provided data to a NumPy ndarray or dict of ndarrays. 
+ + Args: + data: Data of type DataBatchType + + Returns: + A numpy representation of the input data. + """ + pd = _lazy_import_pandas() + + if isinstance(data, np.ndarray): + return data + elif isinstance(data, dict): + for col_name, col in data.items(): + if not isinstance(col, np.ndarray): + raise ValueError( + "All values in the provided dict must be of type " + f"np.ndarray. Found type {type(col)} for key {col_name} " + f"instead." + ) + return data + elif pyarrow is not None and isinstance(data, pyarrow.Table): + from ray.air.util.tensor_extensions.arrow import ( + get_arrow_extension_fixed_shape_tensor_types, + ) + from ray.data._internal.arrow_ops import transform_pyarrow + + column_values_ndarrays = [] + + for col in data.columns: + # Combine columnar values arrays to make these contiguous + # (making them compatible with numpy format) + combined_array = transform_pyarrow.combine_chunked_array(col) + + column_values_ndarrays.append( + transform_pyarrow.to_numpy(combined_array, zero_copy_only=False) + ) + + arrow_fixed_shape_tensor_types = get_arrow_extension_fixed_shape_tensor_types() + + # NOTE: This branch is here for backwards-compatibility + if data.column_names == [TENSOR_COLUMN_NAME] and ( + isinstance(data.schema.types[0], arrow_fixed_shape_tensor_types) + ): + return column_values_ndarrays[0] + + return dict(zip(data.column_names, column_values_ndarrays)) + elif isinstance(data, pd.DataFrame): + return _convert_pandas_to_batch_type(data, BatchFormat.NUMPY) + else: + raise ValueError( + f"Received data of type: {type(data)}, but expected it to be one " + f"of {DataBatchType}" + ) + + +def _ndarray_to_column(arr: np.ndarray) -> Union["pd.Series", List[np.ndarray]]: + """Convert a NumPy ndarray into an appropriate column format for insertion into a + pandas DataFrame. + + If conversion to a pandas Series fails (e.g. if the ndarray is multi-dimensional), + fall back to a list of NumPy ndarrays. 
+ """ + pd = _lazy_import_pandas() + try: + # Try to convert to Series, falling back to a list conversion if this fails + # (e.g. if the ndarray is multi-dimensional). + return pd.Series(arr) + except ValueError: + return list(arr) + + +def _unwrap_ndarray_object_type_if_needed(arr: np.ndarray) -> np.ndarray: + """Unwrap an object-dtyped NumPy ndarray containing ndarray pointers into a single + contiguous ndarray, if needed/possible. + """ + if arr.dtype.type is np.object_: + try: + # Try to convert the NumPy ndarray to a non-object dtype. + arr = np.array([np.asarray(v) for v in arr]) + except Exception: + # This may fail if the subndarrays are of heterogeneous shape + pass + return arr + + +def _cast_ndarray_columns_to_tensor_extension(df: "pd.DataFrame") -> "pd.DataFrame": + """ + Cast all NumPy ndarray columns in df to our tensor extension type, TensorArray. + """ + pd = _lazy_import_pandas() + try: + SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning + except AttributeError: + # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. + SettingWithCopyWarning = pd.errors.SettingWithCopyWarning + + from ray.air.util.tensor_extensions.pandas import ( + TensorArray, + column_needs_tensor_extension, + ) + + # Try to convert any ndarray columns to TensorArray columns. + # TODO(Clark): Once Pandas supports registering extension types for type + # inference on construction, implement as much for NumPy ndarrays and remove + # this. See https://github.com/pandas-dev/pandas/issues/41848 + # TODO(Clark): Optimize this with propagated DataFrame metadata containing a list of + # column names containing tensor columns, to make this an O(# of tensor columns) + # check rather than the current O(# of columns) check. + for col_name, col in df.items(): + if column_needs_tensor_extension(col): + try: + # Suppress Pandas warnings: + # https://github.com/ray-project/ray/issues/29270 + # We actually want in-place operations so we surpress this warning. 
+ # https://stackoverflow.com/a/74193599 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + warnings.simplefilter("ignore", category=SettingWithCopyWarning) + df[col_name] = TensorArray(col) + except Exception as e: + raise ValueError( + f"Tried to cast column {col_name} to the TensorArray tensor " + "extension type but the conversion failed. To disable " + "automatic casting to this tensor extension, set " + "ctx = DataContext.get_current(); " + "ctx.enable_tensor_extension_casting = False." + ) from e + return df + + +def _cast_tensor_columns_to_ndarrays(df: "pd.DataFrame") -> "pd.DataFrame": + """Cast all tensor extension columns in df to NumPy ndarrays.""" + pd = _lazy_import_pandas() + try: + SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning + except AttributeError: + # SettingWithCopyWarning was moved to pd.errors in Pandas 1.5.0. + SettingWithCopyWarning = pd.errors.SettingWithCopyWarning + from ray.air.util.tensor_extensions.pandas import TensorDtype + + # Try to convert any tensor extension columns to ndarray columns. + # TODO(Clark): Optimize this with propagated DataFrame metadata containing a list of + # column names containing tensor columns, to make this an O(# of tensor columns) + # check rather than the current O(# of columns) check. + for col_name, col in df.items(): + if isinstance(col.dtype, TensorDtype): + # Suppress Pandas warnings: + # https://github.com/ray-project/ray/issues/29270 + # We actually want in-place operations so we surpress this warning. 
+ # https://stackoverflow.com/a/74193599 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=FutureWarning) + warnings.simplefilter("ignore", category=SettingWithCopyWarning) + df[col_name] = list(col.to_numpy()) + return df diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/node.py b/.venv/lib/python3.11/site-packages/ray/air/util/node.py new file mode 100644 index 0000000000000000000000000000000000000000..60ea3558878f3b220f1ef97210193d5b0e626ef4 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/node.py @@ -0,0 +1,69 @@ +from typing import Dict, Optional, Union + +import ray + + +def _get_node_id_from_node_ip(node_ip: str) -> Optional[str]: + """Returns the node ID for the first alive node with the input IP.""" + for node in ray.nodes(): + if node["Alive"] and node["NodeManagerAddress"] == node_ip: + return node["NodeID"] + + return None + + +def _force_on_node( + node_id: str, + remote_func_or_actor_class: Optional[ + Union[ray.remote_function.RemoteFunction, ray.actor.ActorClass] + ] = None, +) -> Union[Union[ray.remote_function.RemoteFunction, ray.actor.ActorClass], Dict]: + """Schedule a remote function or actor class on a given node. + + Args: + node_id: The node to schedule on. + remote_func_or_actor_class: A Ray remote function or actor class + to schedule on the input node. If None, this function will directly + return the options dict to pass to another remote function or actor class + as remote options. + Returns: + The provided remote function or actor class, but with options modified to force + placement on the input node. If remote_func_or_actor_class is None, + the options dict to pass to another remote function or + actor class as remote options kwargs. 
+ """ + + scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, soft=False + ) + + options = {"scheduling_strategy": scheduling_strategy} + + if remote_func_or_actor_class is None: + return options + + return remote_func_or_actor_class.options(**options) + + +def _force_on_current_node( + remote_func_or_actor_class: Optional[ + Union[ray.remote_function.RemoteFunction, ray.actor.ActorClass] + ] = None +) -> Union[Union[ray.remote_function.RemoteFunction, ray.actor.ActorClass], Dict]: + """Schedule a remote function or actor class on the current node. + + If using Ray Client, the current node is the client server node. + + Args: + remote_func_or_actor_class: A Ray remote function or actor class + to schedule on the current node. If None, this function will directly + return the options dict to pass to another remote function or actor class + as remote options. + Returns: + The provided remote function or actor class, but with options modified to force + placement on the current node. If remote_func_or_actor_class is None, + the options dict to pass to another remote function or + actor class as remote options kwargs. 
+ """ + current_node_id = ray.get_runtime_context().get_node_id() + return _force_on_node(current_node_id, remote_func_or_actor_class) diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/arrow.py b/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..a56a04869855e6f162ffd35eb7f8e63d0f854b0e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/arrow.py @@ -0,0 +1,119 @@ +import pickle +import typing + +import numpy as np +import pyarrow as pa +from packaging.version import parse as parse_version + +import ray.air.util.object_extensions.pandas +from ray._private.serialization import pickle_dumps +from ray._private.utils import _get_pyarrow_version +from ray.util.annotations import PublicAPI + +MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0") + +_VER = _get_pyarrow_version() +PYARROW_VERSION = None if _VER is None else parse_version(_VER) + + +def _object_extension_type_allowed() -> bool: + return ( + PYARROW_VERSION is not None + and PYARROW_VERSION >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS + ) + + +# Please see https://arrow.apache.org/docs/python/extending_types.html for more info +@PublicAPI(stability="alpha") +class ArrowPythonObjectType(pa.ExtensionType): + """Defines a new Arrow extension type for Python objects. 
+ We do not require a parametrized type, so the constructor does not + take any arguments + """ + + def __init__(self) -> None: + # Defines the underlying storage type as the PyArrow LargeBinary type + super().__init__(pa.large_binary(), "ray.data.arrow_pickled_object") + + def __arrow_ext_serialize__(self) -> bytes: + # Since there are no type parameters, we are free to return empty + return b"" + + @classmethod + def __arrow_ext_deserialize__( + cls, storage_type: pa.DataType, serialized: bytes + ) -> "ArrowPythonObjectType": + return ArrowPythonObjectType() + + def __arrow_ext_scalar_class__(self) -> type: + """Returns the scalar class of the extension type. Indexing out of the + PyArrow extension array will return instances of this type. + """ + return ArrowPythonObjectScalar + + def __arrow_ext_class__(self) -> type: + """Returns the array type of the extension type. Selecting one array + out of the ChunkedArray that makes up a column in a Table with + this custom type will return an instance of this type. + """ + return ArrowPythonObjectArray + + def to_pandas_dtype(self): + """Pandas interoperability type. This describes the Pandas counterpart + to the Arrow type. See https://pandas.pydata.org/docs/development/extending.html + for more information. + """ + return ray.air.util.object_extensions.pandas.PythonObjectDtype() + + def __reduce__(self): + # Earlier PyArrow versions require custom pickling behavior. 
+ return self.__arrow_ext_deserialize__, ( + self.storage_type, + self.__arrow_ext_serialize__(), + ) + + +@PublicAPI(stability="alpha") +class ArrowPythonObjectScalar(pa.ExtensionScalar): + """Scalar class for ArrowPythonObjectType""" + + def as_py(self) -> typing.Any: + if not isinstance(self.value, pa.LargeBinaryScalar): + raise RuntimeError( + f"{type(self.value)} is not the expected LargeBinaryScalar" + ) + return pickle.load(pa.BufferReader(self.value.as_buffer())) + + +@PublicAPI(stability="alpha") +class ArrowPythonObjectArray(pa.ExtensionArray): + """Array class for ArrowPythonObjectType""" + + def from_objects( + objects: typing.Union[np.ndarray, typing.Iterable[typing.Any]] + ) -> "ArrowPythonObjectArray": + if isinstance(objects, np.ndarray): + objects = objects.tolist() + type_ = ArrowPythonObjectType() + all_dumped_bytes = [] + for obj in objects: + dumped_bytes = pickle_dumps( + obj, "Error pickling object to convert to Arrow" + ) + all_dumped_bytes.append(dumped_bytes) + arr = pa.array(all_dumped_bytes, type=type_.storage_type) + return ArrowPythonObjectArray.from_storage(type_, arr) + + def to_numpy( + self, zero_copy_only: bool = False, writable: bool = False + ) -> np.ndarray: + arr = np.empty(len(self), dtype=object) + arr[:] = self.to_pylist() + return arr + + +try: + pa.register_extension_type(ArrowPythonObjectType()) +except pa.ArrowKeyError: + # Already registered + pass diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/pandas.py b/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/pandas.py new file mode 100644 index 0000000000000000000000000000000000000000..dbc5732f350b8c89d314b8a634c8809ad7e817b3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/object_extensions/pandas.py @@ -0,0 +1,146 @@ +import collections.abc +import typing + +import numpy as np +import pandas as pd +import pyarrow as pa +from pandas._libs import lib +from pandas._typing import ArrayLike, Dtype, 
PositionalIndexer, TakeIndexer, npt + +import ray.air.util.object_extensions.arrow +from ray.util.annotations import PublicAPI + + +# See https://pandas.pydata.org/docs/development/extending.html for more information. +@PublicAPI(stability="alpha") +class PythonObjectArray(pd.api.extensions.ExtensionArray): + """Implements the Pandas extension array interface for the Arrow object array""" + + def __init__(self, values: collections.abc.Iterable[typing.Any]): + vals = list(values) + self.values = np.empty(len(vals), dtype=object) + self.values[:] = vals + + @classmethod + def _from_sequence( + cls, + scalars: collections.abc.Sequence[typing.Any], + *, + dtype: typing.Union[Dtype, None] = None, + copy: bool = False, + ) -> "PythonObjectArray": + return PythonObjectArray(scalars) + + @classmethod + def _from_factorized( + cls, values: collections.abc.Sequence[typing.Any], original: "PythonObjectArray" + ) -> "PythonObjectArray": + return PythonObjectArray(values) + + def __getitem__(self, item: PositionalIndexer) -> typing.Any: + return self.values[item] + + def __setitem__(self, key, value) -> None: + self.values[key] = value + + def __len__(self) -> int: + return len(self.values) + + def __eq__(self, other: object) -> ArrayLike: + if isinstance(other, PythonObjectArray): + return self.values == other.values + elif isinstance(other, np.ndarray): + return self.values == other + else: + return NotImplemented + + def to_numpy( + self, + dtype: typing.Union["npt.DTypeLike", None] = None, + copy: bool = False, + na_value: object = lib.no_default, + ) -> np.ndarray: + result = self.values + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + + @property + def dtype(self) -> pd.api.extensions.ExtensionDtype: + return PythonObjectDtype() + + @property + def nbytes(self) -> int: + return self.values.nbytes + + def __arrow_array__(self, type=None): + return 
ray.air.util.object_extensions.arrow.ArrowPythonObjectArray.from_objects( + self.values + ) + + def isna(self) -> np.ndarray: + return pd.isnull(self.values) + + def take( + self, + indices: TakeIndexer, + *, + allow_fill: bool = False, + fill_value: typing.Any = None, + ) -> "PythonObjectArray": + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = pd.core.algorithms.take( + self.values, indices, allow_fill=allow_fill, fill_value=fill_value + ) + return self._from_sequence(result, dtype=self.dtype) + + def copy(self) -> "PythonObjectArray": + return PythonObjectArray(self.values) + + @classmethod + def _concat_same_type( + cls, to_concat: collections.abc.Sequence["PythonObjectArray"] + ) -> "PythonObjectArray": + values_to_concat = [element.values for element in to_concat] + return cls(np.concatenate(values_to_concat)) + + +@PublicAPI(stability="alpha") +@pd.api.extensions.register_extension_dtype +class PythonObjectDtype(pd.api.extensions.ExtensionDtype): + @classmethod + def construct_from_string(cls, string: str): + if string != "python_object()": + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + return cls() + + @property + def type(self): + """ + The scalar type for the array, e.g. ``int`` + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``, assuming + that value is valid (not NA). NA values do not need to be + instances of `type`. + """ + return object + + @property + def name(self) -> str: + return "python_object()" + + @classmethod + def construct_array_type(cls: type) -> type: + """ + Return the array type associated with this dtype. 
+ """ + return PythonObjectArray + + def __from_arrow__( + self, array: typing.Union[pa.Array, pa.ChunkedArray] + ) -> PythonObjectArray: + return PythonObjectArray(array.to_pylist()) diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__init__.py b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4548521f8cbcb2d763a91a2f1a0eee9fb4bd575 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/arrow.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/arrow.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b097e67a649566d3944f1bafe1148594963d4eac Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/arrow.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/pandas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/pandas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac4b03b06be3cf5f0a7931e35969a247a5fe2928 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/pandas.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/utils.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e904ade212ecfde4db5a6ff1fa64fe57043fe9e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/arrow.py b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..b8b62d86fb1ded7ba49900e7ed7c76fdcd3a99ad --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/air/util/tensor_extensions/arrow.py @@ -0,0 +1,1224 @@ +import abc +import itertools +import json +import logging +import sys +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import pyarrow as pa +from packaging.version import parse as parse_version + +from ray._private.utils import _get_pyarrow_version +from ray.air.constants import TENSOR_COLUMN_NAME +from ray.air.util.tensor_extensions.utils import ( + _is_ndarray_tensor, + _is_ndarray_variable_shaped_tensor, + create_ragged_ndarray, +) +from ray.data._internal.util import GiB +from ray.util import log_once +from ray.util.annotations import DeveloperAPI, PublicAPI + +PYARROW_VERSION = _get_pyarrow_version() +if PYARROW_VERSION is not None: + PYARROW_VERSION = parse_version(PYARROW_VERSION) +# Minimum version of Arrow that supports ExtensionScalars. +# TODO(Clark): Remove conditional definition once we only support Arrow 8.0.0+. +MIN_PYARROW_VERSION_SCALAR = parse_version("8.0.0") +# Minimum version of Arrow that supports subclassable ExtensionScalars. +# TODO(Clark): Remove conditional definition once we only support Arrow 9.0.0+. 
+MIN_PYARROW_VERSION_SCALAR_SUBCLASS = parse_version("9.0.0") +# Minimum version supporting `zero_copy_only` flag in `ChunkedArray.to_numpy` +MIN_PYARROW_VERSION_CHUNKED_ARRAY_TO_NUMPY_ZERO_COPY_ONLY = parse_version("13.0.0") + +NUM_BYTES_PER_UNICODE_CHAR = 4 + +# NOTE: Overflow threshold in bytes for most Arrow types using int32 as +# its offsets +INT32_OVERFLOW_THRESHOLD = 2 * GiB + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class ArrowConversionError(Exception): + """Error raised when there is an issue converting data to Arrow.""" + + MAX_DATA_STR_LEN = 200 + + def __init__(self, data_str: str): + if len(data_str) > self.MAX_DATA_STR_LEN: + data_str = data_str[: self.MAX_DATA_STR_LEN] + "..." + message = f"Error converting data to Arrow: {data_str}" + super().__init__(message) + + +def _arrow_supports_extension_scalars(): + """ + Whether Arrow ExtensionScalars are supported in the current pyarrow version. + + This returns True if the pyarrow version is 8.0.0+, or if the pyarrow version is + unknown. + """ + # TODO(Clark): Remove utility once we only support Arrow 8.0.0+. + return PYARROW_VERSION is None or PYARROW_VERSION >= MIN_PYARROW_VERSION_SCALAR + + +def _arrow_extension_scalars_are_subclassable(): + """ + Whether Arrow ExtensionScalars support subclassing in the current pyarrow version. + + This returns True if the pyarrow version is 9.0.0+, or if the pyarrow version is + unknown. + """ + # TODO(Clark): Remove utility once we only support Arrow 9.0.0+. + return ( + PYARROW_VERSION is None + or PYARROW_VERSION >= MIN_PYARROW_VERSION_SCALAR_SUBCLASS + ) + + +@DeveloperAPI +def pyarrow_table_from_pydict( + pydict: Dict[str, Union[List[Any], pa.Array]], +) -> pa.Table: + """ + Convert a Python dictionary to a pyarrow Table. + + Raises: + ArrowConversionError: if the conversion fails. 
+ """ + try: + return pa.Table.from_pydict(pydict) + except Exception as e: + raise ArrowConversionError(str(pydict)) from e + + +@DeveloperAPI(stability="alpha") +def convert_to_pyarrow_array(column_values: np.ndarray, column_name: str) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while utilizing + both Arrow's natively supported types as well as custom extension types: + + - ArrowTensorArray (for tensors) + - ArrowPythonObjectArray (for user-defined python class objects, as well as + any python object that aren't represented by a corresponding Arrow's native + scalar type) + """ + + try: + # Since Arrow does NOT support tensors (aka multidimensional arrays) natively, + # we have to make sure that we handle this case utilizing `ArrowTensorArray` + # extension type + if column_name == TENSOR_COLUMN_NAME or _is_ndarray_tensor(column_values): + from ray.data.extensions.tensor_extension import ArrowTensorArray + + return ArrowTensorArray.from_numpy(column_values, column_name) + else: + return _convert_to_pyarrow_native_array(column_values, column_name) + + except ArrowConversionError as ace: + from ray.data import DataContext + from ray.data.extensions.object_extension import ( + ArrowPythonObjectArray, + _object_extension_type_allowed, + ) + + enable_fallback_config: Optional[ + bool + ] = DataContext.get_current().enable_fallback_to_arrow_object_ext_type + + if not _object_extension_type_allowed(): + object_ext_type_fallback_allowed = False + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python" + f" objects (due to unsupported Arrow version {PYARROW_VERSION}, " + f"min required version is {MIN_PYARROW_VERSION_SCALAR_SUBCLASS})" + ) + else: + # NOTE: By default setting is unset which (for compatibility reasons) + # is allowing the fallback + object_ext_type_fallback_allowed = ( + enable_fallback_config is None or enable_fallback_config + ) + + if object_ext_type_fallback_allowed: + object_ext_type_detail = ( + 
"falling back to serialize as pickled python objects" + ) + else: + object_ext_type_detail = ( + "skipping fallback to serialize as pickled python objects " + "(due to DataContext.enable_fallback_to_arrow_object_ext_type " + "= False)" + ) + + if not object_ext_type_fallback_allowed: + # To avoid logging following warning for every block it's + # only going to be logged in following cases + # - When fallback is disallowed, and + # - Fallback configuration is not set or set to false, and + # - It's being logged for the first time + if not enable_fallback_config and log_once( + "_fallback_to_arrow_object_extension_type_warning" + ): + logger.warning( + f"Failed to convert column '{column_name}' into pyarrow " + f"array due to: {ace}; {object_ext_type_detail}", + exc_info=ace, + ) + + # If `ArrowPythonObjectType` is not supported raise original exception + raise + + # Otherwise, attempt to fall back to serialize as python objects + return ArrowPythonObjectArray.from_objects(column_values) + + +def _convert_to_pyarrow_native_array( + column_values: np.ndarray, column_name: str +) -> pa.Array: + """Converts provided NumPy `ndarray` into PyArrow's `array` while only utilizing + Arrow's natively supported types (ie no custom extension types)""" + + try: + # NOTE: We explicitly infer PyArrow `DataType` so that + # we can perform upcasting to be able to accommodate + # blocks that are larger than 2Gb in size (limited + # by int32 offsets used by Arrow internally) + dtype = _infer_pyarrow_type(column_values) + + logger.log( + logging.getLevelName("TRACE"), + f"Inferred dtype of '{dtype}' for column '{column_name}'", + ) + + return pa.array(column_values, type=dtype) + except Exception as e: + raise ArrowConversionError(str(column_values)) from e + + +def _infer_pyarrow_type(column_values: np.ndarray) -> Optional[pa.DataType]: + """Infers target Pyarrow `DataType` based on the provided + columnar values. 
+ + NOTE: This is a wrapper on top of `pa.infer_type(...)` utility + performing up-casting of `binary` and `string` types to + corresponding `large_binary` and `large_string` types in case + any of the array elements exceeds 2Gb in size therefore + making it impossible for original types to accommodate such + values. + + Unfortunately, for unknown reasons PA doesn't perform + that upcasting itself henceforth we have to do perform + it manually + + Args: + column_values: List of columnar values + + Returns: + Instance of PyArrow's `DataType` based on the provided + column values + """ + + if len(column_values) == 0: + return None + + inferred_pa_dtype = pa.infer_type(column_values) + + def _len_gt_overflow_threshold(obj: Any) -> bool: + # NOTE: This utility could be seeing objects other than strings or bytes in + # cases when column contains non-scalar non-homogeneous object types as + # column values, therefore making Arrow unable to infer corresponding + # column type appropriately, therefore falling back to assume the type + # of the first element in the list. + # + # Check out test cases for this method for an additional context. 
+ if isinstance(obj, (str, bytes)): + return len(obj) > INT32_OVERFLOW_THRESHOLD + + return False + + if pa.types.is_binary(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_binary() + elif pa.types.is_string(inferred_pa_dtype) and any( + [_len_gt_overflow_threshold(v) for v in column_values] + ): + return pa.large_string() + + return inferred_pa_dtype + + +@DeveloperAPI +def get_arrow_extension_tensor_types(): + """Returns list of extension types of Arrow Array holding + multidimensional tensors + """ + return ( + *get_arrow_extension_fixed_shape_tensor_types(), + *get_arrow_extension_variable_shape_tensor_types(), + ) + + +@DeveloperAPI +def get_arrow_extension_fixed_shape_tensor_types(): + """Returns list of Arrow extension types holding multidimensional + tensors of *fixed* shape + """ + return ArrowTensorType, ArrowTensorTypeV2 + + +@DeveloperAPI +def get_arrow_extension_variable_shape_tensor_types(): + """Returns list of Arrow extension types holding multidimensional + tensors of *fixed* shape + """ + return (ArrowVariableShapedTensorType,) + + +class _BaseFixedShapeArrowTensorType(pa.ExtensionType, abc.ABC): + """ + Arrow ExtensionType for an array of fixed-shaped, homogeneous-typed + tensors. + + This is the Arrow side of TensorDtype. + + See Arrow extension type docs: + https://arrow.apache.org/docs/python/extending_types.html#defining-extension-types-user-defined-types + """ + + def __init__( + self, shape: Tuple[int, ...], tensor_dtype: pa.DataType, ext_type_id: str + ): + self._shape = shape + + super().__init__(tensor_dtype, ext_type_id) + + @property + def shape(self): + """ + Shape of contained tensors. + """ + return self._shape + + @property + def scalar_type(self): + """Returns the type of the underlying tensor elements.""" + return self.storage_type.value_type + + def to_pandas_dtype(self): + """ + Convert Arrow extension type to corresponding Pandas dtype. 
+ + Returns: + An instance of pd.api.extensions.ExtensionDtype. + """ + from ray.air.util.tensor_extensions.pandas import TensorDtype + + return TensorDtype(self._shape, self.scalar_type.to_pandas_dtype()) + + def __reduce__(self): + return self.__arrow_ext_deserialize__, ( + self.storage_type, + self.__arrow_ext_serialize__(), + ) + + def __arrow_ext_serialize__(self): + return json.dumps(self._shape).encode() + + def __arrow_ext_class__(self): + """ + ExtensionArray subclass with custom logic for this array of tensors + type. + + Returns: + A subclass of pd.api.extensions.ExtensionArray. + """ + return ArrowTensorArray + + if _arrow_extension_scalars_are_subclassable(): + # TODO(Clark): Remove this version guard once we only support Arrow 9.0.0+. + def __arrow_ext_scalar_class__(self): + """ + ExtensionScalar subclass with custom logic for this array of tensors type. + """ + return ArrowTensorScalar + + if _arrow_supports_extension_scalars(): + # TODO(Clark): Remove this version guard once we only support Arrow 8.0.0+. + def _extension_scalar_to_ndarray( + self, scalar: pa.ExtensionScalar + ) -> np.ndarray: + """ + Convert an ExtensionScalar to a tensor element. + """ + raw_values = scalar.value.values + shape = scalar.type.shape + value_type = raw_values.type + offset = raw_values.offset + data_buffer = raw_values.buffers()[1] + return _to_ndarray_helper(shape, value_type, offset, data_buffer) + + def __str__(self) -> str: + return ( + f"numpy.ndarray(shape={self.shape}, dtype={self.storage_type.value_type})" + ) + + def __repr__(self) -> str: + return str(self) + + @classmethod + def _need_variable_shaped_tensor_array( + cls, + array_types: Sequence[ + Union[ + "ArrowTensorType", "ArrowTensorTypeV2", "ArrowVariableShapedTensorType" + ] + ], + ) -> bool: + """ + Whether the provided list of tensor types needs a variable-shaped + representation (i.e. `ArrowVariableShapedTensorType`) when concatenating + or chunking. 
If one or more of the tensor types in `array_types` are + variable-shaped and/or any of the tensor arrays have a different shape + than the others, a variable-shaped tensor array representation will be + required and this method will return True. + + Args: + array_types: List of tensor types to check if a variable-shaped + representation is required for concatenation + + Returns: + True if concatenating arrays with types `array_types` requires + a variable-shaped representation + """ + shape = None + for arr_type in array_types: + # If at least one of the arrays is variable-shaped, we can immediately + # short-circuit since we require a variable-shaped representation. + if isinstance(arr_type, ArrowVariableShapedTensorType): + return True + if not isinstance(arr_type, get_arrow_extension_fixed_shape_tensor_types()): + raise ValueError( + "All provided array types must be an instance of either " + "ArrowTensorType or ArrowVariableShapedTensorType, but " + f"got {arr_type}" + ) + # We need variable-shaped representation if any of the tensor arrays have + # different shapes. + if shape is not None and arr_type.shape != shape: + return True + shape = arr_type.shape + return False + + +@PublicAPI(stability="beta") +class ArrowTensorType(_BaseFixedShapeArrowTensorType): + """Arrow ExtensionType (v1) for tensors. + + NOTE: This type does *NOT* support tensors larger than 4Gb (due to + overflow of int32 offsets utilized inside Pyarrow `ListType`) + """ + + OFFSET_DTYPE = np.int32 + + def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType): + """ + Construct the Arrow extension type for array of fixed-shaped tensors. + + Args: + shape: Shape of contained tensors. + dtype: pyarrow dtype of tensor elements. 
+ """ + + super().__init__(shape, pa.list_(dtype), "ray.data.arrow_tensor") + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + shape = tuple(json.loads(serialized)) + return cls(shape, storage_type.value_type) + + +@PublicAPI(stability="alpha") +class ArrowTensorTypeV2(_BaseFixedShapeArrowTensorType): + """Arrow ExtensionType (v2) for tensors (supporting tensors > 4Gb).""" + + OFFSET_DTYPE = np.int64 + + def __init__(self, shape: Tuple[int, ...], dtype: pa.DataType): + """ + Construct the Arrow extension type for array of fixed-shaped tensors. + + Args: + shape: Shape of contained tensors. + dtype: pyarrow dtype of tensor elements. + """ + + super().__init__(shape, pa.large_list(dtype), "ray.data.arrow_tensor_v2") + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + shape = tuple(json.loads(serialized)) + return cls(shape, storage_type.value_type) + + +if _arrow_extension_scalars_are_subclassable(): + # TODO(Clark): Remove this version guard once we only support Arrow 9.0.0+. + @PublicAPI(stability="beta") + class ArrowTensorScalar(pa.ExtensionScalar): + def as_py(self) -> np.ndarray: + return self.type._extension_scalar_to_ndarray(self) + + def __array__(self) -> np.ndarray: + return self.as_py() + + +# TODO(Clark): Remove this mixin once we only support Arrow 9.0.0+. +class _ArrowTensorScalarIndexingMixin: + """ + A mixin providing support for scalar indexing in tensor extension arrays for + Arrow < 9.0.0, before full ExtensionScalar support was added. This mixin overrides + __getitem__, __iter__, and to_pylist. + """ + + # This mixin will be a no-op (no methods added) for Arrow 9.0.0+. + if not _arrow_extension_scalars_are_subclassable(): + # NOTE: These __iter__ and to_pylist definitions are shared for both + # Arrow < 8.0.0 and Arrow 8.*. + def __iter__(self): + # Override pa.Array.__iter__() in order to return an iterator of + # properly shaped tensors instead of an iterator of flattened tensors. 
+ # See comment in above __getitem__ method. + for i in range(len(self)): + # Use overridden __getitem__ method. + yield self.__getitem__(i) + + def to_pylist(self): + # Override pa.Array.to_pylist() due to a lack of ExtensionScalar + # support (see comment in __getitem__). + return list(self) + + if _arrow_supports_extension_scalars(): + # NOTE(Clark): This __getitem__ override is only needed for Arrow 8.*, + # before ExtensionScalar subclassing support was added. + # TODO(Clark): Remove these methods once we only support Arrow 9.0.0+. + def __getitem__(self, key): + # This __getitem__ hook allows us to support proper indexing when + # accessing a single tensor (a "scalar" item of the array). Without this + # hook for integer keys, the indexing will fail on pyarrow < 9.0.0 due + # to a lack of ExtensionScalar subclassing support. + + # NOTE(Clark): We'd like to override the pa.Array.getitem() helper + # instead, which would obviate the need for overriding __iter__(), but + # unfortunately overriding Cython cdef methods with normal Python + # methods isn't allowed. + item = super().__getitem__(key) + if not isinstance(key, slice): + item = item.type._extension_scalar_to_ndarray(item) + return item + + else: + # NOTE(Clark): This __getitem__ override is only needed for Arrow < 8.0.0, + # before any ExtensionScalar support was added. + # TODO(Clark): Remove these methods once we only support Arrow 8.0.0+. + def __getitem__(self, key): + # This __getitem__ hook allows us to support proper indexing when + # accessing a single tensor (a "scalar" item of the array). Without this + # hook for integer keys, the indexing will fail on pyarrow < 8.0.0 due + # to a lack of ExtensionScalar support. + + # NOTE(Clark): We'd like to override the pa.Array.getitem() helper + # instead, which would obviate the need for overriding __iter__(), but + # unfortunately overriding Cython cdef methods with normal Python + # methods isn't allowed. 
+ if isinstance(key, slice): + return super().__getitem__(key) + return self._to_numpy(key) + + +# NOTE: We need to inherit from the mixin before pa.ExtensionArray to ensure that the +# mixin's overriding methods appear first in the MRO. +# TODO(Clark): Remove this mixin once we only support Arrow 9.0.0+. +@PublicAPI(stability="beta") +class ArrowTensorArray(_ArrowTensorScalarIndexingMixin, pa.ExtensionArray): + """ + An array of fixed-shape, homogeneous-typed tensors. + + This is the Arrow side of TensorArray. + + See Arrow docs for customizing extension arrays: + https://arrow.apache.org/docs/python/extending_types.html#custom-extension-array-class + """ + + @classmethod + def from_numpy( + cls, + arr: Union[np.ndarray, Iterable[np.ndarray]], + column_name: Optional[str] = None, + ) -> Union["ArrowTensorArray", "ArrowVariableShapedTensorArray"]: + """ + Convert an ndarray or an iterable of ndarrays to an array of homogeneous-typed + tensors. If given fixed-shape tensor elements, this will return an + ``ArrowTensorArray``; if given variable-shape tensor elements, this will return + an ``ArrowVariableShapedTensorArray``. + + Args: + arr: An ndarray or an iterable of ndarrays. + column_name: Optional. Used only in logging outputs to provide + additional details. + + Returns: + - If fixed-shape tensor elements, an ``ArrowTensorArray`` containing + ``len(arr)`` tensors of fixed shape. + - If variable-shaped tensor elements, an ``ArrowVariableShapedTensorArray`` + containing ``len(arr)`` tensors of variable shape. + - If scalar elements, a ``pyarrow.Array``. + """ + if not isinstance(arr, np.ndarray) and isinstance(arr, Iterable): + arr = list(arr) + + if isinstance(arr, (list, tuple)) and arr and isinstance(arr[0], np.ndarray): + # Stack ndarrays and pass through to ndarray handling logic below. 
+ try: + arr = np.stack(arr, axis=0) + except ValueError as ve: + logger.warning( + f"Failed to stack lists due to: {ve}; " + f"falling back to using np.array(..., dtype=object)", + exc_info=ve, + ) + + # ndarray stacking may fail if the arrays are heterogeneously-shaped. + arr = np.array(arr, dtype=object) + if not isinstance(arr, np.ndarray): + raise ValueError( + f"Must give ndarray or iterable of ndarrays, got {type(arr)} {arr}" + ) + + try: + return cls._from_numpy(arr) + except Exception as e: + data_str = "" + if column_name: + data_str += f"column: '{column_name}', " + data_str += f"shape: {arr.shape}, dtype: {arr.dtype}, data: {arr}" + raise ArrowConversionError(data_str) from e + + @classmethod + def _from_numpy( + cls, + arr: np.ndarray, + ) -> Union["ArrowTensorArray", "ArrowVariableShapedTensorArray"]: + if len(arr) > 0 and np.isscalar(arr[0]): + # Elements are scalar so a plain Arrow Array will suffice. + return pa.array(arr) + if _is_ndarray_variable_shaped_tensor(arr): + # Tensor elements have variable shape, so we delegate to + # ArrowVariableShapedTensorArray. + return ArrowVariableShapedTensorArray.from_numpy(arr) + if not arr.flags.c_contiguous: + # We only natively support C-contiguous ndarrays. + arr = np.ascontiguousarray(arr) + scalar_dtype = pa.from_numpy_dtype(arr.dtype) + if pa.types.is_string(scalar_dtype): + if arr.dtype.byteorder == ">" or ( + arr.dtype.byteorder == "=" and sys.byteorder == "big" + ): + raise ValueError( + "Only little-endian string tensors are supported, " + f"but got: {arr.dtype}", + ) + scalar_dtype = pa.binary(arr.dtype.itemsize) + outer_len = arr.shape[0] + element_shape = arr.shape[1:] + total_num_items = arr.size + num_items_per_element = np.prod(element_shape) if element_shape else 1 + + # Data buffer. + if pa.types.is_boolean(scalar_dtype): + # NumPy doesn't represent boolean arrays as bit-packed, so we manually + # bit-pack the booleans before handing the buffer off to Arrow. 
+ # NOTE: Arrow expects LSB bit-packed ordering. + # NOTE: This creates a copy. + arr = np.packbits(arr, bitorder="little") + data_buffer = pa.py_buffer(arr) + data_array = pa.Array.from_buffers( + scalar_dtype, total_num_items, [None, data_buffer] + ) + + from ray.data import DataContext + + if DataContext.get_current().use_arrow_tensor_v2: + pa_type_ = ArrowTensorTypeV2(element_shape, scalar_dtype) + else: + pa_type_ = ArrowTensorType(element_shape, scalar_dtype) + + # Create Offset buffer + offset_buffer = pa.py_buffer( + pa_type_.OFFSET_DTYPE( + [i * num_items_per_element for i in range(outer_len + 1)] + ) + ) + + storage = pa.Array.from_buffers( + pa_type_.storage_type, + outer_len, + [None, offset_buffer], + children=[data_array], + ) + + return pa.ExtensionArray.from_storage(pa_type_, storage) + + def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): + """ + Helper for getting either an element of the array of tensors as an + ndarray, or the entire array of tensors as a single ndarray. + + Args: + index: The index of the tensor element that we wish to return as + an ndarray. If not given, the entire array of tensors is + returned as an ndarray. + zero_copy_only: If True, an exception will be raised if the + conversion to a NumPy array would require copying the + underlying data (e.g. in presence of nulls, or for + non-primitive types). This argument is currently ignored, so + zero-copy isn't enforced even if this argument is true. + + Returns: + The corresponding tensor element as an ndarray if an index was + given, or the entire array of tensors as an ndarray otherwise. + """ + # TODO(Clark): Enforce zero_copy_only. + # TODO(Clark): Support strides? 
+ # Buffers schema: + # [None, offset_buffer, None, data_buffer] + buffers = self.buffers() + data_buffer = buffers[3] + storage_list_type = self.storage.type + value_type = storage_list_type.value_type + ext_dtype = value_type.to_pandas_dtype() + shape = self.type.shape + if pa.types.is_boolean(value_type): + # Arrow boolean array buffers are bit-packed, with 8 entries per byte, + # and are accessed via bit offsets. + buffer_item_width = value_type.bit_width + else: + # We assume all other array types are accessed via byte array + # offsets. + buffer_item_width = value_type.bit_width // 8 + # Number of items per inner ndarray. + num_items_per_element = np.prod(shape) if shape else 1 + # Base offset into data buffer, e.g. due to zero-copy slice. + buffer_offset = self.offset * num_items_per_element + # Offset of array data in buffer. + offset = buffer_item_width * buffer_offset + if index is not None: + # Getting a single tensor element of the array. + offset_buffer = buffers[1] + offset_array = np.ndarray( + (len(self),), buffer=offset_buffer, dtype=self.type.OFFSET_DTYPE + ) + # Offset into array to reach logical index. + index_offset = offset_array[index] + # Add the index offset to the base offset. + offset += buffer_item_width * index_offset + else: + # Getting the entire array of tensors. + shape = (len(self),) + shape + if pa.types.is_boolean(value_type): + # Special handling for boolean arrays, since Arrow bit-packs boolean arrays + # while NumPy does not. + # Cast as uint8 array and let NumPy unpack into a boolean view. + # Offset into uint8 array, where each element is a bucket for 8 booleans. + byte_bucket_offset = offset // 8 + # Offset for a specific boolean, within a uint8 array element. + bool_offset = offset % 8 + # The number of uint8 array elements (buckets) that our slice spans. + # Note that, due to the offset for a specific boolean, the slice can span + # byte boundaries even if it contains less than 8 booleans. 
+ num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8) + # Construct the uint8 array view on the buffer. + arr = np.ndarray( + (num_boolean_byte_buckets,), + dtype=np.uint8, + buffer=data_buffer, + offset=byte_bucket_offset, + ) + # Unpack into a byte per boolean, using LSB bit-packed ordering. + arr = np.unpackbits(arr, bitorder="little") + # Interpret buffer as boolean array. + return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset) + # Special handling of binary/string types. Assumes unicode string tensor columns + if pa.types.is_fixed_size_binary(value_type): + ext_dtype = np.dtype( + f" Union["ArrowTensorArray", "ArrowVariableShapedTensorArray"]: + """ + Concatenate multiple tensor arrays. + + If one or more of the tensor arrays in to_concat are variable-shaped and/or any + of the tensor arrays have a different shape than the others, a variable-shaped + tensor array will be returned. + """ + to_concat_types = [arr.type for arr in to_concat] + if ArrowTensorType._need_variable_shaped_tensor_array(to_concat_types): + # Need variable-shaped tensor array. + # TODO(Clark): Eliminate this NumPy roundtrip by directly constructing the + # underlying storage array buffers (NumPy roundtrip will not be zero-copy + # for e.g. boolean arrays). + # NOTE(Clark): Iterating over a tensor extension array converts each element + # to an ndarray view. + return ArrowVariableShapedTensorArray.from_numpy( + [e for a in to_concat for e in a] + ) + else: + storage = pa.concat_arrays([c.storage for c in to_concat]) + + return ArrowTensorArray.from_storage(to_concat[0].type, storage) + + @classmethod + def _chunk_tensor_arrays( + cls, arrs: Sequence[Union["ArrowTensorArray", "ArrowVariableShapedTensorArray"]] + ) -> pa.ChunkedArray: + """ + Create a ChunkedArray from multiple tensor arrays. 
+ """ + arrs_types = [arr.type for arr in arrs] + if ArrowTensorType._need_variable_shaped_tensor_array(arrs_types): + new_arrs = [] + for a in arrs: + if isinstance(a.type, get_arrow_extension_fixed_shape_tensor_types()): + a = a.to_variable_shaped_tensor_array() + assert isinstance(a.type, ArrowVariableShapedTensorType) + new_arrs.append(a) + arrs = new_arrs + return pa.chunked_array(arrs) + + def to_variable_shaped_tensor_array(self) -> "ArrowVariableShapedTensorArray": + """ + Convert this tensor array to a variable-shaped tensor array. + + This is primarily used when concatenating multiple chunked tensor arrays where + at least one chunked array is already variable-shaped and/or the shapes of the + chunked arrays differ, in which case the resulting concatenated tensor array + will need to be in the variable-shaped representation. + """ + # TODO(Clark): Eliminate this NumPy roundtrip by directly constructing the + # underlying storage array buffers (NumPy roundtrip will not be zero-copy for + # e.g. boolean arrays). + return ArrowVariableShapedTensorArray.from_numpy(self.to_numpy()) + + +@PublicAPI(stability="alpha") +class ArrowVariableShapedTensorType(pa.ExtensionType): + """ + Arrow ExtensionType for an array of heterogeneous-shaped, homogeneous-typed + tensors. + + This is the Arrow side of TensorDtype for tensor elements with different shapes. + Note that this extension only supports non-ragged tensor elements; i.e., when + considering each tensor element in isolation, they must have a well-defined, + non-ragged shape. + + See Arrow extension type docs: + https://arrow.apache.org/docs/python/extending_types.html#defining-extension-types-user-defined-types + """ + + def __init__(self, dtype: pa.DataType, ndim: int): + """ + Construct the Arrow extension type for array of heterogeneous-shaped tensors. + + Args: + dtype: pyarrow dtype of tensor elements. + ndim: The number of dimensions in the tensor elements. 
+ """ + self._ndim = ndim + super().__init__( + pa.struct( + [("data", pa.large_list(dtype)), ("shape", pa.list_(pa.int64()))] + ), + "ray.data.arrow_variable_shaped_tensor", + ) + + def to_pandas_dtype(self): + """ + Convert Arrow extension type to corresponding Pandas dtype. + + Returns: + An instance of pd.api.extensions.ExtensionDtype. + """ + from ray.air.util.tensor_extensions.pandas import TensorDtype + + return TensorDtype( + (None,) * self.ndim, + self.storage_type["data"].type.value_type.to_pandas_dtype(), + ) + + @property + def ndim(self) -> int: + """Return the number of dimensions in the tensor elements.""" + return self._ndim + + @property + def scalar_type(self): + """Returns the type of the underlying tensor elements.""" + data_field_index = self.storage_type.get_field_index("data") + return self.storage_type[data_field_index].type.value_type + + def __reduce__(self): + return self.__arrow_ext_deserialize__, ( + self.storage_type, + self.__arrow_ext_serialize__(), + ) + + def __arrow_ext_serialize__(self): + return json.dumps(self._ndim).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + ndim = json.loads(serialized) + dtype = storage_type["data"].type.value_type + return cls(dtype, ndim) + + def __arrow_ext_class__(self): + """ + ExtensionArray subclass with custom logic for this array of tensors + type. + + Returns: + A subclass of pd.api.extensions.ExtensionArray. + """ + return ArrowVariableShapedTensorArray + + if _arrow_extension_scalars_are_subclassable(): + # TODO(Clark): Remove this version guard once we only support Arrow 9.0.0+. + def __arrow_ext_scalar_class__(self): + """ + ExtensionScalar subclass with custom logic for this array of tensors type. 
+ """ + return ArrowTensorScalar + + def __str__(self) -> str: + dtype = self.storage_type["data"].type.value_type + return f"numpy.ndarray(ndim={self.ndim}, dtype={dtype})" + + def __repr__(self) -> str: + return str(self) + + if _arrow_supports_extension_scalars(): + # TODO(Clark): Remove this version guard once we only support Arrow 8.0.0+. + def _extension_scalar_to_ndarray( + self, scalar: pa.ExtensionScalar + ) -> np.ndarray: + """ + Convert an ExtensionScalar to a tensor element. + """ + data = scalar.value.get("data") + raw_values = data.values + + shape = tuple(scalar.value.get("shape").as_py()) + value_type = raw_values.type + offset = raw_values.offset + data_buffer = raw_values.buffers()[1] + return _to_ndarray_helper(shape, value_type, offset, data_buffer) + + +# NOTE: We need to inherit from the mixin before pa.ExtensionArray to ensure that the +# mixin's overriding methods appear first in the MRO. +# TODO(Clark): Remove this mixin once we only support Arrow 9.0.0+. +@PublicAPI(stability="alpha") +class ArrowVariableShapedTensorArray( + _ArrowTensorScalarIndexingMixin, pa.ExtensionArray +): + """ + An array of heterogeneous-shaped, homogeneous-typed tensors. + + This is the Arrow side of TensorArray for tensor elements that have differing + shapes. Note that this extension only supports non-ragged tensor elements; i.e., + when considering each tensor element in isolation, they must have a well-defined + shape. This extension also only supports tensor elements that all have the same + number of dimensions. + + See Arrow docs for customizing extension arrays: + https://arrow.apache.org/docs/python/extending_types.html#custom-extension-array-class + """ + + @classmethod + def from_numpy( + cls, arr: Union[np.ndarray, List[np.ndarray], Tuple[np.ndarray]] + ) -> "ArrowVariableShapedTensorArray": + """ + Convert an ndarray or an iterable of heterogeneous-shaped ndarrays to an array + of heterogeneous-shaped, homogeneous-typed tensors. 
+ + Args: + arr: An ndarray or an iterable of heterogeneous-shaped ndarrays. + + Returns: + An ArrowVariableShapedTensorArray containing len(arr) tensors of + heterogeneous shape. + """ + # Implementation note - Arrow representation of ragged tensors: + # + # We represent an array of ragged tensors using a struct array containing two + # fields: + # - data: a variable-sized list array, where each element in the array is a + # tensor element stored in a 1D (raveled) variable-sized list of the + # underlying scalar data type. + # - shape: a variable-sized list array containing the shapes of each tensor + # element. + if not isinstance(arr, (list, tuple, np.ndarray)): + raise ValueError( + "ArrowVariableShapedTensorArray can only be constructed from an " + f"ndarray or a list/tuple of ndarrays, but got: {type(arr)}" + ) + if len(arr) == 0: + # Empty ragged tensor arrays are not supported. + raise ValueError("Creating empty ragged tensor arrays is not supported.") + + # Whether all subndarrays are contiguous views of the same ndarray. + shapes, sizes, raveled = [], [], [] + ndim = None + for a in arr: + a = np.asarray(a) + if ndim is not None and a.ndim != ndim: + raise ValueError( + "ArrowVariableShapedTensorArray only supports tensor elements that " + "all have the same number of dimensions, but got tensor elements " + f"with dimensions: {ndim}, {a.ndim}" + ) + ndim = a.ndim + shapes.append(a.shape) + sizes.append(a.size) + # Convert to 1D array view; this should be zero-copy in the common case. + # NOTE: If array is not in C-contiguous order, this will convert it to + # C-contiguous order, incurring a copy. + a = np.ravel(a, order="C") + raveled.append(a) + # Get size offsets and total size. + sizes = np.array(sizes) + size_offsets = np.cumsum(sizes) + total_size = size_offsets[-1] + # Concatenate 1D views into a contiguous 1D array. 
+ if all(_is_contiguous_view(curr, prev) for prev, curr in _pairwise(raveled)): + # An optimized zero-copy path if raveled tensor elements are already + # contiguous in memory, e.g. if this tensor array has already done a + # roundtrip through our Arrow representation. + np_data_buffer = raveled[-1].base + else: + np_data_buffer = np.concatenate(raveled) + dtype = np_data_buffer.dtype + pa_dtype = pa.from_numpy_dtype(dtype) + if pa.types.is_string(pa_dtype): + if dtype.byteorder == ">" or ( + dtype.byteorder == "=" and sys.byteorder == "big" + ): + raise ValueError( + "Only little-endian string tensors are supported, " + f"but got: {dtype}" + ) + pa_dtype = pa.binary(dtype.itemsize) + if dtype.type is np.bool_: + # NumPy doesn't represent boolean arrays as bit-packed, so we manually + # bit-pack the booleans before handing the buffer off to Arrow. + # NOTE: Arrow expects LSB bit-packed ordering. + # NOTE: This creates a copy. + np_data_buffer = np.packbits(np_data_buffer, bitorder="little") + data_buffer = pa.py_buffer(np_data_buffer) + # Construct underlying data array. + value_array = pa.Array.from_buffers(pa_dtype, total_size, [None, data_buffer]) + # Construct array for offsets into the 1D data array, where each offset + # corresponds to a tensor element. + size_offsets = np.insert(size_offsets, 0, 0) + offset_array = pa.array(size_offsets) + data_array = pa.LargeListArray.from_arrays(offset_array, value_array) + # We store the tensor element shapes so we can reconstruct each tensor when + # converting back to NumPy ndarrays. + shape_array = pa.array(shapes) + # Build storage array containing tensor data and the tensor element shapes. 
+ storage = pa.StructArray.from_arrays( + [data_array, shape_array], + ["data", "shape"], + ) + type_ = ArrowVariableShapedTensorType(pa_dtype, ndim) + return pa.ExtensionArray.from_storage(type_, storage) + + def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): + """ + Helper for getting either an element of the array of tensors as an ndarray, or + the entire array of tensors as a single ndarray. + + Args: + index: The index of the tensor element that we wish to return as an + ndarray. If not given, the entire array of tensors is returned as an + ndarray. + zero_copy_only: If True, an exception will be raised if the conversion to a + NumPy array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). This argument is currently + ignored, so zero-copy isn't enforced even if this argument is true. + + Returns: + The corresponding tensor element as an ndarray if an index was given, or + the entire array of tensors as an ndarray otherwise. + """ + # TODO(Clark): Enforce zero_copy_only. + # TODO(Clark): Support strides? + if index is None: + # Get individual ndarrays for each tensor element. + arrs = [self._to_numpy(i, zero_copy_only) for i in range(len(self))] + # Return ragged NumPy ndarray in the ndarray of ndarray pointers + # representation. + return create_ragged_ndarray(arrs) + data = self.storage.field("data") + shapes = self.storage.field("shape") + + shape = shapes[index].as_py() + value_type = data.type.value_type + offset = data.offsets[index].as_py() + data_buffer = data.buffers()[3] + return _to_ndarray_helper(shape, value_type, offset, data_buffer) + + def to_numpy(self, zero_copy_only: bool = True): + """ + Convert the entire array of tensors into a single ndarray. + + Args: + zero_copy_only: If True, an exception will be raised if the conversion to a + NumPy array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). 
This argument is currently + ignored, so zero-copy isn't enforced even if this argument is true. + + Returns: + A single ndarray representing the entire array of tensors. + """ + return self._to_numpy(zero_copy_only=zero_copy_only) + + +def _is_contiguous_view(curr: np.ndarray, prev: Optional[np.ndarray]) -> bool: + """Check if the provided tensor element is contiguous with the previous tensor + element. + + Args: + curr: The tensor element whose contiguity that we wish to check. + prev: The previous tensor element in the tensor array. + + Returns: + Whether the provided tensor element is contiguous with the previous tensor + element. + """ + if ( + curr.base is None + or not curr.data.c_contiguous + or (prev is not None and curr.base is not prev.base) + ): + # curr is either: + # - not a view, + # - not in C-contiguous order, + # - a view that does not share its base with the other subndarrays. + return False + else: + # curr is a C-contiguous view that shares the same base with the seen + # subndarrays, but we need to confirm that it is contiguous with the + # previous subndarray. + if prev is not None and ( + _get_buffer_address(curr) - _get_buffer_address(prev) + != prev.base.dtype.itemsize * prev.size + ): + # This view is not contiguous with the previous view. + return False + else: + return True + + +def _get_buffer_address(arr: np.ndarray) -> int: + """Get the address of the buffer underlying the provided NumPy ndarray.""" + return arr.__array_interface__["data"][0] + + +def _pairwise(iterable): + # pairwise('ABCDEFG') --> AB BC CD DE EF FG + # Backport of itertools.pairwise for Python < 3.10. + a, b = itertools.tee(iterable) + next(b, None) + return zip(a, b) + + +def _to_ndarray_helper(shape, value_type, offset, data_buffer): + if pa.types.is_boolean(value_type): + # Arrow boolean array buffers are bit-packed, with 8 entries per byte, + # and are accessed via bit offsets. 
+ buffer_item_width = value_type.bit_width + else: + # We assume all other array types are accessed via byte array + # offsets. + buffer_item_width = value_type.bit_width // 8 + data_offset = buffer_item_width * offset + + if pa.types.is_boolean(value_type): + # Special handling for boolean arrays, since Arrow + # bit-packs boolean arrays while NumPy does not. + # Cast as uint8 array and let NumPy unpack into a boolean view. + # Offset into uint8 array, where each element is + # a bucket for 8 booleans. + byte_bucket_offset = data_offset // 8 + # Offset for a specific boolean, within a uint8 array element. + bool_offset = data_offset % 8 + # The number of uint8 array elements (buckets) that our slice spans. + # Note that, due to the offset for a specific boolean, + # the slice can span byte boundaries even if it contains + # less than 8 booleans. + num_boolean_byte_buckets = 1 + ((bool_offset + np.prod(shape) - 1) // 8) + # Construct the uint8 array view on the buffer. + arr = np.ndarray( + (num_boolean_byte_buckets,), + dtype=np.uint8, + buffer=data_buffer, + offset=byte_bucket_offset, + ) + # Unpack into a byte per boolean, using LSB bit-packed ordering. + arr = np.unpackbits(arr, bitorder="little") + # Interpret buffer as boolean array. + return np.ndarray(shape, dtype=np.bool_, buffer=arr, offset=bool_offset) + ext_dtype = value_type.to_pandas_dtype() + # Special handling of ragged string tensors + if pa.types.is_fixed_size_binary(value_type): + ext_dtype = np.dtype(f" bool: + """Return whether the provided NumPy ndarray is comprised of tensors. 
+ + NOTE: Tensor is defined as a NumPy array such that `len(arr.shape) > 1` + """ + + # Case of uniform-shaped (ie non-ragged) tensor + if arr.ndim > 1: + return True + + # Case of ragged tensor (as produced by `create_ragged_ndarray` utility) + elif ( + arr.dtype.type is np.object_ and len(arr) > 0 and isinstance(arr[0], np.ndarray) + ): + return True + + return False + + +def _is_ndarray_variable_shaped_tensor(arr: np.ndarray) -> bool: + """Return whether the provided NumPy ndarray is comprised of variable-shaped + tensors. + + NOTE: This is an O(rows) check. + """ + if arr.dtype.type is not np.object_: + return False + if len(arr) == 0: + return False + if not isinstance(arr[0], np.ndarray): + return False + shape = arr[0].shape + for a in arr[1:]: + if not isinstance(a, np.ndarray): + return False + if a.shape != shape: + return True + return True + + +def _create_possibly_ragged_ndarray( + values: Union[np.ndarray, "ABCSeries", Sequence[Any]] +) -> np.ndarray: + """ + Create a possibly ragged ndarray. + Using the np.array() constructor will fail to construct a ragged ndarray that has a + uniform first dimension (e.g. uniform channel dimension in imagery). This function + catches this failure and tries a create-and-fill method to construct the ragged + ndarray. + """ + try: + with warnings.catch_warnings(): + # For NumPy < 1.24, constructing a ragged ndarray directly via + # `np.array(...)` without the `dtype=object` parameter will raise a + # VisibleDeprecationWarning which we suppress. 
+ # More details: https://stackoverflow.com/q/63097829 + if np.lib.NumpyVersion(np.__version__) >= "2.0.0": + copy_if_needed = None + warning_type = np.exceptions.VisibleDeprecationWarning + else: + copy_if_needed = False + warning_type = np.VisibleDeprecationWarning + + warnings.simplefilter("ignore", category=warning_type) + arr = np.array(values, copy=copy_if_needed) + return arr + except ValueError as e: + # Constructing a ragged ndarray directly via `np.array(...)` + # without the `dtype=object` parameter will raise a ValueError. + # For NumPy < 1.24, the message is of the form: + # "could not broadcast input array from shape..." + # For NumPy >= 1.24, the message is of the form: + # "The requested array has an inhomogeneous shape..." + # More details: https://github.com/numpy/numpy/pull/22004 + error_str = str(e) + if ( + "could not broadcast input array from shape" in error_str + or "The requested array has an inhomogeneous shape" in error_str + ): + # Fall back to strictly creating a ragged ndarray. + return create_ragged_ndarray(values) + else: + # Re-raise original error if the failure wasn't a broadcast error. + raise e from None + + +@PublicAPI(stability="alpha") +def create_ragged_ndarray(values: Sequence[Any]) -> np.ndarray: + """Create an array that contains arrays of different length + + If you're working with variable-length arrays like images, use this function to + create ragged arrays instead of ``np.array``. + + .. note:: + ``np.array`` fails to construct ragged arrays if the input arrays have a uniform + first dimension: + + .. testsetup:: + + import numpy as np + from ray.air.util.tensor_extensions.utils import create_ragged_ndarray + + .. doctest:: + + >>> values = [np.zeros((3, 1)), np.zeros((3, 2))] + >>> np.array(values, dtype=object) + Traceback (most recent call last): + ... 
+ ValueError: could not broadcast input array from shape (3,1) into shape (3,) + >>> create_ragged_ndarray(values) + array([array([[0.], + [0.], + [0.]]), array([[0., 0.], + [0., 0.], + [0., 0.]])], dtype=object) + + Or if you're creating a ragged array from a single array: + + .. doctest:: + + >>> values = [np.zeros((3, 1))] + >>> np.array(values, dtype=object)[0].dtype + dtype('O') + >>> create_ragged_ndarray(values)[0].dtype + dtype('float64') + + ``create_ragged_ndarray`` avoids the limitations of ``np.array`` by creating an + empty array and filling it with pointers to the variable-length arrays. + """ # noqa: E501 + # Create an empty object-dtyped 1D array. + arr = np.empty(len(values), dtype=object) + # Try to fill the 1D array of pointers with the (ragged) tensors. + arr[:] = list(values) + return arr diff --git a/.venv/lib/python3.11/site-packages/xgrammar/xgrammar_bindings.cpython-311-x86_64-linux-gnu.so b/.venv/lib/python3.11/site-packages/xgrammar/xgrammar_bindings.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..b3346e0f83650fa3ad64918d4435fcc2f3aafc77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/xgrammar/xgrammar_bindings.cpython-311-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d49891b985816f83db0ca3a2325fde9a6651eafc06b890bee9f096d616f65f7 +size 971744