diff --git a/.gitattributes b/.gitattributes index e27c55413e2c90c83fe5c168d4b812c6a13a95d7..666a47cc600b18661deb3997e62e07250eb2b034 100644 --- a/.gitattributes +++ b/.gitattributes @@ -171,3 +171,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/aiohttp/_websocket/reader_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/thirdparty/tabulate/__pycache__/tabulate.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/ray/_private/__pycache__/process_watcher.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/__pycache__/process_watcher.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fcd03baf12046b8bd73d4832965c70eb296c0d3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/__pycache__/process_watcher.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__init__.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71550bc43b45e21c88029f1012b4ed5973468f9e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__init__.py @@ -0,0 +1,77 @@ +from typing import Set, Optional + +from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.accelerators.nvidia_gpu 
import NvidiaGPUAcceleratorManager +from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager +from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager +from ray._private.accelerators.tpu import TPUAcceleratorManager +from ray._private.accelerators.neuron import NeuronAcceleratorManager +from ray._private.accelerators.hpu import HPUAcceleratorManager +from ray._private.accelerators.npu import NPUAcceleratorManager + + +def get_all_accelerator_managers() -> Set[AcceleratorManager]: + """Get all accelerator managers supported by Ray.""" + return { + NvidiaGPUAcceleratorManager, + IntelGPUAcceleratorManager, + AMDGPUAcceleratorManager, + TPUAcceleratorManager, + NeuronAcceleratorManager, + HPUAcceleratorManager, + NPUAcceleratorManager, + } + + +def get_all_accelerator_resource_names() -> Set[str]: + """Get all resource names for accelerators.""" + return { + accelerator_manager.get_resource_name() + for accelerator_manager in get_all_accelerator_managers() + } + + +def get_accelerator_manager_for_resource( + resource_name: str, +) -> Optional[AcceleratorManager]: + """Get the corresponding accelerator manager for the given + accelerator resource name + + E.g., TPUAcceleratorManager is returned if resource name is "TPU" + """ + try: + return get_accelerator_manager_for_resource._resource_name_to_accelerator_manager.get( # noqa: E501 + resource_name, None + ) + except AttributeError: + # Lazy initialization. + resource_name_to_accelerator_manager = { + accelerator_manager.get_resource_name(): accelerator_manager + for accelerator_manager in get_all_accelerator_managers() + } + # Special handling for GPU resource name since multiple accelerator managers + # have the same GPU resource name. 
+ if AMDGPUAcceleratorManager.get_current_node_num_accelerators() > 0: + resource_name_to_accelerator_manager["GPU"] = AMDGPUAcceleratorManager + elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0: + resource_name_to_accelerator_manager["GPU"] = IntelGPUAcceleratorManager + else: + resource_name_to_accelerator_manager["GPU"] = NvidiaGPUAcceleratorManager + get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = ( + resource_name_to_accelerator_manager + ) + return resource_name_to_accelerator_manager.get(resource_name, None) + + +__all__ = [ + "NvidiaGPUAcceleratorManager", + "IntelGPUAcceleratorManager", + "AMDGPUAcceleratorManager", + "TPUAcceleratorManager", + "NeuronAcceleratorManager", + "HPUAcceleratorManager", + "NPUAcceleratorManager", + "get_all_accelerator_managers", + "get_all_accelerator_resource_names", + "get_accelerator_manager_for_resource", +] diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d28398b3a37fe9a4214daad6eb7f06800100ff3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/accelerator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/accelerator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c7509aa6ef7b8b1b346222a33588eccb43ab4ad Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/accelerator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/amd_gpu.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/amd_gpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74e33de0cd25ef8dac1529c7f9ce909b9f8f621b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/amd_gpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/hpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/hpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90a2403b357688e627428139a6e71352c6a5b68e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/hpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/intel_gpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/intel_gpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd79a52d55e666c6810afa52075586ff58c764b9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/intel_gpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/neuron.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/neuron.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e096da36830076ac2a457276daa27491efc5dc65 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/neuron.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/npu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/npu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0449ec02e6a2d984a2969320ec5172ea130559fb 
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/npu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/nvidia_gpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/nvidia_gpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..857265915ba3b8f06580e772efbabc5b5541e36c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/nvidia_gpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/tpu.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/tpu.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f001939716c2d66e41f8cdce861fbfb8bef7bda4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/tpu.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/accelerator.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..70178094e14cd0cccdd35fe8013fc50f877345b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/accelerator.py @@ -0,0 +1,138 @@ +from abc import ABC, abstractmethod +from typing import Dict, Optional, List, Tuple + + +class AcceleratorManager(ABC): + """This class contains all the functions needed for supporting + an accelerator family in Ray.""" + + @staticmethod + @abstractmethod + def get_resource_name() -> str: + """Get the name of the resource representing this accelerator family. 
+ + Returns: + The resource name: e.g., the resource name for Nvidia GPUs is "GPU" + """ + + @staticmethod + @abstractmethod + def get_visible_accelerator_ids_env_var() -> str: + """Get the env var that sets the ids of visible accelerators of this family. + + Returns: + The env var for setting visible accelerator ids: e.g., + CUDA_VISIBLE_DEVICES for Nvidia GPUs. + """ + + @staticmethod + @abstractmethod + def get_current_node_num_accelerators() -> int: + """Get the total number of accelerators of this family on the current node. + + Returns: + The detected total number of accelerators of this family. + Return 0 if the current node doesn't contain accelerators of this family. + """ + + @staticmethod + @abstractmethod + def get_current_node_accelerator_type() -> Optional[str]: + """Get the type of the accelerator of this family on the current node. + + Currently Ray only supports single accelerator type of + an accelerator family on each node. + + The result should only be used when get_current_node_num_accelerators() > 0. + + Returns: + The detected accelerator type of this family: e.g., H100 for Nvidia GPU. + Return None if it's unknown or the node doesn't have + accelerators of this family. + """ + + @staticmethod + @abstractmethod + def get_current_node_additional_resources() -> Optional[Dict[str, float]]: + """Get any additional resources required for the current node. + + In case a particular accelerator type requires considerations for + additional resources (e.g. for TPUs, providing the TPU pod type and + TPU name), this function can be used to provide the + additional logical resources. + + Returns: + A dictionary representing additional resources that may be + necessary for a particular accelerator type. + """ + + @staticmethod + @abstractmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + """Validate the resource request quantity of this accelerator resource. 
+ + Args: + quantity: The resource request quantity to be validated. + + Returns: + (valid, error_message) tuple: the first element of the tuple + indicates whether the given quantity is valid or not, + the second element is the error message + if the given quantity is invalid. + """ + + @staticmethod + @abstractmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + """Get the ids of accelerators of this family that are visible to the current process. + + Returns: + The list of visiable accelerator ids. + Return None if all accelerators are visible. + """ + + @staticmethod + @abstractmethod + def set_current_process_visible_accelerator_ids(ids: List[str]) -> None: + """Set the ids of accelerators of this family that are visible to the current process. + + Args: + ids: The ids of visible accelerators of this family. + """ + + @staticmethod + def get_ec2_instance_num_accelerators( + instance_type: str, instances: dict + ) -> Optional[int]: + """Get the number of accelerators of this family on ec2 instance with given type. + + Args: + instance_type: The ec2 instance type. + instances: Map from ec2 instance type to instance metadata returned by + ec2 `describe-instance-types`. + + Returns: + The number of accelerators of this family on the ec2 instance + with given type. + Return None if it's unknown. + """ + return None + + @staticmethod + def get_ec2_instance_accelerator_type( + instance_type: str, instances: dict + ) -> Optional[str]: + """Get the accelerator type of this family on ec2 instance with given type. + + Args: + instance_type: The ec2 instance type. + instances: Map from ec2 instance type to instance metadata returned by + ec2 `describe-instance-types`. + + Returns: + The accelerator type of this family on the ec2 instance with given type. + Return None if it's unknown. 
+ """ + return None diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/hpu.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/hpu.py new file mode 100644 index 0000000000000000000000000000000000000000..87bae0a9267ee0b4f69cf58f79af251eb4693b61 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/hpu.py @@ -0,0 +1,121 @@ +import os +import logging +from typing import Optional, List, Tuple +from functools import lru_cache +from importlib.util import find_spec + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + +HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES" +NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES" + + +@lru_cache() +def is_package_present(package_name: str) -> bool: + try: + return find_spec(package_name) is not None + except ModuleNotFoundError: + return False + + +HPU_PACKAGE_AVAILABLE = is_package_present("habana_frameworks") + + +class HPUAcceleratorManager(AcceleratorManager): + """Intel Habana(HPU) accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "HPU" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return HABANA_VISIBLE_DEVICES_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + hpu_visible_devices = os.environ.get( + HPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + + if hpu_visible_devices is None: + return None + + if hpu_visible_devices == "": + return [] + + return list(hpu_visible_devices.split(",")) + + @staticmethod + def get_current_node_num_accelerators() -> int: + """Attempt to detect the number of HPUs on this machine. + Returns: + The number of HPUs if any were detected, otherwise 0. 
+ """ + if HPU_PACKAGE_AVAILABLE: + import habana_frameworks.torch.hpu as torch_hpu + + if torch_hpu.is_available(): + return torch_hpu.device_count() + else: + logging.info("HPU devices not available") + return 0 + else: + return 0 + + @staticmethod + def is_initialized() -> bool: + """Attempt to check if HPU backend is initialized. + Returns: + True if backend initialized else False. + """ + if HPU_PACKAGE_AVAILABLE: + import habana_frameworks.torch.hpu as torch_hpu + + if torch_hpu.is_available() and torch_hpu.is_initialized(): + return True + else: + return False + else: + return False + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + """Attempt to detect the HPU family type. + Returns: + The device name (GAUDI, GAUDI2) if detected else None. + """ + if HPUAcceleratorManager.is_initialized(): + import habana_frameworks.torch.hpu as torch_hpu + + return f"Intel-{torch_hpu.get_device_name()}" + else: + logging.info("HPU type cannot be detected") + return None + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + if isinstance(quantity, float) and not quantity.is_integer(): + return ( + False, + f"{HPUAcceleratorManager.get_resource_name()} resource quantity" + " must be whole numbers. 
" + f"The specified quantity {quantity} is invalid.", + ) + else: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_hpu_devices: List[str], + ) -> None: + if os.environ.get(NOSET_HABANA_VISIBLE_MODULES_ENV_VAR): + return + + os.environ[ + HPUAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = ",".join([str(i) for i in visible_hpu_devices]) diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/intel_gpu.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/intel_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..bd6f1c0fcbb1426f761ab407a92a188bb185dcec --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/intel_gpu.py @@ -0,0 +1,103 @@ +import os +import logging +from typing import Optional, List, Tuple + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + +ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR" +NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR" +ONEAPI_DEVICE_BACKEND_TYPE = "level_zero" +ONEAPI_DEVICE_TYPE = "gpu" + + +class IntelGPUAcceleratorManager(AcceleratorManager): + """Intel GPU accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "GPU" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return ONEAPI_DEVICE_SELECTOR_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + oneapi_visible_devices = os.environ.get( + IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + if oneapi_visible_devices is None: + return None + + if oneapi_visible_devices == "": + return [] + + if oneapi_visible_devices == "NoDevFiles": + return [] + + prefix = ONEAPI_DEVICE_BACKEND_TYPE + ":" + + return list(oneapi_visible_devices.split(prefix)[1].split(",")) + + @staticmethod + def 
get_current_node_num_accelerators() -> int: + try: + import dpctl + except ImportError: + dpctl = None + if dpctl is None: + return 0 + + num_gpus = 0 + try: + dev_info = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE + context = dpctl.SyclContext(dev_info) + num_gpus = context.device_count + except Exception: + num_gpus = 0 + return num_gpus + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + """Get the name of first Intel GPU. (supposed only one GPU type on a node) + Example: + name: 'Intel(R) Data Center GPU Max 1550' + return name: 'Intel-GPU-Max-1550' + Returns: + A string representing the name of Intel GPU type. + """ + try: + import dpctl + except ImportError: + dpctl = None + if dpctl is None: + return None + + accelerator_type = None + try: + dev_info = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE + ":0" + dev = dpctl.SyclDevice(dev_info) + accelerator_type = "Intel-GPU-" + "-".join(dev.name.split(" ")[-2:]) + except Exception: + accelerator_type = None + return accelerator_type + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_xpu_devices: List[str], + ) -> None: + if os.environ.get(NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR): + return + + prefix = ONEAPI_DEVICE_BACKEND_TYPE + ":" + os.environ[ + IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = prefix + ",".join([str(i) for i in visible_xpu_devices]) diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/neuron.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/neuron.py new file mode 100644 index 0000000000000000000000000000000000000000..7ba9eeb0666b0c0c20c7c6ae89f27cde6011450d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/neuron.py @@ -0,0 +1,132 @@ +import os +import sys +import json +import logging +import 
subprocess +from typing import Optional, List, Tuple + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + +NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" +NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = ( + "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES" +) + +# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch +# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch +# Subject to removal after the information is available via public API +AWS_NEURON_INSTANCE_MAP = { + "trn1.2xlarge": 2, + "trn1.32xlarge": 32, + "trn1n.32xlarge": 32, + "inf2.xlarge": 2, + "inf2.8xlarge": 2, + "inf2.24xlarge": 12, + "inf2.48xlarge": 24, +} + + +class NeuronAcceleratorManager(AcceleratorManager): + """AWS Inferentia and Trainium accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "neuron_cores" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return NEURON_RT_VISIBLE_CORES_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + neuron_visible_cores = os.environ.get( + NeuronAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + + if neuron_visible_cores is None: + return None + + if neuron_visible_cores == "": + return [] + + return list(neuron_visible_cores.split(",")) + + @staticmethod + def get_current_node_num_accelerators() -> int: + """ + Attempt to detect the number of Neuron cores on this machine. + + Returns: + The number of Neuron cores if any were detected, otherwise 0. 
+ """ + nc_count: int = 0 + neuron_path = "/opt/aws/neuron/bin/" + if sys.platform.startswith("linux") and os.path.isdir(neuron_path): + result = subprocess.run( + [os.path.join(neuron_path, "neuron-ls"), "--json-output"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + if result.returncode == 0 and result.stdout: + neuron_devices = json.loads(result.stdout) + for neuron_device in neuron_devices: + nc_count += neuron_device.get("nc_count", 0) + return nc_count + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + from ray.util.accelerators import AWS_NEURON_CORE + + return AWS_NEURON_CORE + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + if isinstance(quantity, float) and not quantity.is_integer(): + return ( + False, + f"{NeuronAcceleratorManager.get_resource_name()} resource quantity" + " must be whole numbers. " + f"The specified quantity {quantity} is invalid.", + ) + else: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_neuron_core_ids: List[str], + ) -> None: + """Set the NEURON_RT_VISIBLE_CORES environment variable based on + given visible_neuron_core_ids. + + Args: + visible_neuron_core_ids (List[str]): List of int representing core IDs. + """ + if os.environ.get(NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR): + return + + os.environ[ + NeuronAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = ",".join([str(i) for i in visible_neuron_core_ids]) + + @staticmethod + def get_ec2_instance_num_accelerators( + instance_type: str, instances: dict + ) -> Optional[int]: + # TODO: AWS SDK (public API) doesn't yet expose the NeuronCore + # information. It will be available (work-in-progress) + # as xxAcceleratorInfo in InstanceTypeInfo. 
+ # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_InstanceTypeInfo.html + # See https://github.com/ray-project/ray/issues/38473 + return AWS_NEURON_INSTANCE_MAP.get(instance_type.lower(), None) + + @staticmethod + def get_ec2_instance_accelerator_type( + instance_type: str, instances: dict + ) -> Optional[str]: + from ray.util.accelerators import AWS_NEURON_CORE + + return AWS_NEURON_CORE diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/npu.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/npu.py new file mode 100644 index 0000000000000000000000000000000000000000..d98434cd302ae64c6cfc0df328580fc4d549855a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/npu.py @@ -0,0 +1,99 @@ +import os +import glob +import logging +from typing import Optional, List, Tuple + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + +ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" +NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = ( + "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES" +) + + +class NPUAcceleratorManager(AcceleratorManager): + """Ascend NPU accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "NPU" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return ASCEND_RT_VISIBLE_DEVICES_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + ascend_visible_devices = os.environ.get( + NPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + + if ascend_visible_devices is None: + return None + + if ascend_visible_devices == "": + return [] + + if ascend_visible_devices == "NoDevFiles": + return [] + + return list(ascend_visible_devices.split(",")) + + @staticmethod + def get_current_node_num_accelerators() -> int: + """Attempt to detect the number of NPUs on this machine. 
+ + NPU chips are represented as devices within `/dev/`, either as `/dev/davinci?`. + + Returns: + The number of NPUs if any were detected, otherwise 0. + """ + try: + import acl + + device_count, ret = acl.rt.get_device_count() + if ret == 0: + return device_count + except Exception as e: + logger.debug("Could not import AscendCL: %s", e) + + try: + npu_files = glob.glob("/dev/davinci[0-9]*") + return len(npu_files) + except Exception as e: + logger.debug("Failed to detect number of NPUs: %s", e) + return 0 + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + """Get the type of the Ascend NPU on the current node. + + Returns: + A string of the type, such as "Ascend910A", "Ascend910B", "Ascend310P1". + """ + try: + import acl + + return acl.get_soc_name() + except Exception: + logger.exception("Failed to detect NPU type.") + return None + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_npu_devices: List[str], + ) -> None: + if os.environ.get(NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR): + return + + os.environ[ + NPUAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = ",".join([str(i) for i in visible_npu_devices]) diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/nvidia_gpu.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/nvidia_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..2eaafb5a6e06e006cdd950f0f033c2c2724ac0de --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/nvidia_gpu.py @@ -0,0 +1,128 @@ +import re +import os +import logging +from typing import Optional, List, Tuple + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + +CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" 
+NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" + +# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have +# the form "Tesla V100-SXM2-16GB" or "Tesla K80"). +NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)") + + +class NvidiaGPUAcceleratorManager(AcceleratorManager): + """Nvidia GPU accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "GPU" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return CUDA_VISIBLE_DEVICES_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + cuda_visible_devices = os.environ.get( + NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + if cuda_visible_devices is None: + return None + + if cuda_visible_devices == "": + return [] + + if cuda_visible_devices == "NoDevFiles": + return [] + + return list(cuda_visible_devices.split(",")) + + @staticmethod + def get_current_node_num_accelerators() -> int: + import ray._private.thirdparty.pynvml as pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError: + return 0 # pynvml init failed + device_count = pynvml.nvmlDeviceGetCount() + pynvml.nvmlShutdown() + return device_count + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + import ray._private.thirdparty.pynvml as pynvml + + try: + pynvml.nvmlInit() + except pynvml.NVMLError: + return None # pynvml init failed + device_count = pynvml.nvmlDeviceGetCount() + cuda_device_type = None + if device_count > 0: + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + device_name = pynvml.nvmlDeviceGetName(handle) + if isinstance(device_name, bytes): + device_name = device_name.decode("utf-8") + cuda_device_type = ( + NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(device_name) + ) + pynvml.nvmlShutdown() + return cuda_device_type + + @staticmethod + def _gpu_name_to_accelerator_type(name): + if name is None: + return None + match 
= NVIDIA_GPU_NAME_PATTERN.match(name) + return match.group(1) if match else None + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_cuda_devices: List[str], + ) -> None: + if os.environ.get(NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR): + return + + os.environ[ + NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = ",".join([str(i) for i in visible_cuda_devices]) + + @staticmethod + def get_ec2_instance_num_accelerators( + instance_type: str, instances: dict + ) -> Optional[int]: + if instance_type not in instances: + return None + + gpus = instances[instance_type].get("GpuInfo", {}).get("Gpus") + if gpus is not None: + # TODO(ameer): currently we support one gpu type per node. + assert len(gpus) == 1 + return gpus[0]["Count"] + return None + + @staticmethod + def get_ec2_instance_accelerator_type( + instance_type: str, instances: dict + ) -> Optional[str]: + if instance_type not in instances: + return None + + gpus = instances[instance_type].get("GpuInfo", {}).get("Gpus") + if gpus is not None: + # TODO(ameer): currently we support one gpu type per node. 
+ assert len(gpus) == 1 + return gpus[0]["Name"] + return None diff --git a/.venv/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py new file mode 100644 index 0000000000000000000000000000000000000000..1349606e8ad3f6bb91057cd5af033aa133e2bcc3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py @@ -0,0 +1,393 @@ +import os +import re +import glob +import requests +import logging +from functools import lru_cache +from typing import Dict, Optional, List, Tuple + +from ray._private.accelerators.accelerator import AcceleratorManager + +logger = logging.getLogger(__name__) + + +TPU_VALID_CHIP_OPTIONS = (1, 2, 4, 8) +GKE_TPU_ACCELERATOR_TYPE_ENV_VAR = "TPU_ACCELERATOR_TYPE" +GKE_TPU_WORKER_ID_ENV_VAR = "TPU_WORKER_ID" +GKE_TPU_NAME_ENV_VAR = "TPU_NAME" + +# Constants for accessing the `accelerator-type` from TPU VM +# instance metadata. +# See https://cloud.google.com/compute/docs/metadata/overview +# for more details about VM instance metadata. +GCE_TPU_ACCELERATOR_ENDPOINT = ( + "http://metadata.google.internal/computeMetadata/v1/instance/attributes/" +) +GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"} +GCE_TPU_ACCELERATOR_KEY = "accelerator-type" +GCE_TPU_INSTANCE_ID_KEY = "instance-id" +GCE_TPU_WORKER_ID_KEY = "agent-worker-number" + +TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" + +NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS" + +# The following defines environment variables that allow +# us to access a subset of TPU visible chips. +# +# See: https://github.com/google/jax/issues/14977 for an example/more details. 
+TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR = "TPU_CHIPS_PER_HOST_BOUNDS" +TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG = "1,1,1" +TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG = "1,2,1" + +TPU_HOST_BOUNDS_ENV_VAR = "TPU_HOST_BOUNDS" +TPU_SINGLE_HOST_BOUNDS = "1,1,1" + + +def _get_tpu_metadata(key: str) -> Optional[str]: + """Poll and get TPU metadata.""" + try: + accelerator_type_request = requests.get( + os.path.join(GCE_TPU_ACCELERATOR_ENDPOINT, key), + headers=GCE_TPU_HEADERS, + ) + if ( + accelerator_type_request.status_code == 200 + and accelerator_type_request.text + ): + return accelerator_type_request.text + else: + logging.debug( + "Unable to poll TPU GCE Metadata. Got " + f"status code: {accelerator_type_request.status_code} and " + f"content: {accelerator_type_request.text}" + ) + except requests.RequestException as e: + logging.debug("Unable to poll the TPU GCE Metadata: %s", e) + return None + + +class TPUAcceleratorManager(AcceleratorManager): + """Google TPU accelerators.""" + + @staticmethod + def get_resource_name() -> str: + return "TPU" + + @staticmethod + def get_visible_accelerator_ids_env_var() -> str: + return TPU_VISIBLE_CHIPS_ENV_VAR + + @staticmethod + def get_current_process_visible_accelerator_ids() -> Optional[List[str]]: + tpu_visible_chips = os.environ.get( + TPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None + ) + + if tpu_visible_chips is None: + return None + + if tpu_visible_chips == "": + return [] + + return list(tpu_visible_chips.split(",")) + + @staticmethod + @lru_cache() + def get_current_node_num_accelerators() -> int: + """Attempt to detect the number of TPUs on this machine. + + TPU chips are represented as devices within `/dev/`, either as + `/dev/accel*` or `/dev/vfio/*`. + + Returns: + The number of TPUs if any were detected, otherwise 0. 
+ """ + accel_files = glob.glob("/dev/accel*") + if accel_files: + return len(accel_files) + + try: + vfio_entries = os.listdir("/dev/vfio") + numeric_entries = [int(entry) for entry in vfio_entries if entry.isdigit()] + return len(numeric_entries) + except FileNotFoundError as e: + logger.debug("Failed to detect number of TPUs: %s", e) + return 0 + + @staticmethod + def is_valid_tpu_accelerator_type(tpu_accelerator_type: str) -> bool: + """Check whether the tpu accelerator_type is formatted correctly. + + The accelerator_type field follows a form of v{generation}-{cores/chips}. + + See the following for more information: + https://cloud.google.com/sdk/gcloud/reference/compute/tpus/tpu-vm/accelerator-types/describe + + Args: + tpu_accelerator_type: The string representation of the accelerator type + to be checked for validity. + + Returns: + True if it's valid, false otherwise. + """ + expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$") + if not expected_pattern.match(tpu_accelerator_type): + return False + return True + + @staticmethod + def validate_resource_request_quantity( + quantity: float, + ) -> Tuple[bool, Optional[str]]: + if quantity not in TPU_VALID_CHIP_OPTIONS: + return ( + False, + f"The number of requested 'TPU' was set to {quantity} which " + "is not a supported chip configuration. Supported configs: " + f"{TPU_VALID_CHIP_OPTIONS}", + ) + else: + return (True, None) + + @staticmethod + def set_current_process_visible_accelerator_ids( + visible_tpu_chips: List[str], + ) -> None: + """Set TPU environment variables based on the provided visible_tpu_chips. + + To access a subset of the TPU visible chips, we must use a combination of + environment variables that tells the compiler (via ML framework) the: + - Visible chips + - The physical bounds of chips per host + - The host bounds within the context of a TPU pod. + + See: https://github.com/google/jax/issues/14977 for an example/more details. 
+ + Args: + visible_tpu_chips (List[str]): List of int representing TPU chips. + """ + if os.environ.get(NOSET_TPU_VISIBLE_CHIPS_ENV_VAR): + return + + num_visible_tpu_chips = len(visible_tpu_chips) + num_accelerators_on_node = ( + TPUAcceleratorManager.get_current_node_num_accelerators() + ) + if num_visible_tpu_chips == num_accelerators_on_node: + # Let the ML framework use the defaults + os.environ.pop(TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR, None) + os.environ.pop(TPU_HOST_BOUNDS_ENV_VAR, None) + return + os.environ[ + TPUAcceleratorManager.get_visible_accelerator_ids_env_var() + ] = ",".join([str(i) for i in visible_tpu_chips]) + if num_visible_tpu_chips == 1: + os.environ[ + TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR + ] = TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG + os.environ[TPU_HOST_BOUNDS_ENV_VAR] = TPU_SINGLE_HOST_BOUNDS + elif num_visible_tpu_chips == 2: + os.environ[ + TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR + ] = TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG + os.environ[TPU_HOST_BOUNDS_ENV_VAR] = TPU_SINGLE_HOST_BOUNDS + + @staticmethod + def _get_current_node_tpu_pod_type() -> Optional[str]: + """Get the TPU pod type of the current node if applicable. + + Individual TPU VMs within a TPU pod must know what type + of pod it is a part of. This is necessary for the + ML framework to work properly. + + The logic is different if the TPU was provisioned via: + ``` + gcloud tpus tpu-vm create ... + ``` + (i.e. a GCE VM), vs through GKE: + - GCE VMs will always have a metadata server to poll this info + - GKE VMS will have environment variables preset. + + Returns: + A string representing the current TPU pod type, e.g. + v4-16. 
+ + """ + # Start with GKE-based check + accelerator_type = os.getenv(GKE_TPU_ACCELERATOR_TYPE_ENV_VAR, "") + if not accelerator_type: + # GCE-based VM check + accelerator_type = _get_tpu_metadata(key=GCE_TPU_ACCELERATOR_KEY) + if accelerator_type and TPUAcceleratorManager.is_valid_tpu_accelerator_type( + tpu_accelerator_type=accelerator_type + ): + return accelerator_type + logging.debug("Failed to get a valid accelerator type.") + return None + + @staticmethod + def get_current_node_tpu_name() -> Optional[str]: + """Return the name of the TPU pod that this worker node is a part of. + + For instance, if the TPU was created with name "my-tpu", this function + will return "my-tpu". + + If created through the Ray cluster launcher, the + name will typically be something like "ray-my-tpu-cluster-worker-aa946781-tpu". + + In case the TPU was created through KubeRay, we currently expect that the + environment variable TPU_NAME is set per TPU pod slice, in which case + this function will return the value of that environment variable. 
+ + """ + try: + # Start with GKE-based check + tpu_name = os.getenv(GKE_TPU_NAME_ENV_VAR, None) + if not tpu_name: + # GCE-based VM check + tpu_name = _get_tpu_metadata(key=GCE_TPU_INSTANCE_ID_KEY) + return tpu_name + except ValueError as e: + logging.debug("Could not get TPU name: %s", e) + return None + + @staticmethod + def _get_current_node_tpu_worker_id() -> Optional[int]: + """Return the worker index of the TPU pod.""" + try: + # Start with GKE-based check + worker_id = os.getenv(GKE_TPU_WORKER_ID_ENV_VAR, None) + if not worker_id: + # GCE-based VM check + worker_id = _get_tpu_metadata(key=GCE_TPU_WORKER_ID_KEY) + if worker_id: + return int(worker_id) + else: + return None + except ValueError as e: + logging.debug("Could not get TPU worker id: %s", e) + return None + + @staticmethod + def get_num_workers_in_current_tpu_pod() -> Optional[int]: + """Return the total number of workers in a TPU pod.""" + tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type() + cores_per_host = TPUAcceleratorManager.get_current_node_num_accelerators() + if tpu_pod_type and cores_per_host > 0: + num_chips_or_cores = int(tpu_pod_type.split("-")[1]) + return num_chips_or_cores // cores_per_host + else: + logging.debug("Could not get num workers in TPU pod.") + return None + + @staticmethod + def get_current_node_accelerator_type() -> Optional[str]: + """Attempt to detect the TPU accelerator type. + + The output of this function will return the "ray accelerator type" + resource (e.g. TPU-V4) that indicates the TPU version. + + We also expect that our TPU nodes contain a "TPU pod type" + resource, which indicates information about the topology of + the TPU pod slice. + + We expect that the "TPU pod type" resource to be used when + running multi host workers, i.e. when TPU units are pod slices. + + We expect that the "ray accelerator type" resource to be used when + running single host workers, i.e. when TPU units are single hosts. 
+ + Returns: + A string representing the TPU accelerator type, + e.g. "TPU-V2", "TPU-V3", "TPU-V4" if applicable, else None. + + """ + + def tpu_pod_type_to_ray_accelerator_type( + tpu_pod_type: str, + ) -> Optional[str]: + return "TPU-" + str(tpu_pod_type.split("-")[0].upper()) + + ray_accelerator_type = None + tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type() + + if tpu_pod_type is not None: + ray_accelerator_type = tpu_pod_type_to_ray_accelerator_type( + tpu_pod_type=tpu_pod_type + ) + if ray_accelerator_type is None: + logger.info( + "While trying to autodetect a TPU type, " + f"received malformed accelerator_type: {tpu_pod_type}" + ) + + if ray_accelerator_type is None: + logging.info("Failed to auto-detect TPU type.") + + return ray_accelerator_type + + def get_current_node_additional_resources() -> Optional[Dict[str, float]]: + """Get additional resources required for TPU nodes. + + This will populate the TPU pod type and the TPU name which + is used for TPU pod execution. + + When running workloads on a TPU pod, we need a way to run + the same binary on every worker in the TPU pod. + + See https://jax.readthedocs.io/en/latest/multi_process.html + for more information. + + To do this in ray, we take advantage of custom resources. We + mark worker 0 of the TPU pod as a "coordinator" that identifies + the other workers in the TPU pod. We therefore need: + - worker 0 to be targetable. + - all workers in the TPU pod to have a unique identifier consistent + within a TPU pod. + + So assuming we want to run the following workload: + + @ray.remote + def my_jax_fn(): + import jax + return jax.device_count() + + We could broadcast this on a TPU pod (e.g. 
a v4-16) as follows: + + @ray.remote(resources={"TPU-v4-16-head"}) + def run_jax_fn(executable): + # Note this will execute on worker 0 + tpu_name = ray.util.accelerators.tpu.get_tpu_pod_name() + num_workers = ray.util.accelerators.tpu.get_tpu_num_workers() + tpu_executable = executable.options(resources={"TPU": 4, tpu_name: 1}) + return [tpu_executable.remote() for _ in range(num_workers)] + + Returns: + A dictionary representing additional resources that may be + necessary for a particular accelerator type. + + """ + resources = {} + tpu_name = TPUAcceleratorManager.get_current_node_tpu_name() + worker_id = TPUAcceleratorManager._get_current_node_tpu_worker_id() + tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type() + + if tpu_name and worker_id is not None and tpu_pod_type: + pod_head_resource_name = f"TPU-{tpu_pod_type}-head" + # Add the name of the TPU to the resource. + resources[tpu_name] = 1 + # Only add in the TPU pod type resource to worker 0. + if worker_id == 0: + resources[pod_head_resource_name] = 1 + else: + logging.info( + "Failed to configure TPU pod. 
Got: " + "tpu_name: %s, worker_id: %s, accelerator_type: %s", + tpu_name, + worker_id, + tpu_pod_type, + ) + if resources: + return resources + return None diff --git a/.venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so b/.venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..9e372717ed4022ee51bc656029c205dc4ce03877 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87371c20cf73e0fe5df7f255ec4523368eff6d0a6e61a6fd6a730892a134935 +size 800728 diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/__init__.py b/.venv/lib/python3.11/site-packages/ray/_private/usage/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a65778f0645edb1b0c0eb785caf7c7f296d47ed Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_constants.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_constants.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d93038f3703ab6e1231d9ceaa48c9a202bdbd1c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_constants.cpython-311.pyc 
differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_lib.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_lib.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d68388d3f02fe25f47263f9ae8cefef12b578f65 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_lib.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_constants.py b/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..2b5b97ad175e334aa963e2f02629329830c57ff8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_constants.py @@ -0,0 +1,63 @@ +SCHEMA_VERSION = "0.1" + +# The key to store / obtain cluster metadata. +CLUSTER_METADATA_KEY = b"CLUSTER_METADATA" + +# The name of a json file where usage stats will be written. +USAGE_STATS_FILE = "usage_stats.json" + +USAGE_STATS_ENABLED_ENV_VAR = "RAY_USAGE_STATS_ENABLED" + +USAGE_STATS_SOURCE_ENV_VAR = "RAY_USAGE_STATS_SOURCE" + +USAGE_STATS_SOURCE_OSS = "OSS" + +USAGE_STATS_ENABLED_FOR_CLI_MESSAGE = ( + "Usage stats collection is enabled. To disable this, add `--disable-usage-stats` " + "to the command that starts the cluster, or run the following command:" + " `ray disable-usage-stats` before starting the cluster. " + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." +) + +USAGE_STATS_ENABLED_FOR_RAY_INIT_MESSAGE = ( + "Usage stats collection is enabled. To disable this, run the following command:" + " `ray disable-usage-stats` before starting Ray. " + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." +) + +USAGE_STATS_DISABLED_MESSAGE = "Usage stats collection is disabled." 
+ +USAGE_STATS_ENABLED_BY_DEFAULT_FOR_CLI_MESSAGE = ( + "Usage stats collection is enabled by default without user confirmation " + "because this terminal is detected to be non-interactive. " + "To disable this, add `--disable-usage-stats` to the command that starts " + "the cluster, or run the following command:" + " `ray disable-usage-stats` before starting the cluster. " + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." +) + +USAGE_STATS_ENABLED_BY_DEFAULT_FOR_RAY_INIT_MESSAGE = ( + "Usage stats collection is enabled by default for nightly wheels. " + "To disable this, run the following command:" + " `ray disable-usage-stats` before starting Ray. " + "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details." +) + +USAGE_STATS_CONFIRMATION_MESSAGE = ( + "Enable usage stats collection? " + "This prompt will auto-proceed in 10 seconds to avoid blocking cluster startup." +) + +LIBRARY_USAGE_SET_NAME = "library_usage_" + +HARDWARE_USAGE_SET_NAME = "hardware_usage_" + +# Keep in-sync with the same constants defined in usage_stats_client.h +EXTRA_USAGE_TAG_PREFIX = "extra_usage_tag_" +USAGE_STATS_NAMESPACE = "usage_stats" + +KUBERNETES_SERVICE_HOST_ENV = "KUBERNETES_SERVICE_HOST" +KUBERAY_ENV = "RAY_USAGE_STATS_KUBERAY_IN_USE" + +PROVIDER_KUBERNETES_GENERIC = "kubernetes" +PROVIDER_KUBERAY = "kuberay" diff --git a/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_lib.py b/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..558f56c602ef0b39e6d084f60ffbfbb026ff9175 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/usage/usage_lib.py @@ -0,0 +1,964 @@ +"""This is the module that is in charge of Ray usage report (telemetry) APIs. + +NOTE: Ray's usage report is currently "on by default". + One could opt-out, see details at https://docs.ray.io/en/master/cluster/usage-stats.html. 
# noqa + +Ray usage report follows the specification from +https://docs.google.com/document/d/1ZT-l9YbGHh-iWRUC91jS-ssQ5Qe2UQ43Lsoc1edCalc/edit#heading=h.17dss3b9evbj. # noqa + +# Module + +The module consists of 2 parts. + +## Public API +It contains public APIs to obtain usage report information. +APIs will be added before the usage report becomes opt-in by default. + +## Internal APIs for usage processing/report +The telemetry report consists of 5 components. This module is in charge of the top 2 layers. + +Report -> usage_lib +--------------------- +Usage data processing -> usage_lib +--------------------- +Data storage -> Ray API server +--------------------- +Aggregation -> Ray API server (currently a dashboard server) +--------------------- +Usage data collection -> Various components (Ray agent, GCS, etc.) + usage_lib (cluster metadata). + +Usage report is currently "off by default". You can enable the report by setting an environment variable +RAY_USAGE_STATS_ENABLED=1. For example, `RAY_USAGE_STATS_ENABLED=1 ray start --head`. +Or `RAY_USAGE_STATS_ENABLED=1 python [drivers with ray.init()]`. + +"Ray API server (currently a dashboard server)" reports the usage data to https://usage-stats.ray.io/. + +Data is reported every hour by default. + +Note that it is also possible to configure the interval using the environment variable, +`RAY_USAGE_STATS_REPORT_INTERVAL_S`. + +To see collected/reported data, see `usage_stats.json` inside a temp +folder (e.g., /tmp/ray/session_[id]/*). 
+""" +import json +import logging +import threading +import os +import platform +import sys +import time +from dataclasses import asdict, dataclass +from enum import Enum, auto +from pathlib import Path +from typing import Dict, List, Optional, Set + +import requests +import yaml + +import ray +from ray._raylet import GcsClient +import ray._private.ray_constants as ray_constants +import ray._private.usage.usage_constants as usage_constant +from ray.experimental.internal_kv import ( + _internal_kv_initialized, + _internal_kv_put, +) +from ray.core.generated import usage_pb2, gcs_pb2 + +logger = logging.getLogger(__name__) +TagKey = usage_pb2.TagKey + +################# +# Internal APIs # +################# + + +@dataclass(init=True) +class ClusterConfigToReport: + cloud_provider: Optional[str] = None + min_workers: Optional[int] = None + max_workers: Optional[int] = None + head_node_instance_type: Optional[str] = None + worker_node_instance_types: Optional[List[str]] = None + + +@dataclass(init=True) +class ClusterStatusToReport: + total_num_cpus: Optional[int] = None + total_num_gpus: Optional[int] = None + total_memory_gb: Optional[float] = None + total_object_store_memory_gb: Optional[float] = None + + +@dataclass(init=True) +class UsageStatsToReport: + """Usage stats to report""" + + #: The schema version of the report. + schema_version: str + #: The source of the data (i.e. OSS). + source: str + #: When the data is collected and reported. + collect_timestamp_ms: int + #: The total number of successful reports for the lifetime of the cluster. + total_success: Optional[int] = None + #: The total number of failed reports for the lifetime of the cluster. + total_failed: Optional[int] = None + #: The sequence number of the report. + seq_number: Optional[int] = None + #: The Ray version in use. + ray_version: Optional[str] = None + #: The Python version in use. + python_version: Optional[str] = None + #: A random id of the cluster session. 
+ session_id: Optional[str] = None + #: The git commit hash of Ray (i.e. ray.__commit__). + git_commit: Optional[str] = None + #: The operating system in use. + os: Optional[str] = None + #: When the cluster is started. + session_start_timestamp_ms: Optional[int] = None + #: The cloud provider found in the cluster.yaml file (e.g., aws). + cloud_provider: Optional[str] = None + #: The min_workers found in the cluster.yaml file. + min_workers: Optional[int] = None + #: The max_workers found in the cluster.yaml file. + max_workers: Optional[int] = None + #: The head node instance type found in the cluster.yaml file (e.g., i3.8xlarge). + head_node_instance_type: Optional[str] = None + #: The worker node instance types found in the cluster.yaml file (e.g., i3.8xlarge). + worker_node_instance_types: Optional[List[str]] = None + #: The total num of cpus in the cluster. + total_num_cpus: Optional[int] = None + #: The total num of gpus in the cluster. + total_num_gpus: Optional[int] = None + #: The total size of memory in the cluster. + total_memory_gb: Optional[float] = None + #: The total size of object store memory in the cluster. + total_object_store_memory_gb: Optional[float] = None + #: The Ray libraries that are used (e.g., rllib). + library_usages: Optional[List[str]] = None + #: The extra tags to report when specified by an + # environment variable RAY_USAGE_STATS_EXTRA_TAGS + extra_usage_tags: Optional[Dict[str, str]] = None + #: The number of alive nodes when the report is generated. + total_num_nodes: Optional[int] = None + #: The total number of running jobs excluding internal ones + # when the report is generated. + total_num_running_jobs: Optional[int] = None + #: The libc version in the OS. + libc_version: Optional[str] = None + #: The hardwares that are used (e.g. Intel Xeon). 
+ hardware_usages: Optional[List[str]] = None + + +@dataclass(init=True) +class UsageStatsToWrite: + """Usage stats to write to `USAGE_STATS_FILE` + + We are writing extra metadata such as the status of report + to this file. + """ + + usage_stats: UsageStatsToReport + # Whether or not the last report succeeded. + success: bool + # The error message of the last report if it happens. + error: str + + +class UsageStatsEnabledness(Enum): + ENABLED_EXPLICITLY = auto() + DISABLED_EXPLICITLY = auto() + ENABLED_BY_DEFAULT = auto() + + +_recorded_library_usages = set() +_recorded_library_usages_lock = threading.Lock() +_recorded_extra_usage_tags = dict() +_recorded_extra_usage_tags_lock = threading.Lock() + + +def _add_to_usage_set(set_name: str, value: str): + assert _internal_kv_initialized() + try: + _internal_kv_put( + f"{set_name}{value}".encode(), + b"", + namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(), + ) + except Exception as e: + logger.debug(f"Failed to add {value} to usage set {set_name}, {e}") + + +def _get_usage_set(gcs_client, set_name: str) -> Set[str]: + try: + result = set() + usages = gcs_client.internal_kv_keys( + set_name.encode(), + namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(), + ) + for usage in usages: + usage = usage.decode("utf-8") + result.add(usage[len(set_name) :]) + + return result + except Exception as e: + logger.debug(f"Failed to get usage set {set_name}, {e}") + return set() + + +def _put_library_usage(library_usage: str): + _add_to_usage_set(usage_constant.LIBRARY_USAGE_SET_NAME, library_usage) + + +def _put_hardware_usage(hardware_usage: str): + _add_to_usage_set(usage_constant.HARDWARE_USAGE_SET_NAME, hardware_usage) + + +def record_extra_usage_tag( + key: TagKey, value: str, gcs_client: Optional[GcsClient] = None +): + """Record extra kv usage tag. + + If the key already exists, the value will be overwritten. + + To record an extra tag, first add the key to the TagKey enum and + then call this function. 
+ It will make a synchronous call to the internal kv store if the tag is updated. + + Args: + key: The key of the tag. + value: The value of the tag. + gcs_client: The GCS client to perform KV operation PUT. Defaults to None. + When None, it will try to get the global client from the internal_kv. + """ + key = TagKey.Name(key).lower() + with _recorded_extra_usage_tags_lock: + if _recorded_extra_usage_tags.get(key) == value: + return + _recorded_extra_usage_tags[key] = value + + if not _internal_kv_initialized() and gcs_client is None: + # This happens if the record is before ray.init and + # no GCS client is used for recording explicitly. + return + + _put_extra_usage_tag(key, value, gcs_client) + + +def _put_extra_usage_tag(key: str, value: str, gcs_client: Optional[GcsClient] = None): + try: + key = f"{usage_constant.EXTRA_USAGE_TAG_PREFIX}{key}".encode() + val = value.encode() + namespace = usage_constant.USAGE_STATS_NAMESPACE.encode() + if gcs_client is not None: + # Use the GCS client. + gcs_client.internal_kv_put(key, val, namespace=namespace) + else: + # Use internal kv. + assert _internal_kv_initialized() + _internal_kv_put(key, val, namespace=namespace) + except Exception as e: + logger.debug(f"Failed to put extra usage tag, {e}") + + +def record_hardware_usage(hardware_usage: str): + """Record hardware usage (e.g. which CPU model is used)""" + assert _internal_kv_initialized() + _put_hardware_usage(hardware_usage) + + +def record_library_usage(library_usage: str): + """Record library usage (e.g. which library is used)""" + with _recorded_library_usages_lock: + if library_usage in _recorded_library_usages: + return + _recorded_library_usages.add(library_usage) + + if not _internal_kv_initialized(): + # This happens if the library is imported before ray.init + return + + # Only report lib usage for driver / ray client / workers. Otherwise, + # it can be reported if the library is imported from + # e.g., API server. 
+ if ( + ray._private.worker.global_worker.mode == ray.SCRIPT_MODE + or ray._private.worker.global_worker.mode == ray.WORKER_MODE + or ray.util.client.ray.is_connected() + ): + _put_library_usage(library_usage) + + +def _put_pre_init_library_usages(): + assert _internal_kv_initialized() + # NOTE: When the lib is imported from a worker, ray should + # always be initialized, so there's no need to register the + # pre init hook. + if not ( + ray._private.worker.global_worker.mode == ray.SCRIPT_MODE + or ray.util.client.ray.is_connected() + ): + return + + for library_usage in _recorded_library_usages: + _put_library_usage(library_usage) + + +def _put_pre_init_extra_usage_tags(): + assert _internal_kv_initialized() + for k, v in _recorded_extra_usage_tags.items(): + _put_extra_usage_tag(k, v) + + +def put_pre_init_usage_stats(): + _put_pre_init_library_usages() + _put_pre_init_extra_usage_tags() + + +def reset_global_state(): + global _recorded_library_usages, _recorded_extra_usage_tags + + with _recorded_library_usages_lock: + _recorded_library_usages = set() + with _recorded_extra_usage_tags_lock: + _recorded_extra_usage_tags = dict() + + +ray._private.worker._post_init_hooks.append(put_pre_init_usage_stats) + + +def _usage_stats_report_url(): + # The usage collection server URL. + # The environment variable is testing-purpose only. + return os.getenv("RAY_USAGE_STATS_REPORT_URL", "https://usage-stats.ray.io/") + + +def _usage_stats_report_interval_s(): + return int(os.getenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", 3600)) + + +def _usage_stats_config_path(): + return os.getenv( + "RAY_USAGE_STATS_CONFIG_PATH", os.path.expanduser("~/.ray/config.json") + ) + + +def _usage_stats_enabledness() -> UsageStatsEnabledness: + # Env var has higher priority than config file. 
+ usage_stats_enabled_env_var = os.getenv(usage_constant.USAGE_STATS_ENABLED_ENV_VAR) + if usage_stats_enabled_env_var == "0": + return UsageStatsEnabledness.DISABLED_EXPLICITLY + elif usage_stats_enabled_env_var == "1": + return UsageStatsEnabledness.ENABLED_EXPLICITLY + elif usage_stats_enabled_env_var is not None: + raise ValueError( + f"Valid value for {usage_constant.USAGE_STATS_ENABLED_ENV_VAR} " + f"env var is 0 or 1, but got {usage_stats_enabled_env_var}" + ) + + usage_stats_enabled_config_var = None + try: + with open(_usage_stats_config_path()) as f: + config = json.load(f) + usage_stats_enabled_config_var = config.get("usage_stats") + except FileNotFoundError: + pass + except Exception as e: + logger.debug(f"Failed to load usage stats config {e}") + + if usage_stats_enabled_config_var is False: + return UsageStatsEnabledness.DISABLED_EXPLICITLY + elif usage_stats_enabled_config_var is True: + return UsageStatsEnabledness.ENABLED_EXPLICITLY + elif usage_stats_enabled_config_var is not None: + raise ValueError( + f"Valid value for 'usage_stats' in {_usage_stats_config_path()}" + f" is true or false, but got {usage_stats_enabled_config_var}" + ) + + # Usage stats is enabled by default. + return UsageStatsEnabledness.ENABLED_BY_DEFAULT + + +def is_nightly_wheel() -> bool: + return ray.__commit__ != "{{RAY_COMMIT_SHA}}" and "dev" in ray.__version__ + + +def usage_stats_enabled() -> bool: + return _usage_stats_enabledness() is not UsageStatsEnabledness.DISABLED_EXPLICITLY + + +def usage_stats_prompt_enabled(): + return int(os.getenv("RAY_USAGE_STATS_PROMPT_ENABLED", "1")) == 1 + + +def _generate_cluster_metadata(*, ray_init_cluster: bool): + """Return a dictionary of cluster metadata. + + Params: + ray_init_cluster: Whether the cluster is started by ray.init() + """ + ray_version, python_version = ray._private.utils.compute_version_info() + # These two metadata is necessary although usage report is not enabled + # to check version compatibility. 
+ metadata = { + "ray_version": ray_version, + "python_version": python_version, + "ray_init_cluster": ray_init_cluster, + } + # Additional metadata is recorded only when usage stats are enabled. + if usage_stats_enabled(): + metadata.update( + { + "git_commit": ray.__commit__, + "os": sys.platform, + "session_start_timestamp_ms": int(time.time() * 1000), + } + ) + if sys.platform == "linux": + # Record llibc version + (lib, ver) = platform.libc_ver() + if not lib: + metadata.update({"libc_version": "NA"}) + else: + metadata.update({"libc_version": f"{lib}:{ver}"}) + return metadata + + +def show_usage_stats_prompt(cli: bool) -> None: + if not usage_stats_prompt_enabled(): + return + + from ray.autoscaler._private.cli_logger import cli_logger + + prompt_print = cli_logger.print if cli else print + + usage_stats_enabledness = _usage_stats_enabledness() + if usage_stats_enabledness is UsageStatsEnabledness.DISABLED_EXPLICITLY: + prompt_print(usage_constant.USAGE_STATS_DISABLED_MESSAGE) + elif usage_stats_enabledness is UsageStatsEnabledness.ENABLED_BY_DEFAULT: + if not cli: + prompt_print( + usage_constant.USAGE_STATS_ENABLED_BY_DEFAULT_FOR_RAY_INIT_MESSAGE + ) + elif cli_logger.interactive: + enabled = cli_logger.confirm( + False, + usage_constant.USAGE_STATS_CONFIRMATION_MESSAGE, + _default=True, + _timeout_s=10, + ) + set_usage_stats_enabled_via_env_var(enabled) + # Remember user's choice. 
+ try: + set_usage_stats_enabled_via_config(enabled) + except Exception as e: + logger.debug( + f"Failed to persist usage stats choice for future clusters: {e}" + ) + if enabled: + prompt_print(usage_constant.USAGE_STATS_ENABLED_FOR_CLI_MESSAGE) + else: + prompt_print(usage_constant.USAGE_STATS_DISABLED_MESSAGE) + else: + prompt_print( + usage_constant.USAGE_STATS_ENABLED_BY_DEFAULT_FOR_CLI_MESSAGE, + ) + else: + assert usage_stats_enabledness is UsageStatsEnabledness.ENABLED_EXPLICITLY + prompt_print( + usage_constant.USAGE_STATS_ENABLED_FOR_CLI_MESSAGE + if cli + else usage_constant.USAGE_STATS_ENABLED_FOR_RAY_INIT_MESSAGE + ) + + +def set_usage_stats_enabled_via_config(enabled) -> None: + config = {} + try: + with open(_usage_stats_config_path()) as f: + config = json.load(f) + if not isinstance(config, dict): + logger.debug( + f"Invalid ray config file, should be a json dict but got {type(config)}" + ) + config = {} + except FileNotFoundError: + pass + except Exception as e: + logger.debug(f"Failed to load ray config file {e}") + + config["usage_stats"] = enabled + + try: + os.makedirs(os.path.dirname(_usage_stats_config_path()), exist_ok=True) + with open(_usage_stats_config_path(), "w") as f: + json.dump(config, f) + except Exception as e: + raise Exception( + "Failed to " + f'{"enable" if enabled else "disable"}' + ' usage stats by writing {"usage_stats": ' + f'{"true" if enabled else "false"}' + "} to " + f"{_usage_stats_config_path()}" + ) from e + + +def set_usage_stats_enabled_via_env_var(enabled) -> None: + os.environ[usage_constant.USAGE_STATS_ENABLED_ENV_VAR] = "1" if enabled else "0" + + +def put_cluster_metadata(gcs_client, *, ray_init_cluster) -> None: + """Generate the cluster metadata and store it to GCS. + + It is a blocking API. + + Params: + gcs_client: The GCS client to perform KV operation PUT. + ray_init_cluster: Whether the cluster is started by ray.init() + + Raises: + gRPC exceptions if PUT fails. 
+ """ + metadata = _generate_cluster_metadata(ray_init_cluster=ray_init_cluster) + gcs_client.internal_kv_put( + usage_constant.CLUSTER_METADATA_KEY, + json.dumps(metadata).encode(), + overwrite=True, + namespace=ray_constants.KV_NAMESPACE_CLUSTER, + ) + return metadata + + +def get_total_num_running_jobs_to_report(gcs_client) -> Optional[int]: + """Return the total number of running jobs in the cluster excluding internal ones""" + try: + result = gcs_client.get_all_job_info( + skip_submission_job_info_field=True, skip_is_running_tasks_field=True + ) + total_num_running_jobs = 0 + for job_info in result.values(): + if not job_info.is_dead and not job_info.config.ray_namespace.startswith( + "_ray_internal" + ): + total_num_running_jobs += 1 + return total_num_running_jobs + except Exception as e: + logger.info(f"Faile to query number of running jobs in the cluster: {e}") + return None + + +def get_total_num_nodes_to_report(gcs_client, timeout=None) -> Optional[int]: + """Return the total number of alive nodes in the cluster""" + try: + result = gcs_client.get_all_node_info(timeout=timeout) + total_num_nodes = 0 + for node_id, node_info in result.items(): + if node_info.state == gcs_pb2.GcsNodeInfo.GcsNodeState.ALIVE: + total_num_nodes += 1 + return total_num_nodes + except Exception as e: + logger.info(f"Faile to query number of nodes in the cluster: {e}") + return None + + +def get_library_usages_to_report(gcs_client) -> List[str]: + return list(_get_usage_set(gcs_client, usage_constant.LIBRARY_USAGE_SET_NAME)) + + +def get_hardware_usages_to_report(gcs_client) -> List[str]: + return list(_get_usage_set(gcs_client, usage_constant.HARDWARE_USAGE_SET_NAME)) + + +def get_extra_usage_tags_to_report(gcs_client) -> Dict[str, str]: + """Get the extra usage tags from env var and gcs kv store. + + The env var should be given this way; key=value;key=value. + If parsing is failed, it will return the empty data. + + Returns: + Extra usage tags as kv pairs. 
+ """ + extra_usage_tags = dict() + + extra_usage_tags_env_var = os.getenv("RAY_USAGE_STATS_EXTRA_TAGS", None) + if extra_usage_tags_env_var: + try: + kvs = extra_usage_tags_env_var.strip(";").split(";") + for kv in kvs: + k, v = kv.split("=") + extra_usage_tags[k] = v + except Exception as e: + logger.info(f"Failed to parse extra usage tags env var. Error: {e}") + + valid_tag_keys = [tag_key.lower() for tag_key in TagKey.keys()] + try: + keys = gcs_client.internal_kv_keys( + usage_constant.EXTRA_USAGE_TAG_PREFIX.encode(), + namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(), + ) + for key in keys: + value = gcs_client.internal_kv_get( + key, namespace=usage_constant.USAGE_STATS_NAMESPACE.encode() + ) + key = key.decode("utf-8") + key = key[len(usage_constant.EXTRA_USAGE_TAG_PREFIX) :] + assert key in valid_tag_keys + extra_usage_tags[key] = value.decode("utf-8") + except Exception as e: + logger.info(f"Failed to get extra usage tags from kv store {e}") + return extra_usage_tags + + +def _get_cluster_status_to_report_v2(gcs_client) -> ClusterStatusToReport: + """ + Get the current status of this cluster. A temporary proxy for the + autoscaler v2 API. + + It is a blocking API. + + Params: + gcs_client: The GCS client. + + Returns: + The current cluster status or empty ClusterStatusToReport + if it fails to get that information. 
+ """ + from ray.autoscaler.v2.sdk import get_cluster_status + + result = ClusterStatusToReport() + try: + cluster_status = get_cluster_status(gcs_client.address) + total_resources = cluster_status.total_resources() + result.total_num_cpus = int(total_resources.get("CPU", 0)) + result.total_num_gpus = int(total_resources.get("GPU", 0)) + + to_GiB = 1 / 2**30 + result.total_memory_gb = total_resources.get("memory", 0) * to_GiB + result.total_object_store_memory_gb = ( + total_resources.get("object_store_memory", 0) * to_GiB + ) + except Exception as e: + logger.info(f"Failed to get cluster status to report {e}") + finally: + return result + + +def get_cluster_status_to_report(gcs_client) -> ClusterStatusToReport: + """Get the current status of this cluster. + + It is a blocking API. + + Params: + gcs_client: The GCS client to perform KV operation GET. + + Returns: + The current cluster status or empty if it fails to get that information. + """ + try: + + from ray.autoscaler.v2.utils import is_autoscaler_v2 + + if is_autoscaler_v2(): + return _get_cluster_status_to_report_v2(gcs_client) + + cluster_status = gcs_client.internal_kv_get( + ray._private.ray_constants.DEBUG_AUTOSCALING_STATUS.encode(), + namespace=None, + ) + if not cluster_status: + return ClusterStatusToReport() + + result = ClusterStatusToReport() + to_GiB = 1 / 2**30 + cluster_status = json.loads(cluster_status.decode("utf-8")) + if ( + "load_metrics_report" not in cluster_status + or "usage" not in cluster_status["load_metrics_report"] + ): + return ClusterStatusToReport() + + usage = cluster_status["load_metrics_report"]["usage"] + # usage is a map from resource to (used, total) pair + if "CPU" in usage: + result.total_num_cpus = int(usage["CPU"][1]) + if "GPU" in usage: + result.total_num_gpus = int(usage["GPU"][1]) + if "memory" in usage: + result.total_memory_gb = usage["memory"][1] * to_GiB + if "object_store_memory" in usage: + result.total_object_store_memory_gb = ( + 
usage["object_store_memory"][1] * to_GiB + ) + return result + except Exception as e: + logger.info(f"Failed to get cluster status to report {e}") + return ClusterStatusToReport() + + +def get_cluster_config_to_report( + cluster_config_file_path: str, +) -> ClusterConfigToReport: + """Get the static cluster (autoscaler) config used to launch this cluster. + + Params: + cluster_config_file_path: The file path to the cluster config file. + + Returns: + The cluster (autoscaler) config or empty if it fails to get that information. + """ + + def get_instance_type(node_config): + if not node_config: + return None + if "InstanceType" in node_config: + # aws + return node_config["InstanceType"] + if "machineType" in node_config: + # gcp + return node_config["machineType"] + if ( + "azure_arm_parameters" in node_config + and "vmSize" in node_config["azure_arm_parameters"] + ): + return node_config["azure_arm_parameters"]["vmSize"] + return None + + try: + with open(cluster_config_file_path) as f: + config = yaml.safe_load(f) + result = ClusterConfigToReport() + if "min_workers" in config: + result.min_workers = config["min_workers"] + if "max_workers" in config: + result.max_workers = config["max_workers"] + + if "provider" in config and "type" in config["provider"]: + result.cloud_provider = config["provider"]["type"] + + if "head_node_type" not in config: + return result + if "available_node_types" not in config: + return result + head_node_type = config["head_node_type"] + available_node_types = config["available_node_types"] + for available_node_type in available_node_types: + if available_node_type == head_node_type: + head_node_instance_type = get_instance_type( + available_node_types[available_node_type].get("node_config") + ) + if head_node_instance_type: + result.head_node_instance_type = head_node_instance_type + else: + worker_node_instance_type = get_instance_type( + available_node_types[available_node_type].get("node_config") + ) + if worker_node_instance_type: 
+ result.worker_node_instance_types = ( + result.worker_node_instance_types or set() + ) + result.worker_node_instance_types.add(worker_node_instance_type) + if result.worker_node_instance_types: + result.worker_node_instance_types = list( + result.worker_node_instance_types + ) + return result + except FileNotFoundError: + # It's a manually started cluster or k8s cluster + result = ClusterConfigToReport() + # Check if we're on Kubernetes + if usage_constant.KUBERNETES_SERVICE_HOST_ENV in os.environ: + # Check if we're using KubeRay >= 0.4.0. + if usage_constant.KUBERAY_ENV in os.environ: + result.cloud_provider = usage_constant.PROVIDER_KUBERAY + # Else, we're on Kubernetes but not in either of the above categories. + else: + result.cloud_provider = usage_constant.PROVIDER_KUBERNETES_GENERIC + return result + except Exception as e: + logger.info(f"Failed to get cluster config to report {e}") + return ClusterConfigToReport() + + +def get_cluster_metadata(gcs_client) -> dict: + """Get the cluster metadata from GCS. + + It is a blocking API. + + This will return None if `put_cluster_metadata` was never called. + + Params: + gcs_client: The GCS client to perform KV operation GET. + + Returns: + The cluster metadata in a dictinoary. + + Raises: + RuntimeError if it fails to obtain cluster metadata from GCS. 
+ """ + return json.loads( + gcs_client.internal_kv_get( + usage_constant.CLUSTER_METADATA_KEY, + namespace=ray_constants.KV_NAMESPACE_CLUSTER, + ).decode("utf-8") + ) + + +def is_ray_init_cluster(gcs_client: ray._raylet.GcsClient) -> bool: + """Return whether the cluster is started by ray.init()""" + cluster_metadata = get_cluster_metadata(gcs_client) + return cluster_metadata["ray_init_cluster"] + + +def generate_disabled_report_data() -> UsageStatsToReport: + """Generate the report data indicating usage stats is disabled""" + data = UsageStatsToReport( + schema_version=usage_constant.SCHEMA_VERSION, + source=os.getenv( + usage_constant.USAGE_STATS_SOURCE_ENV_VAR, + usage_constant.USAGE_STATS_SOURCE_OSS, + ), + collect_timestamp_ms=int(time.time() * 1000), + ) + return data + + +def generate_report_data( + cluster_config_to_report: ClusterConfigToReport, + total_success: int, + total_failed: int, + seq_number: int, + gcs_address: str, + cluster_id: str, +) -> UsageStatsToReport: + """Generate the report data. + + Params: + cluster_config_to_report: The cluster (autoscaler) + config generated by `get_cluster_config_to_report`. + total_success: The total number of successful report + for the lifetime of the cluster. + total_failed: The total number of failed report + for the lifetime of the cluster. + seq_number: The sequence number that's incremented whenever + a new report is sent. + gcs_address: the address of gcs to get data to report. + cluster_id: hex id of the cluster. 
+ + Returns: + UsageStats + """ + assert cluster_id + + gcs_client = ray._raylet.GcsClient( + address=gcs_address, nums_reconnect_retry=20, cluster_id=cluster_id + ) + + cluster_metadata = get_cluster_metadata(gcs_client) + cluster_status_to_report = get_cluster_status_to_report(gcs_client) + + data = UsageStatsToReport( + schema_version=usage_constant.SCHEMA_VERSION, + source=os.getenv( + usage_constant.USAGE_STATS_SOURCE_ENV_VAR, + usage_constant.USAGE_STATS_SOURCE_OSS, + ), + collect_timestamp_ms=int(time.time() * 1000), + total_success=total_success, + total_failed=total_failed, + seq_number=seq_number, + ray_version=cluster_metadata["ray_version"], + python_version=cluster_metadata["python_version"], + session_id=cluster_id, + git_commit=cluster_metadata["git_commit"], + os=cluster_metadata["os"], + session_start_timestamp_ms=cluster_metadata["session_start_timestamp_ms"], + cloud_provider=cluster_config_to_report.cloud_provider, + min_workers=cluster_config_to_report.min_workers, + max_workers=cluster_config_to_report.max_workers, + head_node_instance_type=cluster_config_to_report.head_node_instance_type, + worker_node_instance_types=cluster_config_to_report.worker_node_instance_types, + total_num_cpus=cluster_status_to_report.total_num_cpus, + total_num_gpus=cluster_status_to_report.total_num_gpus, + total_memory_gb=cluster_status_to_report.total_memory_gb, + total_object_store_memory_gb=cluster_status_to_report.total_object_store_memory_gb, # noqa: E501 + library_usages=get_library_usages_to_report(gcs_client), + extra_usage_tags=get_extra_usage_tags_to_report(gcs_client), + total_num_nodes=get_total_num_nodes_to_report(gcs_client), + total_num_running_jobs=get_total_num_running_jobs_to_report(gcs_client), + libc_version=cluster_metadata.get("libc_version"), + hardware_usages=get_hardware_usages_to_report(gcs_client), + ) + return data + + +def generate_write_data( + usage_stats: UsageStatsToReport, + error: str, +) -> UsageStatsToWrite: + """Generate the 
report data. + + Params: + usage_stats: The usage stats that were reported. + error: The error message of failed reports. + + Returns: + UsageStatsToWrite + """ + data = UsageStatsToWrite( + usage_stats=usage_stats, + success=error is None, + error=error, + ) + return data + + +class UsageReportClient: + """The client implementation for usage report. + + It is in charge of writing usage stats to the directory + and report usage stats. + """ + + def write_usage_data(self, data: UsageStatsToWrite, dir_path: str) -> None: + """Write the usage data to the directory. + + Params: + data: Data to report + dir_path: The path to the directory to write usage data. + """ + # Atomically update the file. + dir_path = Path(dir_path) + destination = dir_path / usage_constant.USAGE_STATS_FILE + temp = dir_path / f"{usage_constant.USAGE_STATS_FILE}.tmp" + with temp.open(mode="w") as json_file: + json_file.write(json.dumps(asdict(data))) + if sys.platform == "win32": + # Windows 32 doesn't support atomic renaming, so we should delete + # the file first. + destination.unlink(missing_ok=True) + temp.rename(destination) + + def report_usage_data(self, url: str, data: UsageStatsToReport) -> None: + """Report the usage data to the usage server. + + Params: + url: The URL to update resource usage. + data: Data to report. + + Raises: + requests.HTTPError if requests fails. 
+ """ + r = requests.request( + "POST", + url, + headers={"Content-Type": "application/json"}, + json=asdict(data), + timeout=10, + ) + r.raise_for_status() + return r diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/__init__.py b/.venv/lib/python3.11/site-packages/ray/_private/workers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c8bb2fdec1eea3ab8e03a2b416d3eb512a41c63 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/default_worker.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/default_worker.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bba94e86d3ddba7ed786bca987d407ecd3205b68 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/default_worker.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/setup_worker.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/setup_worker.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ac7a0da7e2d0f6b0347325c02436d6ae114e6f1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/setup_worker.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py b/.venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py new file mode 100644 index 
0000000000000000000000000000000000000000..2a6ffe2bcce9e18ff01c77eb4a6dc9978016a09e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py @@ -0,0 +1,304 @@ +import os +import argparse +import base64 +import json +import time + +import ray +import ray._private.node +import ray._private.ray_constants as ray_constants +import ray._private.utils +import ray.actor +from ray._private.async_compat import try_install_uvloop +from ray._private.parameter import RayParams +from ray._private.ray_logging import configure_log_file, get_worker_log_file_name +from ray._private.runtime_env.setup_hook import load_and_execute_setup_hook + +parser = argparse.ArgumentParser( + description=("Parse addresses for the worker to connect to.") +) +parser.add_argument( + "--cluster-id", + required=True, + type=str, + help="the auto-generated ID of the cluster", +) +parser.add_argument( + "--node-id", + required=True, + type=str, + help="the auto-generated ID of the node", +) +parser.add_argument( + "--node-ip-address", + required=True, + type=str, + help="the ip address of the worker's node", +) +parser.add_argument( + "--node-manager-port", required=True, type=int, help="the port of the worker's node" +) +parser.add_argument( + "--raylet-ip-address", + required=False, + type=str, + default=None, + help="the ip address of the worker's raylet", +) +parser.add_argument( + "--redis-address", required=True, type=str, help="the address to use for Redis" +) +parser.add_argument( + "--gcs-address", required=True, type=str, help="the address to use for GCS" +) +parser.add_argument( + "--redis-username", + required=False, + type=str, + default=None, + help="the username to use for Redis", +) +parser.add_argument( + "--redis-password", + required=False, + type=str, + default=None, + help="the password to use for Redis", +) +parser.add_argument( + "--object-store-name", required=True, type=str, help="the object store's name" +) +parser.add_argument("--raylet-name", 
required=False, type=str, help="the raylet's name") +parser.add_argument( + "--logging-level", + required=False, + type=str, + default=ray_constants.LOGGER_LEVEL, + choices=ray_constants.LOGGER_LEVEL_CHOICES, + help=ray_constants.LOGGER_LEVEL_HELP, +) +parser.add_argument( + "--logging-format", + required=False, + type=str, + default=ray_constants.LOGGER_FORMAT, + help=ray_constants.LOGGER_FORMAT_HELP, +) +parser.add_argument( + "--temp-dir", + required=False, + type=str, + default=None, + help="Specify the path of the temporary directory use by Ray process.", +) +parser.add_argument( + "--storage", + required=False, + type=str, + default=None, + help="Specify the persistent storage path.", +) +parser.add_argument( + "--load-code-from-local", + default=False, + action="store_true", + help="True if code is loaded from local files, as opposed to the GCS.", +) +parser.add_argument( + "--worker-type", + required=False, + type=str, + default="WORKER", + help="Specify the type of the worker process", +) +parser.add_argument( + "--metrics-agent-port", + required=True, + type=int, + help="the port of the node's metric agent.", +) +parser.add_argument( + "--runtime-env-agent-port", + required=True, + type=int, + default=None, + help="The port on which the runtime env agent receives HTTP requests.", +) +parser.add_argument( + "--object-spilling-config", + required=False, + type=str, + default="", + help="The configuration of object spilling. 
Only used by I/O workers.", +) +parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.", +) +parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.", +) +parser.add_argument( + "--runtime-env-hash", + required=False, + type=int, + default=0, + help="The computed hash of the runtime env for this worker.", +) +parser.add_argument( + "--startup-token", + required=True, + type=int, + help="The startup token assigned to this worker process by the raylet.", +) +parser.add_argument( + "--ray-debugger-external", + default=False, + action="store_true", + help="True if Ray debugger is made available externally.", +) +parser.add_argument("--session-name", required=False, help="The current session name") +parser.add_argument( + "--webui", + required=False, + help="The address of web ui", +) +parser.add_argument( + "--worker-launch-time-ms", + required=True, + type=int, + help="The time when raylet starts to launch the worker process.", +) + +parser.add_argument( + "--worker-preload-modules", + type=str, + required=False, + help=( + "A comma-separated list of Python module names " + "to import before accepting work." + ), +) + +if __name__ == "__main__": + # NOTE(sang): For some reason, if we move the code below + # to a separate function, tensorflow will capture that method + # as a step function. For more details, check out + # https://github.com/ray-project/ray/pull/12225#issue-525059663. 
+ args = parser.parse_args() + ray._private.ray_logging.setup_logger(args.logging_level, args.logging_format) + worker_launched_time_ms = time.time_ns() // 1e6 + if args.worker_type == "WORKER": + mode = ray.WORKER_MODE + elif args.worker_type == "SPILL_WORKER": + mode = ray.SPILL_WORKER_MODE + elif args.worker_type == "RESTORE_WORKER": + mode = ray.RESTORE_WORKER_MODE + else: + raise ValueError("Unknown worker type: " + args.worker_type) + + # Try installing uvloop as default event-loop implementation + # for asyncio + try_install_uvloop() + + raylet_ip_address = args.raylet_ip_address + if raylet_ip_address is None: + raylet_ip_address = args.node_ip_address + ray_params = RayParams( + node_ip_address=args.node_ip_address, + raylet_ip_address=raylet_ip_address, + node_manager_port=args.node_manager_port, + redis_address=args.redis_address, + redis_username=args.redis_username, + redis_password=args.redis_password, + plasma_store_socket_name=args.object_store_name, + raylet_socket_name=args.raylet_name, + temp_dir=args.temp_dir, + storage=args.storage, + metrics_agent_port=args.metrics_agent_port, + runtime_env_agent_port=args.runtime_env_agent_port, + gcs_address=args.gcs_address, + session_name=args.session_name, + webui=args.webui, + cluster_id=args.cluster_id, + node_id=args.node_id, + ) + node = ray._private.node.Node( + ray_params, + head=False, + shutdown_at_exit=False, + spawn_reaper=False, + connect_only=True, + default_worker=True, + ) + + # NOTE(suquark): We must initialize the external storage before we + # connect to raylet. Otherwise we may receive requests before the + # external storage is intialized. 
+ if mode == ray.RESTORE_WORKER_MODE or mode == ray.SPILL_WORKER_MODE: + from ray._private import external_storage, storage + + storage._init_storage(args.storage, is_head=False) + if args.object_spilling_config: + object_spilling_config = base64.b64decode(args.object_spilling_config) + object_spilling_config = json.loads(object_spilling_config) + else: + object_spilling_config = {} + external_storage.setup_external_storage( + object_spilling_config, node.node_id, node.session_name + ) + + ray._private.worker._global_node = node + ray._private.worker.connect( + node, + node.session_name, + mode=mode, + runtime_env_hash=args.runtime_env_hash, + startup_token=args.startup_token, + ray_debugger_external=args.ray_debugger_external, + worker_launch_time_ms=args.worker_launch_time_ms, + worker_launched_time_ms=worker_launched_time_ms, + ) + + worker = ray._private.worker.global_worker + + # Setup log file. + out_file, err_file = node.get_log_file_handles( + get_worker_log_file_name(args.worker_type) + ) + configure_log_file(out_file, err_file) + worker.set_out_file(out_file) + worker.set_err_file(err_file) + + if mode == ray.WORKER_MODE and args.worker_preload_modules: + module_names_to_import = args.worker_preload_modules.split(",") + ray._private.utils.try_import_each_module(module_names_to_import) + + # If the worker setup function is configured, run it. + worker_process_setup_hook_key = os.getenv( + ray_constants.WORKER_PROCESS_SETUP_HOOK_ENV_VAR + ) + if worker_process_setup_hook_key: + error = load_and_execute_setup_hook(worker_process_setup_hook_key) + if error is not None: + worker.core_worker.drain_and_exit_worker("system", error) + + if mode == ray.WORKER_MODE: + worker.main_loop() + elif mode in [ray.RESTORE_WORKER_MODE, ray.SPILL_WORKER_MODE]: + # It is handled by another thread in the C++ core worker. + # We just need to keep the worker alive. 
+ while True: + time.sleep(100000) + else: + raise ValueError(f"Unexcepted worker mode: {mode}") diff --git a/.venv/lib/python3.11/site-packages/ray/_private/workers/setup_worker.py b/.venv/lib/python3.11/site-packages/ray/_private/workers/setup_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..23ba980a5bb22b2ce648c6df45f8ba0b7e73f3dd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/_private/workers/setup_worker.py @@ -0,0 +1,33 @@ +import argparse +import logging + +from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL +from ray._private.ray_logging import setup_logger +from ray._private.runtime_env.context import RuntimeEnvContext +from ray.core.generated.common_pb2 import Language + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser( + description=("Set up the environment for a Ray worker and launch the worker.") +) + +parser.add_argument( + "--serialized-runtime-env-context", + type=str, + help="the serialized runtime env context", +) + +parser.add_argument("--language", type=str, help="the language type of the worker") + + +if __name__ == "__main__": + setup_logger(LOGGER_LEVEL, LOGGER_FORMAT) + args, remaining_args = parser.parse_known_args() + # NOTE(edoakes): args.serialized_runtime_env_context is only None when + # we're starting the main Ray client proxy server. That case should + # probably not even go through this codepath. 
+ runtime_env_context = RuntimeEnvContext.deserialize( + args.serialized_runtime_env_context or "{}" + ) + runtime_env_context.exec_worker(remaining_args, Language.Value(args.language)) diff --git a/.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar b/.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar new file mode 100644 index 0000000000000000000000000000000000000000..564e8b27ef6ad6f49d2f032060c59b34b44837e5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3835fe29f363a67c05160a5c60634942abbd46720e587faad488cadebd2e8a +size 32364530 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f63b8173d43382c546dd1aaa1d09c316bd3ba846 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/__init__.py @@ -0,0 +1,55 @@ +import logging + +from ray._private.usage import usage_lib + +# Note: do not introduce unnecessary library dependencies here, e.g. gym. +# This file is imported from the tune module in order to register RLlib agents. 
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.policy.torch_policy import TorchPolicy
from ray.tune.registry import register_trainable


def _setup_logger():
    """Attach a timestamped stream handler to the "ray.rllib" logger.

    Propagation is disabled so RLlib records are not duplicated by
    handlers on ancestor loggers.
    """
    logger = logging.getLogger("ray.rllib")
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
        )
    )
    logger.addHandler(handler)
    logger.propagate = False


def _register_all():
    """Register all built-in RLlib algorithms (and test stubs) with Tune."""
    from ray.rllib.algorithms.registry import ALGORITHMS, _get_algorithm_class

    for key, get_trainable_class_and_config in ALGORITHMS.items():
        register_trainable(key, get_trainable_class_and_config()[0])

    # Internal, testing-only trainables.
    for key in ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
        register_trainable(key, _get_algorithm_class(key))


_setup_logger()

# Record that RLlib was imported, for Ray usage-stats reporting.
usage_lib.record_library_usage("rllib")

__all__ = [
    "Policy",
    "TFPolicy",
    "TorchPolicy",
    "RolloutWorker",
    "SampleBatch",
    "BaseEnv",
    "MultiAgentEnv",
    "VectorEnv",
    "ExternalEnv",
]
diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d6a2b3345f868a12fb219485ce11e62a511abecc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__init__.py @@ -0,0 +1,23 @@
from ray.rllib.execution.learner_thread import LearnerThread
from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread
from ray.rllib.execution.minibatch_buffer import MinibatchBuffer
from ray.rllib.execution.replay_ops import
SimpleReplayBuffer +from ray.rllib.execution.rollout_ops import ( + standardize_fields, + synchronous_parallel_sample, +) +from ray.rllib.execution.train_ops import ( + train_one_step, + multi_gpu_train_one_step, +) + +__all__ = [ + "multi_gpu_train_one_step", + "standardize_fields", + "synchronous_parallel_sample", + "train_one_step", + "LearnerThread", + "MultiGPULearnerThread", + "SimpleReplayBuffer", + "MinibatchBuffer", +] diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da6f73fc34cae24dae18449a56bced27d29d38be Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/learner_thread.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/learner_thread.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a376377f844c1fc961a994ad71eb3c5b8231f79 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/learner_thread.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/minibatch_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/minibatch_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..259582cdc5722e0f243499e9c3b047f5c44f8246 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/minibatch_buffer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/multi_gpu_learner_thread.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/multi_gpu_learner_thread.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7245376c1b1faca7fdda60ccfee10f1e8b06096c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/multi_gpu_learner_thread.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/replay_ops.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/replay_ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4210303bd111b8ccd0426703479e14c6c7e11b6c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/replay_ops.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/rollout_ops.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/rollout_ops.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2ccd2ffec7d2740546dc2259cc900de99e4ceb2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/rollout_ops.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/segment_tree.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/segment_tree.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34924828807bacdcaf3c13612b81f6950894b791 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/segment_tree.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/train_ops.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/train_ops.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..55169ab6428e0b0259ca12d38ff2c02fa0c95ec3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/train_ops.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__init__.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__pycache__/mixin_replay_buffer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__pycache__/mixin_replay_buffer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..23ee3f4322025372d8f2f53238d54379d513fa99 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__pycache__/mixin_replay_buffer.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/learner_thread.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/learner_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..49340a972c350acc6584d8274c87aab937198886 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/learner_thread.py @@ -0,0 +1,137 @@ +import copy +import queue +import threading +from typing import Dict, Optional + +from ray.util.timer import _Timer +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.execution.minibatch_buffer import MinibatchBuffer +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder, LEARNER_INFO +from ray.rllib.utils.metrics.window_stat import WindowStat +from ray.util.iter import _NextValueNotReady + +tf1, tf, tfv = try_import_tf() + + +@OldAPIStack +class LearnerThread(threading.Thread): + 
"""Background thread that updates the local model from sample trajectories. + + The learner thread communicates with the main thread through Queues. This + is needed since Ray operations can only be run on the main thread. In + addition, moving heavyweight gradient ops session runs off the main thread + improves overall throughput. + """ + + def __init__( + self, + local_worker: RolloutWorker, + minibatch_buffer_size: int, + num_sgd_iter: int, + learner_queue_size: int, + learner_queue_timeout: int, + ): + """Initialize the learner thread. + + Args: + local_worker: process local rollout worker holding + policies this thread will call learn_on_batch() on + minibatch_buffer_size: max number of train batches to store + in the minibatching buffer + num_sgd_iter: number of passes to learn on per train batch + learner_queue_size: max size of queue of inbound + train batches to this thread + learner_queue_timeout: raise an exception if the queue has + been empty for this long in seconds + """ + threading.Thread.__init__(self) + self.learner_queue_size = WindowStat("size", 50) + self.local_worker = local_worker + self.inqueue = queue.Queue(maxsize=learner_queue_size) + self.outqueue = queue.Queue() + self.minibatch_buffer = MinibatchBuffer( + inqueue=self.inqueue, + size=minibatch_buffer_size, + timeout=learner_queue_timeout, + num_passes=num_sgd_iter, + init_num_passes=num_sgd_iter, + ) + self.queue_timer = _Timer() + self.grad_timer = _Timer() + self.load_timer = _Timer() + self.load_wait_timer = _Timer() + self.daemon = True + self.policy_ids_updated = [] + self.learner_info = {} + self.stopped = False + self.num_steps = 0 + + def run(self) -> None: + # Switch on eager mode if configured. 
+ if self.local_worker.config.framework_str == "tf2": + tf1.enable_eager_execution() + while not self.stopped: + self.step() + + def step(self) -> Optional[_NextValueNotReady]: + with self.queue_timer: + try: + batch, _ = self.minibatch_buffer.get() + except queue.Empty: + return _NextValueNotReady() + with self.grad_timer: + # Use LearnerInfoBuilder as a unified way to build the final + # results dict from `learn_on_loaded_batch` call(s). + # This makes sure results dicts always have the same structure + # no matter the setup (multi-GPU, multi-agent, minibatch SGD, + # tf vs torch). + learner_info_builder = LearnerInfoBuilder(num_devices=1) + if self.local_worker.config.policy_states_are_swappable: + self.local_worker.lock() + multi_agent_results = self.local_worker.learn_on_batch(batch) + if self.local_worker.config.policy_states_are_swappable: + self.local_worker.unlock() + self.policy_ids_updated.extend(list(multi_agent_results.keys())) + for pid, results in multi_agent_results.items(): + learner_info_builder.add_learn_on_batch_results(results, pid) + self.learner_info = learner_info_builder.finalize() + + self.num_steps += 1 + # Put tuple: env-steps, agent-steps, and learner info into the queue. 
+ self.outqueue.put((batch.count, batch.agent_steps(), self.learner_info)) + self.learner_queue_size.push(self.inqueue.qsize()) + + def add_learner_metrics(self, result: Dict, overwrite_learner_info=True) -> Dict: + """Add internal metrics to a result dict.""" + + def timer_to_ms(timer): + return round(1000 * timer.mean, 3) + + if overwrite_learner_info: + result["info"].update( + { + "learner_queue": self.learner_queue_size.stats(), + LEARNER_INFO: copy.deepcopy(self.learner_info), + "timing_breakdown": { + "learner_grad_time_ms": timer_to_ms(self.grad_timer), + "learner_load_time_ms": timer_to_ms(self.load_timer), + "learner_load_wait_time_ms": timer_to_ms(self.load_wait_timer), + "learner_dequeue_time_ms": timer_to_ms(self.queue_timer), + }, + } + ) + else: + result["info"].update( + { + "learner_queue": self.learner_queue_size.stats(), + "timing_breakdown": { + "learner_grad_time_ms": timer_to_ms(self.grad_timer), + "learner_load_time_ms": timer_to_ms(self.load_timer), + "learner_load_wait_time_ms": timer_to_ms(self.load_wait_timer), + "learner_dequeue_time_ms": timer_to_ms(self.queue_timer), + }, + } + ) + return result diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/minibatch_buffer.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/minibatch_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..29b224ce2e30f81c2b825d63d3056d0f8bc4c595 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/minibatch_buffer.py @@ -0,0 +1,61 @@ +from typing import Any, Tuple +import queue + +from ray.rllib.utils.annotations import OldAPIStack + + +@OldAPIStack +class MinibatchBuffer: + """Ring buffer of recent data batches for minibatch SGD. + + This is for use with AsyncSamplesOptimizer. + """ + + def __init__( + self, + inqueue: queue.Queue, + size: int, + timeout: float, + num_passes: int, + init_num_passes: int = 1, + ): + """Initialize a minibatch buffer. 
+ + Args: + inqueue (queue.Queue): Queue to populate the internal ring buffer + from. + size: Max number of data items to buffer. + timeout: Queue timeout + num_passes: Max num times each data item should be emitted. + init_num_passes: Initial passes for each data item. + Maxiumum number of passes per item are increased to num_passes over + time. + """ + self.inqueue = inqueue + self.size = size + self.timeout = timeout + self.max_initial_ttl = num_passes + self.cur_initial_ttl = init_num_passes + self.buffers = [None] * size + self.ttl = [0] * size + self.idx = 0 + + def get(self) -> Tuple[Any, bool]: + """Get a new batch from the internal ring buffer. + + Returns: + buf: Data item saved from inqueue. + released: True if the item is now removed from the ring buffer. + """ + if self.ttl[self.idx] <= 0: + self.buffers[self.idx] = self.inqueue.get(timeout=self.timeout) + self.ttl[self.idx] = self.cur_initial_ttl + if self.cur_initial_ttl < self.max_initial_ttl: + self.cur_initial_ttl += 1 + buf = self.buffers[self.idx] + self.ttl[self.idx] -= 1 + released = self.ttl[self.idx] <= 0 + if released: + self.buffers[self.idx] = None + self.idx = (self.idx + 1) % len(self.buffers) + return buf, released diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/multi_gpu_learner_thread.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/multi_gpu_learner_thread.py new file mode 100644 index 0000000000000000000000000000000000000000..aacf797b32b8b65da6e49f74492b5684e56bb1b7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/multi_gpu_learner_thread.py @@ -0,0 +1,245 @@ +import logging +import queue +import threading + +from ray.util.timer import _Timer +from ray.rllib.execution.learner_thread import LearnerThread +from ray.rllib.execution.minibatch_buffer import MinibatchBuffer +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import OldAPIStack, override +from ray.rllib.utils.deprecation import 
deprecation_warning +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.evaluation.rollout_worker import RolloutWorker + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +@OldAPIStack +class MultiGPULearnerThread(LearnerThread): + """Learner that can use multiple GPUs and parallel loading. + + This class is used for async sampling algorithms. + + Example workflow: 2 GPUs and 3 multi-GPU tower stacks. + -> On each GPU, there are 3 slots for batches, indexed 0, 1, and 2. + + Workers collect data from env and push it into inqueue: + Workers -> (data) -> self.inqueue + + We also have two queues, indicating, which stacks are loaded and which + are not. + - idle_tower_stacks = [0, 1, 2] <- all 3 stacks are free at first. + - ready_tower_stacks = [] <- None of the 3 stacks is loaded with data. + + `ready_tower_stacks` is managed by `ready_tower_stacks_buffer` for + possible minibatch-SGD iterations per loaded batch (this avoids a reload + from CPU to GPU for each SGD iter). + + n _MultiGPULoaderThreads: self.inqueue -get()-> + policy.load_batch_into_buffer() -> ready_stacks = [0 ...] + + This thread: self.ready_tower_stacks_buffer -get()-> + policy.learn_on_loaded_batch() -> if SGD-iters done, + put stack index back in idle_tower_stacks queue. + """ + + def __init__( + self, + local_worker: RolloutWorker, + num_gpus: int = 1, + lr=None, # deprecated. + train_batch_size: int = 500, + num_multi_gpu_tower_stacks: int = 1, + num_sgd_iter: int = 1, + learner_queue_size: int = 16, + learner_queue_timeout: int = 300, + num_data_load_threads: int = 16, + _fake_gpus: bool = False, + # Deprecated arg, use + minibatch_buffer_size=None, + ): + """Initializes a MultiGPULearnerThread instance. + + Args: + local_worker: Local RolloutWorker holding + policies this thread will call `load_batch_into_buffer` and + `learn_on_loaded_batch` on. 
+ num_gpus: Number of GPUs to use for data-parallel SGD. + train_batch_size: Size of batches (minibatches if + `num_sgd_iter` > 1) to learn on. + num_multi_gpu_tower_stacks: Number of buffers to parallelly + load data into on one device. Each buffer is of size of + `train_batch_size` and hence increases GPU memory usage + accordingly. + num_sgd_iter: Number of passes to learn on per train batch + (minibatch if `num_sgd_iter` > 1). + learner_queue_size: Max size of queue of inbound + train batches to this thread. + num_data_load_threads: Number of threads to use to load + data into GPU memory in parallel. + """ + # Deprecated: No need to specify as we don't need the actual + # minibatch-buffer anyways. + if minibatch_buffer_size: + deprecation_warning( + old="MultiGPULearnerThread.minibatch_buffer_size", + error=True, + ) + super().__init__( + local_worker=local_worker, + minibatch_buffer_size=0, + num_sgd_iter=num_sgd_iter, + learner_queue_size=learner_queue_size, + learner_queue_timeout=learner_queue_timeout, + ) + # Delete reference to parent's minibatch_buffer, which is not needed. + # Instead, in multi-GPU mode, we pull tower stack indices from the + # `self.ready_tower_stacks_buffer` buffer, whose size is exactly + # `num_multi_gpu_tower_stacks`. + self.minibatch_buffer = None + + self.train_batch_size = train_batch_size + + self.policy_map = self.local_worker.policy_map + self.devices = next(iter(self.policy_map.values())).devices + + logger.info("MultiGPULearnerThread devices {}".format(self.devices)) + assert self.train_batch_size % len(self.devices) == 0 + assert self.train_batch_size >= len(self.devices), "batch too small" + + self.tower_stack_indices = list(range(num_multi_gpu_tower_stacks)) + + # Two queues for tower stacks: + # a) Those that are loaded with data ("ready") + # b) Those that are ready to be loaded with new data ("idle"). 
+ self.idle_tower_stacks = queue.Queue() + self.ready_tower_stacks = queue.Queue() + # In the beginning, all stacks are idle (no loading has taken place + # yet). + for idx in self.tower_stack_indices: + self.idle_tower_stacks.put(idx) + # Start n threads that are responsible for loading data into the + # different (idle) stacks. + for i in range(num_data_load_threads): + self.loader_thread = _MultiGPULoaderThread(self, share_stats=(i == 0)) + self.loader_thread.start() + + # Create a buffer that holds stack indices that are "ready" + # (loaded with data). Those are stacks that we can call + # "learn_on_loaded_batch" on. + self.ready_tower_stacks_buffer = MinibatchBuffer( + self.ready_tower_stacks, + num_multi_gpu_tower_stacks, + learner_queue_timeout, + num_sgd_iter, + ) + + @override(LearnerThread) + def step(self) -> None: + if not self.loader_thread.is_alive(): + raise RuntimeError( + "The `_MultiGPULoaderThread` has died! Will therefore also terminate " + "the `MultiGPULearnerThread`." + ) + + with self.load_wait_timer: + buffer_idx, released = self.ready_tower_stacks_buffer.get() + + get_num_samples_loaded_into_buffer = 0 + with self.grad_timer: + # Use LearnerInfoBuilder as a unified way to build the final + # results dict from `learn_on_loaded_batch` call(s). + # This makes sure results dicts always have the same structure + # no matter the setup (multi-GPU, multi-agent, minibatch SGD, + # tf vs torch). + learner_info_builder = LearnerInfoBuilder(num_devices=len(self.devices)) + + for pid in self.policy_map.keys(): + # Not a policy-to-train. 
+ if ( + self.local_worker.is_policy_to_train is not None + and not self.local_worker.is_policy_to_train(pid) + ): + continue + policy = self.policy_map[pid] + default_policy_results = policy.learn_on_loaded_batch( + offset=0, buffer_index=buffer_idx + ) + learner_info_builder.add_learn_on_batch_results( + default_policy_results, policy_id=pid + ) + self.policy_ids_updated.append(pid) + get_num_samples_loaded_into_buffer += ( + policy.get_num_samples_loaded_into_buffer(buffer_idx) + ) + + self.learner_info = learner_info_builder.finalize() + + if released: + self.idle_tower_stacks.put(buffer_idx) + + # Put tuple: env-steps, agent-steps, and learner info into the queue. + self.outqueue.put( + ( + get_num_samples_loaded_into_buffer, + get_num_samples_loaded_into_buffer, + self.learner_info, + ) + ) + self.learner_queue_size.push(self.inqueue.qsize()) + + +class _MultiGPULoaderThread(threading.Thread): + def __init__( + self, multi_gpu_learner_thread: MultiGPULearnerThread, share_stats: bool + ): + threading.Thread.__init__(self) + self.multi_gpu_learner_thread = multi_gpu_learner_thread + self.daemon = True + if share_stats: + self.queue_timer = multi_gpu_learner_thread.queue_timer + self.load_timer = multi_gpu_learner_thread.load_timer + else: + self.queue_timer = _Timer() + self.load_timer = _Timer() + + def run(self) -> None: + while True: + self._step() + + def _step(self) -> None: + s = self.multi_gpu_learner_thread + policy_map = s.policy_map + + # Get a new batch from the data (inqueue). + with self.queue_timer: + batch = s.inqueue.get() + + # Get next idle stack for loading. + buffer_idx = s.idle_tower_stacks.get() + + # Load the batch into the idle stack. 
+ with self.load_timer: + for pid in policy_map.keys(): + if ( + s.local_worker.is_policy_to_train is not None + and not s.local_worker.is_policy_to_train(pid, batch) + ): + continue + policy = policy_map[pid] + if isinstance(batch, SampleBatch): + policy.load_batch_into_buffer( + batch=batch, + buffer_index=buffer_idx, + ) + elif pid in batch.policy_batches: + policy.load_batch_into_buffer( + batch=batch.policy_batches[pid], + buffer_index=buffer_idx, + ) + + # Tag just-loaded stack as "ready". + s.ready_tower_stacks.put(buffer_idx) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/replay_ops.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/replay_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd1f026cf1e1b3696229f7c27b7f0d504142a64 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/replay_ops.py @@ -0,0 +1,37 @@ +from typing import Optional +import random + +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.replay_buffers.replay_buffer import warn_replay_capacity +from ray.rllib.utils.typing import SampleBatchType + + +@OldAPIStack +class SimpleReplayBuffer: + """Simple replay buffer that operates over batches.""" + + def __init__(self, num_slots: int, replay_proportion: Optional[float] = None): + """Initialize SimpleReplayBuffer. + + Args: + num_slots: Number of batches to store in total. 
+ """ + self.num_slots = num_slots + self.replay_batches = [] + self.replay_index = 0 + + def add_batch(self, sample_batch: SampleBatchType) -> None: + warn_replay_capacity(item=sample_batch, num_items=self.num_slots) + if self.num_slots > 0: + if len(self.replay_batches) < self.num_slots: + self.replay_batches.append(sample_batch) + else: + self.replay_batches[self.replay_index] = sample_batch + self.replay_index += 1 + self.replay_index %= self.num_slots + + def replay(self) -> SampleBatchType: + return random.choice(self.replay_batches) + + def __len__(self): + return len(self.replay_batches) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..d9a683fa7dbff4a5640f5b60f4845149ed9aed58 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py @@ -0,0 +1,207 @@ +import logging +from typing import List, Optional, Union +import tree + +from ray.rllib.env.env_runner_group import EnvRunnerGroup +from ray.rllib.policy.sample_batch import ( + SampleBatch, + DEFAULT_POLICY_ID, + concat_samples, +) +from ray.rllib.utils.annotations import ExperimentalAPI, OldAPIStack +from ray.rllib.utils.metrics import NUM_AGENT_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED +from ray.rllib.utils.sgd import standardized +from ray.rllib.utils.typing import EpisodeType, SampleBatchType + +logger = logging.getLogger(__name__) + + +@ExperimentalAPI +def synchronous_parallel_sample( + *, + worker_set: EnvRunnerGroup, + max_agent_steps: Optional[int] = None, + max_env_steps: Optional[int] = None, + concat: bool = True, + sample_timeout_s: Optional[float] = None, + random_actions: bool = False, + _uses_new_env_runners: bool = False, + _return_metrics: bool = False, +) -> Union[List[SampleBatchType], SampleBatchType, List[EpisodeType], EpisodeType]: + """Runs parallel and synchronous rollouts on all remote 
workers. + + Waits for all workers to return from the remote calls. + + If no remote workers exist (num_workers == 0), use the local worker + for sampling. + + Alternatively to calling `worker.sample.remote()`, the user can provide a + `remote_fn()`, which will be applied to the worker(s) instead. + + Args: + worker_set: The EnvRunnerGroup to use for sampling. + remote_fn: If provided, use `worker.apply.remote(remote_fn)` instead + of `worker.sample.remote()` to generate the requests. + max_agent_steps: Optional number of agent steps to be included in the + final batch or list of episodes. + max_env_steps: Optional number of environment steps to be included in the + final batch or list of episodes. + concat: Whether to aggregate all resulting batches or episodes. in case of + batches the list of batches is concatinated at the end. in case of + episodes all episode lists from workers are flattened into a single list. + sample_timeout_s: The timeout in sec to use on the `foreach_env_runner` call. + After this time, the call will return with a result (or not if all + EnvRunners are stalling). If None, will block indefinitely and not timeout. + _uses_new_env_runners: Whether the new `EnvRunner API` is used. In this case + episodes instead of `SampleBatch` objects are returned. + + Returns: + The list of collected sample batch types or episode types (one for each parallel + rollout worker in the given `worker_set`). + + .. testcode:: + + # Define an RLlib Algorithm. + from ray.rllib.algorithms.ppo import PPO, PPOConfig + config = ( + PPOConfig() + .environment("CartPole-v1") + ) + algorithm = config.build() + # 2 remote EnvRunners (num_env_runners=2): + episodes = synchronous_parallel_sample( + worker_set=algorithm.env_runner_group, + _uses_new_env_runners=True, + concat=False, + ) + print(len(episodes)) + + .. testoutput:: + + 2 + """ + # Only allow one of `max_agent_steps` or `max_env_steps` to be defined. 
+ assert not (max_agent_steps is not None and max_env_steps is not None) + + agent_or_env_steps = 0 + max_agent_or_env_steps = max_agent_steps or max_env_steps or None + sample_batches_or_episodes = [] + all_stats_dicts = [] + + random_action_kwargs = {} if not random_actions else {"random_actions": True} + + # Stop collecting batches as soon as one criterium is met. + while (max_agent_or_env_steps is None and agent_or_env_steps == 0) or ( + max_agent_or_env_steps is not None + and agent_or_env_steps < max_agent_or_env_steps + ): + # No remote workers in the set -> Use local worker for collecting + # samples. + if worker_set.num_remote_workers() <= 0: + sampled_data = [worker_set.local_env_runner.sample(**random_action_kwargs)] + if _return_metrics: + stats_dicts = [worker_set.local_env_runner.get_metrics()] + # Loop over remote workers' `sample()` method in parallel. + else: + sampled_data = worker_set.foreach_env_runner( + ( + (lambda w: w.sample(**random_action_kwargs)) + if not _return_metrics + else (lambda w: (w.sample(**random_action_kwargs), w.get_metrics())) + ), + local_env_runner=False, + timeout_seconds=sample_timeout_s, + ) + # Nothing was returned (maybe all workers are stalling) or no healthy + # remote workers left: Break. + # There is no point staying in this loop, since we will not be able to + # get any new samples if we don't have any healthy remote workers left. + if not sampled_data or worker_set.num_healthy_remote_workers() <= 0: + if not sampled_data: + logger.warning( + "No samples returned from remote workers. If you have a " + "slow environment or model, consider increasing the " + "`sample_timeout_s` or decreasing the " + "`rollout_fragment_length` in `AlgorithmConfig.env_runners()." + ) + elif worker_set.num_healthy_remote_workers() <= 0: + logger.warning( + "No healthy remote workers left. Trying to restore workers ..." 
+ ) + break + + if _return_metrics: + stats_dicts = [s[1] for s in sampled_data] + sampled_data = [s[0] for s in sampled_data] + + # Update our counters for the stopping criterion of the while loop. + if _return_metrics: + if max_agent_steps: + agent_or_env_steps += sum( + int(agent_stat) + for stat_dict in stats_dicts + for agent_stat in stat_dict[NUM_AGENT_STEPS_SAMPLED].values() + ) + else: + agent_or_env_steps += sum( + int(stat_dict[NUM_ENV_STEPS_SAMPLED]) for stat_dict in stats_dicts + ) + sample_batches_or_episodes.extend(sampled_data) + all_stats_dicts.extend(stats_dicts) + else: + for batch_or_episode in sampled_data: + if max_agent_steps: + agent_or_env_steps += ( + sum(e.agent_steps() for e in batch_or_episode) + if _uses_new_env_runners + else batch_or_episode.agent_steps() + ) + else: + agent_or_env_steps += ( + sum(e.env_steps() for e in batch_or_episode) + if _uses_new_env_runners + else batch_or_episode.env_steps() + ) + sample_batches_or_episodes.append(batch_or_episode) + # Break out (and ignore the remaining samples) if max timesteps (batch + # size) reached. We want to avoid collecting batches that are too large + # only because of a failed/restarted worker causing a second iteration + # of the main loop. + if ( + max_agent_or_env_steps is not None + and agent_or_env_steps >= max_agent_or_env_steps + ): + break + + if concat is True: + # If we have episodes flatten the episode list. 
+ if _uses_new_env_runners: + sample_batches_or_episodes = tree.flatten(sample_batches_or_episodes) + # Otherwise we concatenate the `SampleBatch` objects + else: + sample_batches_or_episodes = concat_samples(sample_batches_or_episodes) + + if _return_metrics: + return sample_batches_or_episodes, all_stats_dicts + return sample_batches_or_episodes + + +@OldAPIStack +def standardize_fields(samples: SampleBatchType, fields: List[str]) -> SampleBatchType: + """Standardize fields of the given SampleBatch""" + wrapped = False + + if isinstance(samples, SampleBatch): + samples = samples.as_multi_agent() + wrapped = True + + for policy_id in samples.policy_batches: + batch = samples.policy_batches[policy_id] + for field in fields: + if field in batch: + batch[field] = standardized(batch[field]) + + if wrapped: + samples = samples.policy_batches[DEFAULT_POLICY_ID] + + return samples diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/segment_tree.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/segment_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7a5fd102f60ec53ca2a1a39f1ea68205f10ed2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/segment_tree.py @@ -0,0 +1,212 @@ +import operator +from typing import Any, Optional + + +class SegmentTree: + """A Segment Tree data structure. + + https://en.wikipedia.org/wiki/Segment_tree + + Can be used as regular array, but with two important differences: + + a) Setting an item's value is slightly slower. It is O(lg capacity), + instead of O(1). + b) Offers efficient `reduce` operation which reduces the tree's values + over some specified contiguous subsequence of items in the array. + Operation could be e.g. min/max/sum. + + The data is stored in a list, where the length is 2 * capacity. + The second half of the list stores the actual values for each index, so if + capacity=8, values are stored at indices 8 to 15. 
The first half of the + array contains the reduced-values of the different (binary divided) + segments, e.g. (capacity=4): + 0=not used + 1=reduced-value over all elements (array indices 4 to 7). + 2=reduced-value over array indices (4 and 5). + 3=reduced-value over array indices (6 and 7). + 4-7: values of the tree. + NOTE that the values of the tree are accessed by indices starting at 0, so + `tree[0]` accesses `internal_array[4]` in the above example. + """ + + def __init__( + self, capacity: int, operation: Any, neutral_element: Optional[Any] = None + ): + """Initializes a Segment Tree object. + + Args: + capacity: Total size of the array - must be a power of two. + operation: Lambda obj, obj -> obj + The operation for combining elements (eg. sum, max). + Must be a mathematical group together with the set of + possible values for array elements. + neutral_element (Optional[obj]): The neutral element for + `operation`. Use None for automatically finding a value: + max: float("-inf"), min: float("inf"), sum: 0.0. + """ + + assert ( + capacity > 0 and capacity & (capacity - 1) == 0 + ), "Capacity must be positive and a power of 2!" + self.capacity = capacity + if neutral_element is None: + neutral_element = ( + 0.0 + if operation is operator.add + else float("-inf") + if operation is max + else float("inf") + ) + self.neutral_element = neutral_element + self.value = [self.neutral_element for _ in range(2 * capacity)] + self.operation = operation + + def reduce(self, start: int = 0, end: Optional[int] = None) -> Any: + """Applies `self.operation` to subsequence of our values. + + Subsequence is contiguous, includes `start` and excludes `end`. + + self.operation( + arr[start], operation(arr[start+1], operation(... arr[end]))) + + Args: + start: Start index to apply reduction to. + end (Optional[int]): End index to apply reduction to (excluded). + + Returns: + any: The result of reducing self.operation over the specified + range of `self._value` elements. 
+ """ + if end is None: + end = self.capacity + elif end < 0: + end += self.capacity + + # Init result with neutral element. + result = self.neutral_element + # Map start/end to our actual index space (second half of array). + start += self.capacity + end += self.capacity + + # Example: + # internal-array (first half=sums, second half=actual values): + # 0 1 2 3 | 4 5 6 7 + # - 6 1 5 | 1 0 2 3 + + # tree.sum(0, 3) = 3 + # internally: start=4, end=7 -> sum values 1 0 2 = 3. + + # Iterate over tree starting in the actual-values (second half) + # section. + # 1) start=4 is even -> do nothing. + # 2) end=7 is odd -> end-- -> end=6 -> add value to result: result=2 + # 3) int-divide start and end by 2: start=2, end=3 + # 4) start still smaller end -> iterate once more. + # 5) start=2 is even -> do nothing. + # 6) end=3 is odd -> end-- -> end=2 -> add value to result: result=1 + # NOTE: This adds the sum of indices 4 and 5 to the result. + + # Iterate as long as start != end. + while start < end: + + # If start is odd: Add its value to result and move start to + # next even value. + if start & 1: + result = self.operation(result, self.value[start]) + start += 1 + + # If end is odd: Move end to previous even value, then add its + # value to result. NOTE: This takes care of excluding `end` in any + # situation. + if end & 1: + end -= 1 + result = self.operation(result, self.value[end]) + + # Divide both start and end by 2 to make them "jump" into the + # next upper level reduce-index space. + start //= 2 + end //= 2 + + # Then repeat till start == end. + + return result + + def __setitem__(self, idx: int, val: float) -> None: + """ + Inserts/overwrites a value in/into the tree. + + Args: + idx: The index to insert to. Must be in [0, `self.capacity`[ + val: The value to insert. 
+ """ + assert 0 <= idx < self.capacity, f"idx={idx} capacity={self.capacity}" + + # Index of the leaf to insert into (always insert in "second half" + # of the tree, the first half is reserved for already calculated + # reduction-values). + idx += self.capacity + self.value[idx] = val + + # Recalculate all affected reduction values (in "first half" of tree). + idx = idx >> 1 # Divide by 2 (faster than division). + while idx >= 1: + update_idx = 2 * idx # calculate only once + # Update the reduction value at the correct "first half" idx. + self.value[idx] = self.operation( + self.value[update_idx], self.value[update_idx + 1] + ) + idx = idx >> 1 # Divide by 2 (faster than division). + + def __getitem__(self, idx: int) -> Any: + assert 0 <= idx < self.capacity + return self.value[idx + self.capacity] + + def get_state(self): + return self.value + + def set_state(self, state): + assert len(state) == self.capacity * 2 + self.value = state + + +class SumSegmentTree(SegmentTree): + """A SegmentTree with the reduction `operation`=operator.add.""" + + def __init__(self, capacity: int): + super(SumSegmentTree, self).__init__(capacity=capacity, operation=operator.add) + + def sum(self, start: int = 0, end: Optional[Any] = None) -> Any: + """Returns the sum over a sub-segment of the tree.""" + return self.reduce(start, end) + + def find_prefixsum_idx(self, prefixsum: float) -> int: + """Finds highest i, for which: sum(arr[0]+..+arr[i - i]) <= prefixsum. + + Args: + prefixsum: `prefixsum` upper bound in above constraint. + + Returns: + int: Largest possible index (i) satisfying above constraint. + """ + assert 0 <= prefixsum <= self.sum() + 1e-5 + # Global sum node. + idx = 1 + + # While non-leaf (first half of tree). 
+ while idx < self.capacity: + update_idx = 2 * idx + if self.value[update_idx] > prefixsum: + idx = update_idx + else: + prefixsum -= self.value[update_idx] + idx = update_idx + 1 + return idx - self.capacity + + +class MinSegmentTree(SegmentTree): + def __init__(self, capacity: int): + super(MinSegmentTree, self).__init__(capacity=capacity, operation=min) + + def min(self, start: int = 0, end: Optional[Any] = None) -> Any: + """Returns min(arr[start], ..., arr[end])""" + return self.reduce(start, end) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/execution/train_ops.py b/.venv/lib/python3.11/site-packages/ray/rllib/execution/train_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..2b2b76bc671e7dab8db39fe1da38d1e577563afe --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/execution/train_ops.py @@ -0,0 +1,204 @@ +import logging +import numpy as np +import math +from typing import Dict + +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.utils.annotations import OldAPIStack +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.deprecation import deprecation_warning +from ray.rllib.utils.metrics import ( + NUM_ENV_STEPS_TRAINED, + NUM_AGENT_STEPS_TRAINED, + LEARN_ON_BATCH_TIMER, + LOAD_BATCH_TIMER, +) +from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder +from ray.rllib.utils.sgd import do_minibatch_sgd +from ray.util import log_once + +tf1, tf, tfv = try_import_tf() + +logger = logging.getLogger(__name__) + + +@OldAPIStack +def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict: + """Function that improves the all policies in `train_batch` on the local worker. + + .. testcode:: + :skipif: True + + from ray.rllib.execution.rollout_ops import synchronous_parallel_sample + algo = [...] + train_batch = synchronous_parallel_sample(algo.env_runner_group) + # This trains the policy on one batch. 
@OldAPIStack
def train_one_step(algorithm, train_batch, policies_to_train=None) -> Dict:
    """Function that improves the all policies in `train_batch` on the local worker.

    .. testcode::
        :skipif: True

        from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        algo = [...]
        train_batch = synchronous_parallel_sample(algo.env_runner_group)
        # This trains the policy on one batch.
        print(train_one_step(algo, train_batch))

    .. testoutput::

        {"default_policy": ...}

    Args:
        algorithm: The Algorithm whose local worker's policies to update.
        train_batch: The batch to learn on.
        policies_to_train: Optional collection of policy IDs to restrict the
            update to. If None, falls back to the local worker's
            `get_policies_to_train(train_batch)`.

    Returns:
        The learner-info dict (per policy ID) produced by the update.

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as well as
    the LEARN_ON_BATCH_TIMER timer of the `algorithm` object.
    """
    config = algorithm.config
    workers = algorithm.env_runner_group
    local_worker = workers.local_env_runner
    # Prefer the newer `num_epochs` key; fall back to legacy `num_sgd_iter`.
    num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1))
    minibatch_size = config.get("minibatch_size")
    if minibatch_size is None:
        # Legacy key; a value of 0 means "no minibatching" below.
        minibatch_size = config.get("sgd_minibatch_size", 0)

    learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER]
    with learn_timer:
        # Subsample minibatches (size=`minibatch_size`) from the
        # train batch and loop through train batch `num_sgd_iter` times.
        if num_sgd_iter > 1 or minibatch_size > 0:
            info = do_minibatch_sgd(
                train_batch,
                {
                    pid: local_worker.get_policy(pid)
                    for pid in policies_to_train
                    or local_worker.get_policies_to_train(train_batch)
                },
                local_worker,
                num_sgd_iter,
                minibatch_size,
                [],
            )
        # Single update step using train batch.
        else:
            info = local_worker.learn_on_batch(train_batch)

    learn_timer.push_units_processed(train_batch.count)
    algorithm._counters[NUM_ENV_STEPS_TRAINED] += train_batch.count
    algorithm._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps()

    # Optionally run all configured off-policy estimators on the same batch
    # and attach their results under the default policy's info.
    if algorithm.reward_estimators:
        info[DEFAULT_POLICY_ID]["off_policy_estimation"] = {}
        for name, estimator in algorithm.reward_estimators.items():
            info[DEFAULT_POLICY_ID]["off_policy_estimation"][name] = estimator.train(
                train_batch
            )
    return info
@OldAPIStack
def multi_gpu_train_one_step(algorithm, train_batch) -> Dict:
    """Multi-GPU version of train_one_step.

    Uses the policies' `load_batch_into_buffer` and `learn_on_loaded_batch` methods
    to be more efficient wrt CPU/GPU data transfers. For example, when doing multiple
    passes through a train batch (e.g. for PPO) using `config.num_sgd_iter`, the
    actual train batch is only split once and loaded once into the GPU(s).

    .. testcode::
        :skipif: True

        from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
        algo = [...]
        train_batch = synchronous_parallel_sample(algo.env_runner_group)
        # This trains the policy on one batch.
        print(multi_gpu_train_one_step(algo, train_batch))

    .. testoutput::

        {"default_policy": ...}

    Args:
        algorithm: The Algorithm whose local worker's policies to update.
        train_batch: The batch to learn on (treated as multi-agent).

    Returns:
        The finalized learner-info dict (per policy ID).

    Updates the NUM_ENV_STEPS_TRAINED and NUM_AGENT_STEPS_TRAINED counters as well as
    the LOAD_BATCH_TIMER and LEARN_ON_BATCH_TIMER timers of the Algorithm instance.
    """
    # NOTE(review): the "mulit_gpu" typo in this log_once key is kept as-is;
    # changing the key would reset the warning's dedup state.
    if log_once("mulit_gpu_train_one_step_deprecation_warning"):
        deprecation_warning(
            old=("ray.rllib.execution.train_ops." "multi_gpu_train_one_step")
        )
    config = algorithm.config
    workers = algorithm.env_runner_group
    local_worker = workers.local_env_runner
    # Prefer the newer `num_epochs` key; fall back to legacy `num_sgd_iter`.
    num_sgd_iter = config.get("num_epochs", config.get("num_sgd_iter", 1))
    minibatch_size = config.get("minibatch_size")
    if minibatch_size is None:
        # No minibatch size configured -> use the full train batch size.
        minibatch_size = config["train_batch_size"]

    # Determine the number of devices (GPUs or 1 CPU) we use.
    num_devices = int(math.ceil(config["num_gpus"] or 1))

    # Make sure total batch size is dividable by the number of devices.
    # Batch size per tower.
    per_device_batch_size = minibatch_size // num_devices
    # Total batch size (floor of minibatch_size to a multiple of num_devices).
    batch_size = per_device_batch_size * num_devices
    assert batch_size % num_devices == 0
    assert batch_size >= num_devices, "Batch size too small!"

    # Handle everything as if multi-agent.
    train_batch = train_batch.as_multi_agent()

    # Load data into GPUs.
    load_timer = algorithm._timers[LOAD_BATCH_TIMER]
    with load_timer:
        num_loaded_samples = {}
        for policy_id, batch in train_batch.policy_batches.items():
            # Not a policy-to-train.
            if (
                local_worker.is_policy_to_train is not None
                and not local_worker.is_policy_to_train(policy_id, train_batch)
            ):
                continue

            # Decompress SampleBatch, in case some columns are compressed.
            batch.decompress_if_needed()

            # Load the entire train batch into the Policy's only buffer
            # (idx=0). Policies only have >1 buffers, if we are training
            # asynchronously.
            num_loaded_samples[policy_id] = local_worker.policy_map[
                policy_id
            ].load_batch_into_buffer(batch, buffer_index=0)

    # Execute minibatch SGD on loaded data.
    learn_timer = algorithm._timers[LEARN_ON_BATCH_TIMER]
    with learn_timer:
        # Use LearnerInfoBuilder as a unified way to build the final
        # results dict from `learn_on_loaded_batch` call(s).
        # This makes sure results dicts always have the same structure
        # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
        # tf vs torch).
        learner_info_builder = LearnerInfoBuilder(num_devices=num_devices)

        for policy_id, samples_per_device in num_loaded_samples.items():
            policy = local_worker.policy_map[policy_id]
            num_batches = max(1, int(samples_per_device) // int(per_device_batch_size))
            logger.debug("== sgd epochs for {} ==".format(policy_id))
            for _ in range(num_sgd_iter):
                # Visit the pre-loaded minibatches in a fresh random order
                # each epoch.
                permutation = np.random.permutation(num_batches)
                for batch_index in range(num_batches):
                    # Learn on the pre-loaded data in the buffer.
                    # Note: For minibatch SGD, the data is an offset into
                    # the pre-loaded entire train batch.
                    results = policy.learn_on_loaded_batch(
                        permutation[batch_index] * per_device_batch_size, buffer_index=0
                    )

                    learner_info_builder.add_learn_on_batch_results(results, policy_id)

        # Tower reduce and finalize results.
        learner_info = learner_info_builder.finalize()

    load_timer.push_units_processed(train_batch.count)
    learn_timer.push_units_processed(train_batch.count)

    # TODO: Move this into Algorithm's `training_step` method for
    #  better transparency.
    algorithm._counters[NUM_ENV_STEPS_TRAINED] += train_batch.count
    algorithm._counters[NUM_AGENT_STEPS_TRAINED] += train_batch.agent_steps()

    # Optionally run all configured off-policy estimators on the same batch
    # and attach their results under the default policy's info.
    if algorithm.reward_estimators:
        learner_info[DEFAULT_POLICY_ID]["off_policy_estimation"] = {}
        for name, estimator in algorithm.reward_estimators.items():
            learner_info[DEFAULT_POLICY_ID]["off_policy_estimation"][
                name
            ] = estimator.train(train_batch)

    return learner_info
"""
This script automates cleaning up a benchmark/experiment run of some algo
against some config (with possibly more than one tune trial,
e.g. torch=grid_search([True, False])).

Run `python cleanup_experiment.py --help` for more information.

Use on an input directory with trial contents e.g.:
..
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_10-17-54topr3h9k
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_13-59-35dqaetxnf
IMPALA_BreakoutNoFrameskip-v4_0_use_pytorch=False_2020-05-11_17-21-28tbhedw72
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_10-17-54lv20cgn_
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_13-59-35kwzhax_y
IMPALA_BreakoutNoFrameskip-v4_2_use_pytorch=True_2020-05-11_17-21-28a5j0s7za

Then run:
>> python cleanup_experiment.py --experiment-dir [parent dir w/ trial sub-dirs]
>> --output-dir [your out dir] --results-filter dumb_col_2,superfluous_col3
>> --results-max-size [max results file size in kb before(!) zipping]

The script will create one output sub-dir for each trial and only copy
the configuration and the csv results (filtered and every nth row removed
based on the given args).
"""

import argparse
import json
import os
import re
import shutil
import yaml

parser = argparse.ArgumentParser()
parser.add_argument(
    "--experiment-dir",
    type=str,
    help="Experiment dir in which all sub-runs (seeds) are "
    "located (as sub-dirs). Each sub-run dir must contain the files: "
    "params.json and progress.csv.",
)
parser.add_argument(
    "--output-dir",
    type=str,
    help="The output dir, in which the cleaned up output will be placed.",
)
parser.add_argument(
    "--results-filter",
    type=str,
    help="comma-separated list of csv fields to exclude.",
    default="experiment_id,pid,hostname,node_ip,trial_id,hist_stats/episode_"
    "reward,hist_stats/episode_lengths,experiment_tag",
)
parser.add_argument(
    "--results-max-size",
    type=int,
    help="the max. size of the final results.csv file (in kb). Will erase "
    "every nth line in the original input to reach that goal. "
    "Use 0 for no limit (default=100).",
    default=100,
)


def process_single_run(in_dir, out_dir):
    """Cleans up a single trial (seed) run directory.

    Converts `params.json` to `config.yaml`, writes a column-filtered and
    size-capped `progress.csv` (then zips it into `results.zip`), and copies
    TBX event/pkl files as-is from `in_dir` into `out_dir`.

    Args:
        in_dir: The trial's input directory (must contain params.json and
            progress.csv).
        out_dir: The directory to write the cleaned-up output to (created
            if it does not exist).
    """
    exp_dir = os.listdir(in_dir)

    # Make sure trials dir is ok.
    assert (
        "params.json" in exp_dir and "progress.csv" in exp_dir
    ), "params.json or progress.csv not found in {}!".format(in_dir)

    os.makedirs(out_dir, exist_ok=True)

    for file in exp_dir:
        absfile = os.path.join(in_dir, file)
        # Config file -> Convert to yaml and move to output dir.
        if file == "params.json":
            assert os.path.isfile(absfile), "{} not a file!".format(file)
            with open(absfile) as fp:
                contents = json.load(fp)
            with open(os.path.join(out_dir, "config.yaml"), "w") as fp:
                yaml.dump(contents, fp)
        # Progress csv file -> Filter out some columns, cut, and write to
        # output_dir.
        elif file == "progress.csv":
            assert os.path.isfile(absfile), "{} not a file!".format(file)
            col_idx_to_filter = []
            with open(absfile) as fp:
                # Get column names.
                col_names_orig = fp.readline().strip().split(",")
                # Split by comma (abiding to quotes), filter out
                # unwanted columns, then write to disk.
                cols_to_filter = args.results_filter.split(",")
                # Collect indices in reverse order so popping below does not
                # shift the remaining indices.
                for i, c in enumerate(col_names_orig):
                    if c in cols_to_filter:
                        col_idx_to_filter.insert(0, i)
                col_names = col_names_orig.copy()
                for idx in col_idx_to_filter:
                    col_names.pop(idx)
                absfile_out = os.path.join(out_dir, "progress.csv")
                with open(absfile_out, "w") as out_fp:
                    print(",".join(col_names), file=out_fp)
                    while True:
                        line = fp.readline().strip()
                        if not line:
                            break
                        # Fill runs of empty csv fields with "None".
                        line = re.sub(
                            "(,{2,})",
                            lambda m: ",None" * (len(m.group()) - 1) + ",",
                            line,
                        )
                        cols = re.findall('".+?"|[^,]+', line)
                        # Skip malformed rows (wrong number of columns).
                        if len(cols) != len(col_names_orig):
                            continue
                        for idx in col_idx_to_filter:
                            cols.pop(idx)
                        print(",".join(cols), file=out_fp)

            # Reduce the size of the output file if necessary.
            out_size = os.path.getsize(absfile_out)
            max_size = args.results_max_size * 1024
            if 0 < max_size < out_size:
                # Figure out roughly every which line we have to drop.
                ratio = out_size / max_size
                # NOTE(review): paths are passed unquoted to the shell below;
                # assumes dir names contain no spaces/shell metacharacters.
                # If ratio > 2.0, we'll have to keep only every nth line.
                if ratio > 2.0:
                    nth = out_size // max_size
                    os.system(
                        "awk 'NR==1||NR%{}==0' {} > {}.new".format(
                            nth, absfile_out, absfile_out
                        )
                    )
                # If ratio < 2.0 (>1.0), we'll have to drop every nth line.
                else:
                    nth = out_size // (out_size - max_size)
                    os.system(
                        "awk 'NR==1||NR%{}!=0' {} > {}.new".format(
                            nth, absfile_out, absfile_out
                        )
                    )
                os.remove(absfile_out)
                os.rename(absfile_out + ".new", absfile_out)

            # Zip progress.csv into results.zip.
            zip_file = os.path.join(out_dir, "results.zip")
            try:
                os.remove(zip_file)
            except FileNotFoundError:
                pass
            os.system(
                "zip -j {} {}".format(zip_file, os.path.join(out_dir, "progress.csv"))
            )
            os.remove(os.path.join(out_dir, "progress.csv"))

        # TBX events file -> Move as is.
        elif re.search("^(events\\.out\\.|params\\.pkl)", file):
            assert os.path.isfile(absfile), "{} not a file!".format(file)
            shutil.copyfile(absfile, os.path.join(out_dir, file))


if __name__ == "__main__":
    args = parser.parse_args()
    exp_dir = os.listdir(args.experiment_dir)
    # Loop through all sub-directories.
    for i, sub_run in enumerate(sorted(exp_dir)):
        abspath = os.path.join(args.experiment_dir, sub_run)
        # This is a seed run.
        if os.path.isdir(abspath) and re.search(
            "^(\\w+?)_(\\w+?-v\\d+)(_\\d+)", sub_run
        ):
            # Create meaningful output dir name:
            # [algo]_[env]_[trial #]_[trial-config]_[date YYYY-MM-DD].
            cleaned_up_out = re.sub(
                "^(\\w+?)_(\\w+?-v\\d+)(_\\d+)(_.+)?(_\\d{4}-\\d{2}-\\d{2})"
                "_\\d{2}-\\d{2}-\\w+",
                "{:02}_\\1_\\2\\4\\5".format(i),
                sub_run,
            )
            # Remove superfluous `env=` specifier (env always included in name).
            cleaned_up_out = re.sub(
                "^(.+)env=\\w+?-v\\d+,?(.+)", "\\1\\2", cleaned_up_out
            )
            out_path = os.path.join(args.output_dir, cleaned_up_out)
            process_single_run(abspath, out_path)
    # Done.
    print("done")
file mode 100644 index 0000000000000000000000000000000000000000..0961815d0a6fecb7f3925020cc4a6c9ac77aee31 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/cartpole.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/dm_control_suite_vision.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/dm_control_suite_vision.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56f8d94e76494eb0e31efc2b6e785eb59b5cf80f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/dm_control_suite_vision.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/flappy_bird.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/flappy_bird.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e546863d2105de3236277ab84eacb03b13250902 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/flappy_bird.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_2x2.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_2x2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ba94abb53c3e1fd83c720bb653e07952c3afe50 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_2x2.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_4x4_deterministic.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_4x4_deterministic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b60a4276140c61d76c9b2aa63269c28f25ea6142 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/frozenlake_4x4_deterministic.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/gymnasium_robotics.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/gymnasium_robotics.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e58d3aa7368334d140fd6623afd4d581cf68edde Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/gymnasium_robotics.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/highway_env.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/highway_env.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..030643596cdd92b59241af47e0c2720f312168d0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/highway_env.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/pendulum.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/pendulum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6207c652dd2a92da9383ea2a150dda2ac0f22b7d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/__pycache__/pendulum.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/atari_100k.py 
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""

# Run with:
# python [this script name].py --env ale_py:ALE/[gym ID e.g. Pong-v5]

# To see all available options:
# python [this script name].py --help

from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config
from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(
    default_iters=1000000,
    default_reward=20.0,
    default_timesteps=100000,
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

config = (
    DreamerV3Config()
    .environment(
        env=args.env,
        # [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M
        # environment steps, action repeat of 4, a time limit of 108,000 steps per
        # episode that correspond to 30 minutes of game play, no access to life
        # information, full action space, and sticky actions. Because the world model
        # integrates information over time, DreamerV2 does not use frame stacking.
        # The experiments use a single-task setup where a separate agent is trained
        # for each game. Moreover, each agent uses only a single environment instance.
        env_config={
            # "sticky actions" but not according to Danijar's 100k configs.
            "repeat_action_probability": 0.0,
            # "full action space" but not according to Danijar's 100k configs.
            "full_action_space": False,
            # Already done by MaxAndSkip wrapper: "action repeat" == 4.
            "frameskip": 1,
        },
    )
    .env_runners(
        num_env_runners=(args.num_env_runners or 0),
        # If we use >1 GPU and increase the batch size accordingly, we should also
        # increase the number of envs per worker.
        num_envs_per_env_runner=(args.num_learners or 1),
        # BUGFIX: `args.num_learners` may be None (flag not given), in which case
        # `None > 1` raises a TypeError. Treat a missing value as 0 learners,
        # matching the `(args.num_learners or 1)` pattern used above/below.
        remote_worker_envs=((args.num_learners or 0) > 1),
    )
    .reporting(
        metrics_num_episodes_for_smoothing=(args.num_learners or 1),
        report_images_and_videos=False,
        report_dream_data=False,
        report_individual_batch_item_stats=False,
    )
    # See Appendix A.
    .training(
        model_size="S",
        training_ratio=1024,
        batch_size_B=16 * (args.num_learners or 1),
    )
)


if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args, keep_config=True)
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""

# Run with:
# python [this script name].py --env ale_py:ALE/[gym ID e.g. Pong-v5]

# To see all available options:
# python [this script name].py --help

from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config
from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(
    default_iters=1000000,
    default_reward=20.0,
    default_timesteps=1000000,
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

# Several knobs below scale with the number of learners (GPUs); default to 1.
_num_learners = args.num_learners or 1

config = DreamerV3Config()
# For each (parallelized) env, we should provide a CPU. Lower this number
# if you don't have enough CPUs.
config.resources(num_cpus_for_main_process=8 * _num_learners)
# [2]: "We follow the evaluation protocol of Machado et al. (2018) with 200M
# environment steps, action repeat of 4, a time limit of 108,000 steps per
# episode that correspond to 30 minutes of game play, no access to life
# information, full action space, and sticky actions. Because the world model
# integrates information over time, DreamerV2 does not use frame stacking.
# The experiments use a single-task setup where a separate agent is trained
# for each game. Moreover, each agent uses only a single environment instance.
config.environment(
    env=args.env,
    env_config={
        # "sticky actions" but not according to Danijar's 100k configs.
        "repeat_action_probability": 0.0,
        # "full action space" but not according to Danijar's 100k configs.
        "full_action_space": False,
        # Already done by MaxAndSkip wrapper: "action repeat" == 4.
        "frameskip": 1,
    },
)
config.env_runners(
    num_env_runners=(args.num_env_runners or 0),
    # If we use >1 GPU and increase the batch size accordingly, we should also
    # increase the number of envs per worker.
    num_envs_per_env_runner=8 * _num_learners,
    remote_worker_envs=True,
)
config.reporting(
    metrics_num_episodes_for_smoothing=_num_learners,
    report_images_and_videos=False,
    report_dream_data=False,
    report_individual_batch_item_stats=False,
)
# See Appendix A.
config.training(
    model_size="XL",
    training_ratio=64,
    batch_size_B=16 * _num_learners,
)


if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args, keep_config=True)
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""
from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config

# Run with:
# python run_regression_tests.py --dir [this file]

# CartPole is simple enough for the smallest ("XS") world model.
config = DreamerV3Config().environment("CartPole-v1")
config.training(
    model_size="XS",
    training_ratio=1024,
)
"""
[1] Mastering Diverse Domains through World Models - 2023
D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap
https://arxiv.org/pdf/2301.04104v1.pdf

[2] Mastering Atari with Discrete World Models - 2021
D. Hafner, T. Lillicrap, M. Norouzi, J. Ba
https://arxiv.org/pdf/2010.02193.pdf
"""

# Run with:
# python [this script name].py --env DMC/[task]/[domain] (e.g. DMC/cartpole/swingup)

# To see all available options:
# python [this script name].py --help

from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config
from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(
    default_iters=1000000,
    default_reward=800.0,
    default_timesteps=1000000,
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

# Several knobs below scale with the number of learners (GPUs); default to 1.
_num_learners = args.num_learners or 1

config = DreamerV3Config()
# Use image observations (pixels) rather than proprioceptive state.
config.environment(
    env=args.env,
    env_config={"from_pixels": True},
)
config.env_runners(
    num_env_runners=(args.num_env_runners or 0),
    # If we use >1 GPU and increase the batch size accordingly, we should also
    # increase the number of envs per worker.
    num_envs_per_env_runner=4 * _num_learners,
    remote_worker_envs=True,
)
config.reporting(
    metrics_num_episodes_for_smoothing=_num_learners,
    report_images_and_videos=False,
    report_dream_data=False,
    report_individual_batch_item_stats=False,
)
# See Appendix A.
config.training(
    model_size="S",
    training_ratio=512,
    batch_size_B=16 * _num_learners,
)
+tune.register_env("flappy-bird", _env_creator) + +# Further specify the DreamerV3 config object to use. +( + config.environment("flappy-bird") + .resources( + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, + ) + .env_runners( + # If we use >1 GPU and increase the batch size accordingly, we should also + # increase the number of envs per worker. + num_envs_per_env_runner=8 * (num_gpus or 1), + remote_worker_envs=True, + ) + .reporting( + metrics_num_episodes_for_smoothing=(num_gpus or 1), + report_images_and_videos=False, + report_dream_data=False, + report_individual_batch_item_stats=False, + ) + # See Appendix A. + .training( + model_size="M", + training_ratio=64, + batch_size_B=16 * (num_gpus or 1), + # Use a well established 4-GPU lr scheduling recipe: + # ~ 1000 training updates with 0.4x[default rates], then over a few hundred + # steps, increase to 4x[default rates]. + world_model_lr=[[0, 0.4 * w], [8000, 0.4 * w], [10000, 3 * w]], + critic_lr=[[0, 0.4 * c], [8000, 0.4 * c], [10000, 3 * c]], + actor_lr=[[0, 0.4 * c], [8000, 0.4 * c], [10000, 3 * c]], + ) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py new file mode 100644 index 0000000000000000000000000000000000000000..03ac201479d3555407742a3c862a3d80a1fc5321 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_2x2.py @@ -0,0 +1,31 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "desc": [ + "SF", + "HG", + ], + "is_slippery": False, + }, + ) + .training( + model_size="XS", + training_ratio=1024, + ) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py new file mode 100644 index 0000000000000000000000000000000000000000..dd6a8047092564d993e67070253b17d08c57f352 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/frozenlake_4x4_deterministic.py @@ -0,0 +1,28 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment( + "FrozenLake-v1", + env_config={ + "map_name": "4x4", + "is_slippery": False, + }, + ) + .training( + model_size="nano", + training_ratio=1024, + ) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py new file mode 100644 index 0000000000000000000000000000000000000000..14fd1f93070304bac0d233f09f430d44b8fb25dc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/gymnasium_robotics.py @@ -0,0 +1,66 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" + +# Run with: +# python run_regression_tests.py --dir [this file] + +try: + import gymnasium_robotics # noqa +except (ImportError, ModuleNotFoundError): + print("You have to `pip install gymnasium_robotics` in order to run this example!") + +import gymnasium as gym + +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +from ray import tune + + +# Number of GPUs to run on. +num_gpus = 4 + +# Register the gymnasium robotics env (including necessary wrappers and options) via the +# `tune.register_env()` API. +# Create the specific gymnasium robotics env. +# e.g. AdroitHandHammerSparse-v1 or FrankaKitchen-v1. +# return gym.make("FrankaKitchen-v1", tasks_to_complete=["microwave", "kettle"]) +tune.register_env("flappy-bird", lambda ctx: gym.make("AdroitHandHammer-v1")) + +# Define the DreamerV3 config object to use. 
+config = DreamerV3Config() +w = config.world_model_lr +c = config.critic_lr +# Further specify the details of our config object. +( + config.resources( + num_cpus_for_main_process=8 * (num_gpus or 1), + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, + ) + # If we use >1 GPU and increase the batch size accordingly, we should also + # increase the number of envs per worker. + .env_runners(num_envs_per_env_runner=8 * (num_gpus or 1), remote_worker_envs=True) + .reporting( + metrics_num_episodes_for_smoothing=(num_gpus or 1), + report_images_and_videos=False, + report_dream_data=False, + report_individual_batch_item_stats=False, + ) + # See Appendix A. + .training( + model_size="XL", + training_ratio=64, + batch_size_B=16 * (num_gpus or 1), + world_model_lr=[[0, 0.4 * w], [50000, 0.4 * w], [100000, 3 * w]], + critic_lr=[[0, 0.4 * c], [50000, 0.4 * c], [100000, 3 * c]], + actor_lr=[[0, 0.4 * c], [50000, 0.4 * c], [100000, 3 * c]], + ) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/highway_env.py b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/highway_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c3588f502c1aa2dd1c8affdfa2a947b3234360dc --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/highway_env.py @@ -0,0 +1,71 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. 
Ba +https://arxiv.org/pdf/2010.02193.pdf +""" + +# Run with: +# python run_regression_tests.py --dir [this file] + +try: + import highway_env # noqa +except (ImportError, ModuleNotFoundError): + print("You have to `pip install highway_env` in order to run this example!") + +import gymnasium as gym + +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config +from ray import tune + + +# Number of GPUs to run on. +num_gpus = 4 + +# Register the highway env (including necessary wrappers and options) via the +# `tune.register_env()` API. +# Create the specific env. +# e.g. roundabout-v0 or racetrack-v0 +tune.register_env("flappy-bird", lambda ctx: gym.make("intersection-v0", policy_freq=5)) + +# Define the DreamerV3 config object to use. +config = DreamerV3Config() +w = config.world_model_lr +c = config.critic_lr + +( + config.resources( + num_cpus_for_main_process=1, + ) + .learners( + num_learners=0 if num_gpus == 1 else num_gpus, + num_gpus_per_learner=1 if num_gpus else 0, + ) + .env_runners( + # If we use >1 GPU and increase the batch size accordingly, we should also + # increase the number of envs per worker. + num_envs_per_env_runner=8 * (num_gpus or 1), + remote_worker_envs=True, + ) + .reporting( + metrics_num_episodes_for_smoothing=(num_gpus or 1), + report_images_and_videos=False, + report_dream_data=False, + report_individual_batch_item_stats=False, + ) + # See Appendix A. + .training( + model_size="M", + training_ratio=64, + batch_size_B=16 * (num_gpus or 1), + # Use a well established 4-GPU lr scheduling recipe: + # ~ 1000 training updates with 0.4x[default rates], then over a few hundred + # steps, increase to 4x[default rates]. 
+ world_model_lr=[[0, 0.4 * w], [8000, 0.4 * w], [10000, 3 * w]], + critic_lr=[[0, 0.4 * c], [8000, 0.4 * c], [10000, 3 * c]], + actor_lr=[[0, 0.4 * c], [8000, 0.4 * c], [10000, 3 * c]], + ) +) diff --git a/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/pendulum.py b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/pendulum.py new file mode 100644 index 0000000000000000000000000000000000000000..4acc4b9aa85a9386286e7f1fef1400b5b5fcbf4b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/rllib/tuned_examples/dreamerv3/pendulum.py @@ -0,0 +1,19 @@ +""" +[1] Mastering Diverse Domains through World Models - 2023 +D. Hafner, J. Pasukonis, J. Ba, T. Lillicrap +https://arxiv.org/pdf/2301.04104v1.pdf + +[2] Mastering Atari with Discrete World Models - 2021 +D. Hafner, T. Lillicrap, M. Norouzi, J. Ba +https://arxiv.org/pdf/2010.02193.pdf +""" +from ray.rllib.algorithms.dreamerv3.dreamerv3 import DreamerV3Config + +# Run with: +# python run_regression_tests.py --dir [this file] + +config = ( + DreamerV3Config() + .environment("Pendulum-v1") + .training(model_size="XS", training_ratio=1024) +)