koichi12 commited on
Commit
adce983
·
verified ·
1 Parent(s): e78b2cd

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .venv/lib/python3.11/site-packages/ray/_private/__pycache__/process_watcher.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__init__.py +77 -0
  4. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/accelerator.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/amd_gpu.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/hpu.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/intel_gpu.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/neuron.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/npu.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/nvidia_gpu.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/tpu.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/_private/accelerators/accelerator.py +138 -0
  14. .venv/lib/python3.11/site-packages/ray/_private/accelerators/hpu.py +121 -0
  15. .venv/lib/python3.11/site-packages/ray/_private/accelerators/intel_gpu.py +103 -0
  16. .venv/lib/python3.11/site-packages/ray/_private/accelerators/neuron.py +132 -0
  17. .venv/lib/python3.11/site-packages/ray/_private/accelerators/npu.py +99 -0
  18. .venv/lib/python3.11/site-packages/ray/_private/accelerators/nvidia_gpu.py +128 -0
  19. .venv/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py +393 -0
  20. .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so +3 -0
  21. .venv/lib/python3.11/site-packages/ray/_private/usage/__init__.py +0 -0
  22. .venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_constants.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_lib.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/_private/usage/usage_constants.py +63 -0
  26. .venv/lib/python3.11/site-packages/ray/_private/usage/usage_lib.py +964 -0
  27. .venv/lib/python3.11/site-packages/ray/_private/workers/__init__.py +0 -0
  28. .venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/__init__.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/default_worker.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/setup_worker.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py +304 -0
  32. .venv/lib/python3.11/site-packages/ray/_private/workers/setup_worker.py +33 -0
  33. .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar +3 -0
  34. .venv/lib/python3.11/site-packages/ray/rllib/__init__.py +55 -0
  35. .venv/lib/python3.11/site-packages/ray/rllib/execution/__init__.py +23 -0
  36. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/__init__.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/learner_thread.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/minibatch_buffer.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/multi_gpu_learner_thread.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/replay_ops.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/rollout_ops.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/segment_tree.cpython-311.pyc +0 -0
  43. .venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/train_ops.cpython-311.pyc +0 -0
  44. .venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__init__.py +0 -0
  45. .venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__pycache__/mixin_replay_buffer.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/ray/rllib/execution/learner_thread.py +137 -0
  47. .venv/lib/python3.11/site-packages/ray/rllib/execution/minibatch_buffer.py +61 -0
  48. .venv/lib/python3.11/site-packages/ray/rllib/execution/multi_gpu_learner_thread.py +245 -0
  49. .venv/lib/python3.11/site-packages/ray/rllib/execution/replay_ops.py +37 -0
  50. .venv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py +207 -0
.gitattributes CHANGED
@@ -171,3 +171,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
171
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/aiohttp/_websocket/reader_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
172
  .venv/lib/python3.11/site-packages/ray/_private/thirdparty/tabulate/__pycache__/tabulate.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
173
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
171
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/aiohttp/_websocket/reader_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
172
  .venv/lib/python3.11/site-packages/ray/_private/thirdparty/tabulate/__pycache__/tabulate.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
173
  .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/idna/__pycache__/idnadata.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
174
+ .venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
175
+ .venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/ray/_private/__pycache__/process_watcher.cpython-311.pyc ADDED
Binary file (8.85 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__init__.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Optional, Set

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager
from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager
from ray._private.accelerators.amd_gpu import AMDGPUAcceleratorManager
from ray._private.accelerators.tpu import TPUAcceleratorManager
from ray._private.accelerators.neuron import NeuronAcceleratorManager
from ray._private.accelerators.hpu import HPUAcceleratorManager
from ray._private.accelerators.npu import NPUAcceleratorManager


def get_all_accelerator_managers() -> Set[AcceleratorManager]:
    """Return the set of every accelerator manager supported by Ray."""
    return {
        NvidiaGPUAcceleratorManager,
        IntelGPUAcceleratorManager,
        AMDGPUAcceleratorManager,
        TPUAcceleratorManager,
        NeuronAcceleratorManager,
        HPUAcceleratorManager,
        NPUAcceleratorManager,
    }


def get_all_accelerator_resource_names() -> Set[str]:
    """Return the resource name of every supported accelerator family."""
    return {
        manager.get_resource_name() for manager in get_all_accelerator_managers()
    }


def get_accelerator_manager_for_resource(
    resource_name: str,
) -> Optional[AcceleratorManager]:
    """Look up the accelerator manager for an accelerator resource name.

    E.g., TPUAcceleratorManager is returned if resource name is "TPU".

    Returns:
        The matching manager class, or None when the name is unknown.
    """
    mapping = getattr(
        get_accelerator_manager_for_resource,
        "_resource_name_to_accelerator_manager",
        None,
    )
    if mapping is None:
        # Lazily build the lookup table on first use and memoize it as an
        # attribute of this function.
        mapping = {
            manager.get_resource_name(): manager
            for manager in get_all_accelerator_managers()
        }
        # Several managers share the "GPU" resource name; prefer the one
        # whose hardware is actually present on this node, defaulting to
        # Nvidia when none is detected.
        if AMDGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            mapping["GPU"] = AMDGPUAcceleratorManager
        elif IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
            mapping["GPU"] = IntelGPUAcceleratorManager
        else:
            mapping["GPU"] = NvidiaGPUAcceleratorManager
        get_accelerator_manager_for_resource._resource_name_to_accelerator_manager = (
            mapping
        )
    return mapping.get(resource_name, None)


__all__ = [
    "NvidiaGPUAcceleratorManager",
    "IntelGPUAcceleratorManager",
    "AMDGPUAcceleratorManager",
    "TPUAcceleratorManager",
    "NeuronAcceleratorManager",
    "HPUAcceleratorManager",
    "NPUAcceleratorManager",
    "get_all_accelerator_managers",
    "get_all_accelerator_resource_names",
    "get_accelerator_manager_for_resource",
]
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (3.48 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/accelerator.cpython-311.pyc ADDED
Binary file (7.02 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/amd_gpu.cpython-311.pyc ADDED
Binary file (7.34 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/hpu.cpython-311.pyc ADDED
Binary file (6.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/intel_gpu.cpython-311.pyc ADDED
Binary file (5.66 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/neuron.cpython-311.pyc ADDED
Binary file (6.72 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/npu.cpython-311.pyc ADDED
Binary file (5.47 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/nvidia_gpu.cpython-311.pyc ADDED
Binary file (7.05 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/__pycache__/tpu.cpython-311.pyc ADDED
Binary file (18 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/accelerators/accelerator.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple


class AcceleratorManager(ABC):
    """Interface an accelerator family must implement to be supported by Ray."""

    @staticmethod
    @abstractmethod
    def get_resource_name() -> str:
        """Return the Ray resource name representing this accelerator family.

        Returns:
            The resource name: e.g., Nvidia GPUs use the resource name "GPU".
        """

    @staticmethod
    @abstractmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Return the env var that selects this family's visible accelerators.

        Returns:
            The env var name: e.g., CUDA_VISIBLE_DEVICES for Nvidia GPUs.
        """

    @staticmethod
    @abstractmethod
    def get_current_node_num_accelerators() -> int:
        """Return how many accelerators of this family the current node has.

        Returns:
            The detected accelerator count; 0 when the node has none of
            this family.
        """

    @staticmethod
    @abstractmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Return the accelerator type of this family on the current node.

        Ray currently assumes a node carries a single accelerator type per
        family, so this result is only meaningful when
        get_current_node_num_accelerators() > 0.

        Returns:
            The detected type (e.g., H100 for Nvidia GPU), or None when it
            is unknown or no such accelerator is present.
        """

    @staticmethod
    @abstractmethod
    def get_current_node_additional_resources() -> Optional[Dict[str, float]]:
        """Return extra logical resources required for the current node.

        Some accelerator types need more than a plain count — TPUs, for
        example, also advertise the TPU pod type and TPU name — and this
        hook lets a family contribute those additional logical resources.

        Returns:
            A mapping of additional resource names to quantities, or None.
        """

    @staticmethod
    @abstractmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Check whether a requested quantity of this resource is legal.

        Args:
            quantity: The resource request quantity to validate.

        Returns:
            A (valid, error_message) pair: the first element says whether
            the quantity is acceptable, and the second carries the error
            message when it is not (None otherwise).
        """

    @staticmethod
    @abstractmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return the ids of this family's accelerators visible to this process.

        Returns:
            The list of visible accelerator ids, or None when every
            accelerator is visible.
        """

    @staticmethod
    @abstractmethod
    def set_current_process_visible_accelerator_ids(ids: List[str]) -> None:
        """Restrict the current process to the given accelerator ids.

        Args:
            ids: The ids of this family's accelerators to make visible.
        """

    @staticmethod
    def get_ec2_instance_num_accelerators(
        instance_type: str, instances: dict
    ) -> Optional[int]:
        """Return this family's accelerator count for an EC2 instance type.

        Args:
            instance_type: The EC2 instance type.
            instances: Mapping from EC2 instance type to the instance
                metadata returned by EC2 `describe-instance-types`.

        Returns:
            The accelerator count for that instance type, or None when
            unknown. The default implementation knows nothing and returns
            None; families override it as needed.
        """
        return None

    @staticmethod
    def get_ec2_instance_accelerator_type(
        instance_type: str, instances: dict
    ) -> Optional[str]:
        """Return this family's accelerator type for an EC2 instance type.

        Args:
            instance_type: The EC2 instance type.
            instances: Mapping from EC2 instance type to the instance
                metadata returned by EC2 `describe-instance-types`.

        Returns:
            The accelerator type for that instance type, or None when
            unknown. The default implementation knows nothing and returns
            None; families override it as needed.
        """
        return None
.venv/lib/python3.11/site-packages/ray/_private/accelerators/hpu.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
from typing import Optional, List, Tuple
from functools import lru_cache
from importlib.util import find_spec

from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES"
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"


@lru_cache()
def is_package_present(package_name: str) -> bool:
    """Return True if `package_name` is importable (result cached per name)."""
    try:
        return find_spec(package_name) is not None
    except ModuleNotFoundError:
        # find_spec raises when a parent package of `package_name` is missing.
        return False


HPU_PACKAGE_AVAILABLE = is_package_present("habana_frameworks")


class HPUAcceleratorManager(AcceleratorManager):
    """Intel Habana(HPU) accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        return "HPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return HABANA_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return visible HPU ids, [] if explicitly none, None if unrestricted."""
        hpu_visible_devices = os.environ.get(
            HPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if hpu_visible_devices is None:
            return None

        if hpu_visible_devices == "":
            return []

        return list(hpu_visible_devices.split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of HPUs on this machine.

        Returns:
            The number of HPUs if any were detected, otherwise 0.
        """
        if HPU_PACKAGE_AVAILABLE:
            import habana_frameworks.torch.hpu as torch_hpu

            if torch_hpu.is_available():
                return torch_hpu.device_count()
            else:
                # Bug fix: the original called the root logger via
                # `logging.info(...)`, bypassing this module's `logger`.
                logger.info("HPU devices not available")
                return 0
        else:
            return 0

    @staticmethod
    def is_initialized() -> bool:
        """Attempt to check if HPU backend is initialized.

        Returns:
            True if backend initialized else False.
        """
        if HPU_PACKAGE_AVAILABLE:
            import habana_frameworks.torch.hpu as torch_hpu

            if torch_hpu.is_available() and torch_hpu.is_initialized():
                return True
            else:
                return False
        else:
            return False

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Attempt to detect the HPU family type.

        Returns:
            The device name (GAUDI, GAUDI2) if detected else None.
        """
        if HPUAcceleratorManager.is_initialized():
            import habana_frameworks.torch.hpu as torch_hpu

            return f"Intel-{torch_hpu.get_device_name()}"
        else:
            # Bug fix: use the module logger, not the root logger.
            logger.info("HPU type cannot be detected")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """HPU requests must be whole numbers (no fractional devices)."""
        if isinstance(quantity, float) and not quantity.is_integer():
            return (
                False,
                f"{HPUAcceleratorManager.get_resource_name()} resource quantity"
                " must be whole numbers. "
                f"The specified quantity {quantity} is invalid.",
            )
        else:
            return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_hpu_devices: List[str],
    ) -> None:
        """Point HABANA_VISIBLE_MODULES at the given device ids.

        Honors the RAY_EXPERIMENTAL_NOSET_... escape hatch: when that env
        var is set, the visibility env var is left untouched.
        """
        if os.environ.get(NOSET_HABANA_VISIBLE_MODULES_ENV_VAR):
            return

        os.environ[
            HPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join([str(i) for i in visible_hpu_devices])
.venv/lib/python3.11/site-packages/ray/_private/accelerators/intel_gpu.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR"
NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR"
ONEAPI_DEVICE_BACKEND_TYPE = "level_zero"
ONEAPI_DEVICE_TYPE = "gpu"


class IntelGPUAcceleratorManager(AcceleratorManager):
    """Intel GPU accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return ONEAPI_DEVICE_SELECTOR_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return visible Intel GPU ids parsed from the oneAPI selector.

        Returns None when the selector env var is unset (all devices
        visible), and [] when it is empty or set to "NoDevFiles".
        """
        selector = os.environ.get(
            IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )
        if selector is None:
            return None

        if selector in ("", "NoDevFiles"):
            return []

        # The selector has the form "level_zero:<id>,<id>,..."; keep only
        # the comma-separated id list after the backend prefix.
        backend_prefix = ONEAPI_DEVICE_BACKEND_TYPE + ":"
        return selector.split(backend_prefix)[1].split(",")

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Count Intel GPUs via dpctl; 0 when dpctl is missing or fails."""
        try:
            import dpctl
        except ImportError:
            return 0

        try:
            sycl_filter = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE
            return dpctl.SyclContext(sycl_filter).device_count
        except Exception:
            return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the name of first Intel GPU. (supposed only one GPU type on a node)

        Example:
            name: 'Intel(R) Data Center GPU Max 1550'
            return name: 'Intel-GPU-Max-1550'

        Returns:
            A string representing the name of Intel GPU type, or None when
            dpctl is unavailable or device query fails.
        """
        try:
            import dpctl
        except ImportError:
            return None

        try:
            sycl_filter = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE + ":0"
            device = dpctl.SyclDevice(sycl_filter)
            # Keep the last two words of the marketing name, e.g. "Max 1550".
            return "Intel-GPU-" + "-".join(device.name.split(" ")[-2:])
        except Exception:
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Any quantity is accepted for Intel GPUs (fractional allowed)."""
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_xpu_devices: List[str],
    ) -> None:
        """Write the oneAPI selector so only the given devices are visible.

        Honors the RAY_EXPERIMENTAL_NOSET_... escape hatch.
        """
        if os.environ.get(NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR):
            return

        selector_value = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ",".join(
            str(device) for device in visible_xpu_devices
        )
        os.environ[
            IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = selector_value
.venv/lib/python3.11/site-packages/ray/_private/accelerators/neuron.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import json
import logging
import subprocess
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = (
    "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
)

# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
AWS_NEURON_INSTANCE_MAP = {
    "trn1.2xlarge": 2,
    "trn1.32xlarge": 32,
    "trn1n.32xlarge": 32,
    "inf2.xlarge": 2,
    "inf2.8xlarge": 2,
    "inf2.24xlarge": 12,
    "inf2.48xlarge": 24,
}


class NeuronAcceleratorManager(AcceleratorManager):
    """AWS Inferentia and Trainium accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        return "neuron_cores"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return NEURON_RT_VISIBLE_CORES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return visible NeuronCore ids, [] if explicitly none, None if unrestricted."""
        raw_cores = os.environ.get(
            NeuronAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if raw_cores is None:
            return None

        if raw_cores == "":
            return []

        return raw_cores.split(",")

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of Neuron cores on this machine.

        Runs the `neuron-ls` CLI (Linux only) and sums the core count
        reported for each Neuron device.

        Returns:
            The number of Neuron cores if any were detected, otherwise 0.
        """
        neuron_path = "/opt/aws/neuron/bin/"
        if not (sys.platform.startswith("linux") and os.path.isdir(neuron_path)):
            return 0

        result = subprocess.run(
            [os.path.join(neuron_path, "neuron-ls"), "--json-output"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if result.returncode != 0 or not result.stdout:
            return 0

        return sum(
            device.get("nc_count", 0) for device in json.loads(result.stdout)
        )

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """All Neuron nodes report the single AWS NeuronCore type."""
        from ray.util.accelerators import AWS_NEURON_CORE

        return AWS_NEURON_CORE

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """NeuronCore requests must be whole numbers (no fractional cores)."""
        if isinstance(quantity, float) and not quantity.is_integer():
            return (
                False,
                f"{NeuronAcceleratorManager.get_resource_name()} resource quantity"
                " must be whole numbers. "
                f"The specified quantity {quantity} is invalid.",
            )
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_neuron_core_ids: List[str],
    ) -> None:
        """Set the NEURON_RT_VISIBLE_CORES environment variable based on
        given visible_neuron_core_ids.

        Honors the RAY_EXPERIMENTAL_NOSET_... escape hatch.

        Args:
            visible_neuron_core_ids: Core ids to expose to this process.
        """
        if os.environ.get(NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR):
            return

        os.environ[
            NeuronAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(str(core_id) for core_id in visible_neuron_core_ids)

    @staticmethod
    def get_ec2_instance_num_accelerators(
        instance_type: str, instances: dict
    ) -> Optional[int]:
        # TODO: AWS SDK (public API) doesn't yet expose the NeuronCore
        # information. It will be available (work-in-progress)
        # as xxAcceleratorInfo in InstanceTypeInfo.
        # https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_InstanceTypeInfo.html
        # See https://github.com/ray-project/ray/issues/38473
        return AWS_NEURON_INSTANCE_MAP.get(instance_type.lower(), None)

    @staticmethod
    def get_ec2_instance_accelerator_type(
        instance_type: str, instances: dict
    ) -> Optional[str]:
        """All Neuron-bearing instance types report the AWS NeuronCore type."""
        from ray.util.accelerators import AWS_NEURON_CORE

        return AWS_NEURON_CORE
.venv/lib/python3.11/site-packages/ray/_private/accelerators/npu.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import glob
import logging
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = (
    "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"
)


class NPUAcceleratorManager(AcceleratorManager):
    """Ascend NPU accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        return "NPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        return ASCEND_RT_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return visible NPU ids; [] when explicitly empty or "NoDevFiles",
        None when the env var is unset (all devices visible)."""
        visible = os.environ.get(
            NPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if visible is None:
            return None

        if visible in ("", "NoDevFiles"):
            return []

        return visible.split(",")

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of NPUs on this machine.

        Tries the AscendCL runtime first; when that is unavailable, falls
        back to counting the `/dev/davinci*` character devices.

        Returns:
            The number of NPUs if any were detected, otherwise 0.
        """
        try:
            import acl

            device_count, ret = acl.rt.get_device_count()
            if ret == 0:
                return device_count
            # Non-zero return code: fall through to the /dev scan below.
        except Exception as e:
            logger.debug("Could not import AscendCL: %s", e)

        try:
            return len(glob.glob("/dev/davinci[0-9]*"))
        except Exception as e:
            logger.debug("Failed to detect number of NPUs: %s", e)
            return 0

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the type of the Ascend NPU on the current node.

        Returns:
            A string of the type, such as "Ascend910A", "Ascend910B",
            "Ascend310P1"; None when AscendCL is unavailable.
        """
        try:
            import acl

            return acl.get_soc_name()
        except Exception:
            logger.exception("Failed to detect NPU type.")
            return None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Any quantity is accepted for NPUs (fractional allowed)."""
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_npu_devices: List[str],
    ) -> None:
        """Point ASCEND_RT_VISIBLE_DEVICES at the given device ids.

        Honors the RAY_EXPERIMENTAL_NOSET_... escape hatch.
        """
        if os.environ.get(NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR):
            return

        os.environ[
            NPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join(str(device) for device in visible_npu_devices)
.venv/lib/python3.11/site-packages/ray/_private/accelerators/nvidia_gpu.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import logging
4
+ from typing import Optional, List, Tuple
5
+
6
+ from ray._private.accelerators.accelerator import AcceleratorManager
7
+
8
# Module-level logger for this accelerator backend.
logger = logging.getLogger(__name__)

# Environment variable the CUDA runtime reads to restrict which GPUs a
# process may use.
CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
# When set, Ray will NOT write CUDA_VISIBLE_DEVICES for workers.
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"

# Extracts the accelerator type token (e.g. "V100") from an NVML device
# name such as "Tesla V100-SXM2-16GB".
# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
16
+
17
+
18
class NvidiaGPUAcceleratorManager(AcceleratorManager):
    """Nvidia GPU accelerators.

    NVML sessions opened by detection methods are always closed before
    returning, even on error paths, so repeated calls do not leak NVML
    state.
    """

    @staticmethod
    def get_resource_name() -> str:
        """Ray resource name under which these accelerators are scheduled."""
        return "GPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Environment variable CUDA uses to restrict visible devices."""
        return CUDA_VISIBLE_DEVICES_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return GPU ids visible to this process.

        Returns:
            None when CUDA_VISIBLE_DEVICES is unset (no restriction); an
            empty list when it is "" or "NoDevFiles" (no usable GPUs);
            otherwise the list of device id strings.
        """
        cuda_visible_devices = os.environ.get(
            NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )
        if cuda_visible_devices is None:
            return None

        if cuda_visible_devices == "":
            return []

        # Some drivers report "NoDevFiles" when no device nodes exist.
        if cuda_visible_devices == "NoDevFiles":
            return []

        return list(cuda_visible_devices.split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        """Detect the number of GPUs on this node via NVML.

        Returns:
            The GPU count, or 0 if NVML cannot be initialized.
        """
        import ray._private.thirdparty.pynvml as pynvml

        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            return 0  # pynvml init failed
        try:
            return pynvml.nvmlDeviceGetCount()
        finally:
            # Always release the NVML session, even if the device query
            # raises (previously shutdown was skipped on error).
            pynvml.nvmlShutdown()

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Detect the accelerator type (e.g. "V100") of GPU 0 on this node.

        Returns:
            The parsed accelerator type, or None if NVML is unavailable,
            no GPU is present, or the device name cannot be parsed.
        """
        import ray._private.thirdparty.pynvml as pynvml

        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            return None  # pynvml init failed
        try:
            cuda_device_type = None
            if pynvml.nvmlDeviceGetCount() > 0:
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                device_name = pynvml.nvmlDeviceGetName(handle)
                # Older NVML bindings return bytes rather than str.
                if isinstance(device_name, bytes):
                    device_name = device_name.decode("utf-8")
                cuda_device_type = (
                    NvidiaGPUAcceleratorManager._gpu_name_to_accelerator_type(
                        device_name
                    )
                )
            return cuda_device_type
        finally:
            # Always release the NVML session, even on error paths.
            pynvml.nvmlShutdown()

    @staticmethod
    def _gpu_name_to_accelerator_type(name):
        """Extract the accelerator type token from an NVML device name.

        E.g. "Tesla V100-SXM2-16GB" -> "V100". Returns None when the name
        is missing or does not match NVIDIA_GPU_NAME_PATTERN.
        """
        if name is None:
            return None
        match = NVIDIA_GPU_NAME_PATTERN.match(name)
        return match.group(1) if match else None

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Any GPU quantity (including fractional) is valid."""
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_cuda_devices: List[str],
    ) -> None:
        """Write CUDA_VISIBLE_DEVICES for this process.

        No-op when the user disabled Ray's device management via the
        NOSET environment variable.
        """
        if os.environ.get(NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR):
            return

        os.environ[
            NvidiaGPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join([str(i) for i in visible_cuda_devices])

    @staticmethod
    def get_ec2_instance_num_accelerators(
        instance_type: str, instances: dict
    ) -> Optional[int]:
        """Look up the GPU count for an EC2 instance type.

        Args:
            instance_type: e.g. "p3.2xlarge".
            instances: Mapping of instance type -> description, expected to
                follow the AWS "GpuInfo"/"Gpus" layout.

        Returns:
            The GPU count, or None if the type is unknown or has no GPUs.
        """
        if instance_type not in instances:
            return None

        gpus = instances[instance_type].get("GpuInfo", {}).get("Gpus")
        if gpus is not None:
            # TODO(ameer): currently we support one gpu type per node.
            assert len(gpus) == 1
            return gpus[0]["Count"]
        return None

    @staticmethod
    def get_ec2_instance_accelerator_type(
        instance_type: str, instances: dict
    ) -> Optional[str]:
        """Look up the GPU model name for an EC2 instance type.

        Returns:
            The GPU name (e.g. "V100"), or None if unknown.
        """
        if instance_type not in instances:
            return None

        gpus = instances[instance_type].get("GpuInfo", {}).get("Gpus")
        if gpus is not None:
            # TODO(ameer): currently we support one gpu type per node.
            assert len(gpus) == 1
            return gpus[0]["Name"]
        return None
.venv/lib/python3.11/site-packages/ray/_private/accelerators/tpu.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import glob
4
+ import requests
5
+ import logging
6
+ from functools import lru_cache
7
+ from typing import Dict, Optional, List, Tuple
8
+
9
+ from ray._private.accelerators.accelerator import AcceleratorManager
10
+
11
logger = logging.getLogger(__name__)


# Ray only supports reserving 1, 2, 4 or 8 TPU chips for a task/actor.
TPU_VALID_CHIP_OPTIONS = (1, 2, 4, 8)
# Environment variables preset on GKE-managed TPU pods.
GKE_TPU_ACCELERATOR_TYPE_ENV_VAR = "TPU_ACCELERATOR_TYPE"
GKE_TPU_WORKER_ID_ENV_VAR = "TPU_WORKER_ID"
GKE_TPU_NAME_ENV_VAR = "TPU_NAME"

# Constants for accessing the `accelerator-type` from TPU VM
# instance metadata.
# See https://cloud.google.com/compute/docs/metadata/overview
# for more details about VM instance metadata.
GCE_TPU_ACCELERATOR_ENDPOINT = (
    "http://metadata.google.internal/computeMetadata/v1/instance/attributes/"
)
GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"}
GCE_TPU_ACCELERATOR_KEY = "accelerator-type"
GCE_TPU_INSTANCE_ID_KEY = "instance-id"
GCE_TPU_WORKER_ID_KEY = "agent-worker-number"

# Environment variable restricting which TPU chips a process may use.
TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"

# When set, Ray will NOT write TPU_VISIBLE_CHIPS for workers.
NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS"

# The following defines environment variables that allow
# us to access a subset of TPU visible chips.
#
# See: https://github.com/google/jax/issues/14977 for an example/more details.
TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR = "TPU_CHIPS_PER_HOST_BOUNDS"
TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG = "1,1,1"
TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG = "1,2,1"

TPU_HOST_BOUNDS_ENV_VAR = "TPU_HOST_BOUNDS"
TPU_SINGLE_HOST_BOUNDS = "1,1,1"
45
+
46
+
47
def _get_tpu_metadata(key: str) -> Optional[str]:
    """Poll the GCE instance-metadata server for a TPU attribute.

    Args:
        key: Metadata attribute name, e.g. "accelerator-type".

    Returns:
        The metadata value, or None if it could not be fetched.
    """
    try:
        accelerator_type_request = requests.get(
            os.path.join(GCE_TPU_ACCELERATOR_ENDPOINT, key),
            headers=GCE_TPU_HEADERS,
            # Fail fast instead of hanging indefinitely when the metadata
            # server is unreachable (e.g. when running off-GCP); a timeout
            # is caught below as a RequestException.
            timeout=5,
        )
        if (
            accelerator_type_request.status_code == 200
            and accelerator_type_request.text
        ):
            return accelerator_type_request.text
        else:
            # Use the module logger (not the root logger) for consistency
            # with the rest of this file.
            logger.debug(
                "Unable to poll TPU GCE Metadata. Got "
                f"status code: {accelerator_type_request.status_code} and "
                f"content: {accelerator_type_request.text}"
            )
    except requests.RequestException as e:
        logger.debug("Unable to poll the TPU GCE Metadata: %s", e)
    return None
68
+
69
+
70
class TPUAcceleratorManager(AcceleratorManager):
    """Google TPU accelerators."""

    @staticmethod
    def get_resource_name() -> str:
        """Ray resource name under which TPU chips are scheduled."""
        return "TPU"

    @staticmethod
    def get_visible_accelerator_ids_env_var() -> str:
        """Environment variable restricting which TPU chips are visible."""
        return TPU_VISIBLE_CHIPS_ENV_VAR

    @staticmethod
    def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        """Return TPU chip ids visible to this process.

        Returns:
            None when TPU_VISIBLE_CHIPS is unset (no restriction), an
            empty list when it is set but empty, otherwise the chip ids.
        """
        tpu_visible_chips = os.environ.get(
            TPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )

        if tpu_visible_chips is None:
            return None

        if tpu_visible_chips == "":
            return []

        return list(tpu_visible_chips.split(","))

    @staticmethod
    @lru_cache()
    def get_current_node_num_accelerators() -> int:
        """Attempt to detect the number of TPUs on this machine.

        TPU chips are represented as devices within `/dev/`, either as
        `/dev/accel*` or `/dev/vfio/*`. The result is cached for the
        lifetime of the process.

        Returns:
            The number of TPUs if any were detected, otherwise 0.
        """
        accel_files = glob.glob("/dev/accel*")
        if accel_files:
            return len(accel_files)

        try:
            vfio_entries = os.listdir("/dev/vfio")
            numeric_entries = [int(entry) for entry in vfio_entries if entry.isdigit()]
            return len(numeric_entries)
        except FileNotFoundError as e:
            logger.debug("Failed to detect number of TPUs: %s", e)
            return 0

    @staticmethod
    def is_valid_tpu_accelerator_type(tpu_accelerator_type: str) -> bool:
        """Check whether the tpu accelerator_type is formatted correctly.

        The accelerator_type field follows a form of v{generation}-{cores/chips}.

        See the following for more information:
        https://cloud.google.com/sdk/gcloud/reference/compute/tpus/tpu-vm/accelerator-types/describe

        Args:
            tpu_accelerator_type: The string representation of the accelerator type
                to be checked for validity.

        Returns:
            True if it's valid, false otherwise.
        """
        expected_pattern = re.compile(r"^v\d+[a-zA-Z]*-\d+$")
        if not expected_pattern.match(tpu_accelerator_type):
            return False
        return True

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        """Only whole-chip requests of 1, 2, 4 or 8 TPUs are supported."""
        if quantity not in TPU_VALID_CHIP_OPTIONS:
            return (
                False,
                f"The number of requested 'TPU' was set to {quantity} which "
                "is not a supported chip configuration. Supported configs: "
                f"{TPU_VALID_CHIP_OPTIONS}",
            )
        else:
            return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_tpu_chips: List[str],
    ) -> None:
        """Set TPU environment variables based on the provided visible_tpu_chips.

        To access a subset of the TPU visible chips, we must use a combination of
        environment variables that tells the compiler (via ML framework) the:
        - Visible chips
        - The physical bounds of chips per host
        - The host bounds within the context of a TPU pod.

        See: https://github.com/google/jax/issues/14977 for an example/more details.

        Args:
            visible_tpu_chips (List[str]): List of int representing TPU chips.
        """
        if os.environ.get(NOSET_TPU_VISIBLE_CHIPS_ENV_VAR):
            return

        num_visible_tpu_chips = len(visible_tpu_chips)
        num_accelerators_on_node = (
            TPUAcceleratorManager.get_current_node_num_accelerators()
        )
        if num_visible_tpu_chips == num_accelerators_on_node:
            # Let the ML framework use the defaults
            os.environ.pop(TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR, None)
            os.environ.pop(TPU_HOST_BOUNDS_ENV_VAR, None)
            return
        os.environ[
            TPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = ",".join([str(i) for i in visible_tpu_chips])
        if num_visible_tpu_chips == 1:
            os.environ[
                TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR
            ] = TPU_CHIPS_PER_HOST_BOUNDS_1_CHIP_CONFIG
            os.environ[TPU_HOST_BOUNDS_ENV_VAR] = TPU_SINGLE_HOST_BOUNDS
        elif num_visible_tpu_chips == 2:
            os.environ[
                TPU_CHIPS_PER_HOST_BOUNDS_ENV_VAR
            ] = TPU_CHIPS_PER_HOST_BOUNDS_2_CHIP_CONFIG
            os.environ[TPU_HOST_BOUNDS_ENV_VAR] = TPU_SINGLE_HOST_BOUNDS

    @staticmethod
    def _get_current_node_tpu_pod_type() -> Optional[str]:
        """Get the TPU pod type of the current node if applicable.

        Individual TPU VMs within a TPU pod must know what type
        of pod it is a part of. This is necessary for the
        ML framework to work properly.

        The logic is different if the TPU was provisioned via:
        ```
        gcloud tpus tpu-vm create ...
        ```
        (i.e. a GCE VM), vs through GKE:
        - GCE VMs will always have a metadata server to poll this info
        - GKE VMS will have environment variables preset.

        Returns:
            A string representing the current TPU pod type, e.g.
            v4-16.
        """
        # Start with GKE-based check
        accelerator_type = os.getenv(GKE_TPU_ACCELERATOR_TYPE_ENV_VAR, "")
        if not accelerator_type:
            # GCE-based VM check
            accelerator_type = _get_tpu_metadata(key=GCE_TPU_ACCELERATOR_KEY)
        if accelerator_type and TPUAcceleratorManager.is_valid_tpu_accelerator_type(
            tpu_accelerator_type=accelerator_type
        ):
            return accelerator_type
        # Use the module logger for consistency with the rest of the file.
        logger.debug("Failed to get a valid accelerator type.")
        return None

    @staticmethod
    def get_current_node_tpu_name() -> Optional[str]:
        """Return the name of the TPU pod that this worker node is a part of.

        For instance, if the TPU was created with name "my-tpu", this function
        will return "my-tpu".

        If created through the Ray cluster launcher, the
        name will typically be something like "ray-my-tpu-cluster-worker-aa946781-tpu".

        In case the TPU was created through KubeRay, we currently expect that the
        environment variable TPU_NAME is set per TPU pod slice, in which case
        this function will return the value of that environment variable.
        """
        try:
            # Start with GKE-based check
            tpu_name = os.getenv(GKE_TPU_NAME_ENV_VAR, None)
            if not tpu_name:
                # GCE-based VM check
                tpu_name = _get_tpu_metadata(key=GCE_TPU_INSTANCE_ID_KEY)
            return tpu_name
        except ValueError as e:
            logger.debug("Could not get TPU name: %s", e)
            return None

    @staticmethod
    def _get_current_node_tpu_worker_id() -> Optional[int]:
        """Return the worker index of the TPU pod."""
        try:
            # Start with GKE-based check
            worker_id = os.getenv(GKE_TPU_WORKER_ID_ENV_VAR, None)
            if not worker_id:
                # GCE-based VM check
                worker_id = _get_tpu_metadata(key=GCE_TPU_WORKER_ID_KEY)
            if worker_id:
                return int(worker_id)
            else:
                return None
        except ValueError as e:
            logger.debug("Could not get TPU worker id: %s", e)
            return None

    @staticmethod
    def get_num_workers_in_current_tpu_pod() -> Optional[int]:
        """Return the total number of workers in a TPU pod."""
        tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type()
        cores_per_host = TPUAcceleratorManager.get_current_node_num_accelerators()
        if tpu_pod_type and cores_per_host > 0:
            # Pod types look like "v4-16": the suffix is the total number
            # of chips/cores in the slice.
            num_chips_or_cores = int(tpu_pod_type.split("-")[1])
            return num_chips_or_cores // cores_per_host
        else:
            logger.debug("Could not get num workers in TPU pod.")
            return None

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Attempt to detect the TPU accelerator type.

        The output of this function will return the "ray accelerator type"
        resource (e.g. TPU-V4) that indicates the TPU version.

        We also expect that our TPU nodes contain a "TPU pod type"
        resource, which indicates information about the topology of
        the TPU pod slice.

        We expect that the "TPU pod type" resource to be used when
        running multi host workers, i.e. when TPU units are pod slices.

        We expect that the "ray accelerator type" resource to be used when
        running single host workers, i.e. when TPU units are single hosts.

        Returns:
            A string representing the TPU accelerator type,
            e.g. "TPU-V2", "TPU-V3", "TPU-V4" if applicable, else None.
        """

        def tpu_pod_type_to_ray_accelerator_type(
            tpu_pod_type: str,
        ) -> Optional[str]:
            # "v4-16" -> "TPU-V4".
            return "TPU-" + str(tpu_pod_type.split("-")[0].upper())

        ray_accelerator_type = None
        tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type()

        if tpu_pod_type is not None:
            ray_accelerator_type = tpu_pod_type_to_ray_accelerator_type(
                tpu_pod_type=tpu_pod_type
            )
            if ray_accelerator_type is None:
                logger.info(
                    "While trying to autodetect a TPU type, "
                    f"received malformed accelerator_type: {tpu_pod_type}"
                )

        if ray_accelerator_type is None:
            logger.info("Failed to auto-detect TPU type.")

        return ray_accelerator_type

    # NOTE: this was previously missing @staticmethod; without it, the
    # zero-argument def breaks any instance/class-bound invocation.
    @staticmethod
    def get_current_node_additional_resources() -> Optional[Dict[str, float]]:
        """Get additional resources required for TPU nodes.

        This will populate the TPU pod type and the TPU name which
        is used for TPU pod execution.

        When running workloads on a TPU pod, we need a way to run
        the same binary on every worker in the TPU pod.

        See https://jax.readthedocs.io/en/latest/multi_process.html
        for more information.

        To do this in ray, we take advantage of custom resources. We
        mark worker 0 of the TPU pod as a "coordinator" that identifies
        the other workers in the TPU pod. We therefore need:
        - worker 0 to be targetable.
        - all workers in the TPU pod to have a unique identifier consistent
          within a TPU pod.

        So assuming we want to run the following workload:

        @ray.remote
        def my_jax_fn():
            import jax
            return jax.device_count()

        We could broadcast this on a TPU pod (e.g. a v4-16) as follows:

        @ray.remote(resources={"TPU-v4-16-head": 1})
        def run_jax_fn(executable):
            # Note this will execute on worker 0
            tpu_name = ray.util.accelerators.tpu.get_tpu_pod_name()
            num_workers = ray.util.accelerators.tpu.get_tpu_num_workers()
            tpu_executable = executable.options(resources={"TPU": 4, tpu_name: 1})
            return [tpu_executable.remote() for _ in range(num_workers)]

        Returns:
            A dictionary representing additional resources that may be
            necessary for a particular accelerator type.
        """
        resources = {}
        tpu_name = TPUAcceleratorManager.get_current_node_tpu_name()
        worker_id = TPUAcceleratorManager._get_current_node_tpu_worker_id()
        tpu_pod_type = TPUAcceleratorManager._get_current_node_tpu_pod_type()

        if tpu_name and worker_id is not None and tpu_pod_type:
            pod_head_resource_name = f"TPU-{tpu_pod_type}-head"
            # Add the name of the TPU to the resource.
            resources[tpu_name] = 1
            # Only add in the TPU pod type resource to worker 0.
            if worker_id == 0:
                resources[pod_head_resource_name] = 1
        else:
            logger.info(
                "Failed to configure TPU pod. Got: "
                "tpu_name: %s, worker_id: %s, accelerator_type: %s",
                tpu_name,
                worker_id,
                tpu_pod_type,
            )
        if resources:
            return resources
        return None
.venv/lib/python3.11/site-packages/ray/_private/runtime_env/agent/thirdparty_files/propcache/_helpers_c.cpython-311-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a87371c20cf73e0fe5df7f255ec4523368eff6d0a6e61a6fd6a730892a134935
3
+ size 800728
.venv/lib/python3.11/site-packages/ray/_private/usage/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_constants.cpython-311.pyc ADDED
Binary file (2.48 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/usage/__pycache__/usage_lib.cpython-311.pyc ADDED
Binary file (44.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/usage/usage_constants.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Version of the usage-stats payload schema emitted by usage_lib.
SCHEMA_VERSION = "0.1"

# The key to store / obtain cluster metadata.
CLUSTER_METADATA_KEY = b"CLUSTER_METADATA"

# The name of a json file where usage stats will be written.
USAGE_STATS_FILE = "usage_stats.json"

# Set to "0"/"1" to explicitly disable/enable usage-stats collection.
USAGE_STATS_ENABLED_ENV_VAR = "RAY_USAGE_STATS_ENABLED"

USAGE_STATS_SOURCE_ENV_VAR = "RAY_USAGE_STATS_SOURCE"

USAGE_STATS_SOURCE_OSS = "OSS"

# User-facing messages printed at cluster/driver startup.
USAGE_STATS_ENABLED_FOR_CLI_MESSAGE = (
    "Usage stats collection is enabled. To disable this, add `--disable-usage-stats` "
    "to the command that starts the cluster, or run the following command:"
    " `ray disable-usage-stats` before starting the cluster. "
    "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)

USAGE_STATS_ENABLED_FOR_RAY_INIT_MESSAGE = (
    "Usage stats collection is enabled. To disable this, run the following command:"
    " `ray disable-usage-stats` before starting Ray. "
    "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)

USAGE_STATS_DISABLED_MESSAGE = "Usage stats collection is disabled."

USAGE_STATS_ENABLED_BY_DEFAULT_FOR_CLI_MESSAGE = (
    "Usage stats collection is enabled by default without user confirmation "
    "because this terminal is detected to be non-interactive. "
    "To disable this, add `--disable-usage-stats` to the command that starts "
    "the cluster, or run the following command:"
    " `ray disable-usage-stats` before starting the cluster. "
    "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)

USAGE_STATS_ENABLED_BY_DEFAULT_FOR_RAY_INIT_MESSAGE = (
    "Usage stats collection is enabled by default for nightly wheels. "
    "To disable this, run the following command:"
    " `ray disable-usage-stats` before starting Ray. "
    "See https://docs.ray.io/en/master/cluster/usage-stats.html for more details."
)

USAGE_STATS_CONFIRMATION_MESSAGE = (
    "Enable usage stats collection? "
    "This prompt will auto-proceed in 10 seconds to avoid blocking cluster startup."
)

# Internal-KV key prefixes for recorded library / hardware usage values.
LIBRARY_USAGE_SET_NAME = "library_usage_"

HARDWARE_USAGE_SET_NAME = "hardware_usage_"

# Keep in-sync with the same constants defined in usage_stats_client.h
EXTRA_USAGE_TAG_PREFIX = "extra_usage_tag_"
USAGE_STATS_NAMESPACE = "usage_stats"

# Environment variables used to detect Kubernetes / KubeRay deployments.
KUBERNETES_SERVICE_HOST_ENV = "KUBERNETES_SERVICE_HOST"
KUBERAY_ENV = "RAY_USAGE_STATS_KUBERAY_IN_USE"

PROVIDER_KUBERNETES_GENERIC = "kubernetes"
PROVIDER_KUBERAY = "kuberay"
.venv/lib/python3.11/site-packages/ray/_private/usage/usage_lib.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This is the module that is in charge of Ray usage report (telemetry) APIs.
2
+
3
+ NOTE: Ray's usage report is currently "on by default".
4
+ One could opt-out, see details at https://docs.ray.io/en/master/cluster/usage-stats.html. # noqa
5
+
6
+ Ray usage report follows the specification from
7
+ https://docs.google.com/document/d/1ZT-l9YbGHh-iWRUC91jS-ssQ5Qe2UQ43Lsoc1edCalc/edit#heading=h.17dss3b9evbj. # noqa
8
+
9
+ # Module
10
+
11
+ The module consists of 2 parts.
12
+
13
+ ## Public API
14
+ It contains public APIs to obtain usage report information.
15
+ APIs will be added before the usage report becomes opt-in by default.
16
+
17
+ ## Internal APIs for usage processing/report
18
+ The telemetry report consists of 5 components. This module is in charge of the top 2 layers.
19
+
20
+ Report -> usage_lib
21
+ ---------------------
22
+ Usage data processing -> usage_lib
23
+ ---------------------
24
+ Data storage -> Ray API server
25
+ ---------------------
26
+ Aggregation -> Ray API server (currently a dashboard server)
27
+ ---------------------
28
+ Usage data collection -> Various components (Ray agent, GCS, etc.) + usage_lib (cluster metadata).
29
+
30
+ Usage report is currently "off by default". You can enable the report by setting an environment variable
31
+ RAY_USAGE_STATS_ENABLED=1. For example, `RAY_USAGE_STATS_ENABLED=1 ray start --head`.
32
+ Or `RAY_USAGE_STATS_ENABLED=1 python [drivers with ray.init()]`.
33
+
34
+ "Ray API server (currently a dashboard server)" reports the usage data to https://usage-stats.ray.io/.
35
+
36
+ Data is reported every hour by default.
37
+
38
+ Note that it is also possible to configure the interval using the environment variable,
39
+ `RAY_USAGE_STATS_REPORT_INTERVAL_S`.
40
+
41
+ To see collected/reported data, see `usage_stats.json` inside a temp
42
+ folder (e.g., /tmp/ray/session_[id]/*).
43
+ """
44
+ import json
45
+ import logging
46
+ import threading
47
+ import os
48
+ import platform
49
+ import sys
50
+ import time
51
+ from dataclasses import asdict, dataclass
52
+ from enum import Enum, auto
53
+ from pathlib import Path
54
+ from typing import Dict, List, Optional, Set
55
+
56
+ import requests
57
+ import yaml
58
+
59
+ import ray
60
+ from ray._raylet import GcsClient
61
+ import ray._private.ray_constants as ray_constants
62
+ import ray._private.usage.usage_constants as usage_constant
63
+ from ray.experimental.internal_kv import (
64
+ _internal_kv_initialized,
65
+ _internal_kv_put,
66
+ )
67
+ from ray.core.generated import usage_pb2, gcs_pb2
68
+
69
+ logger = logging.getLogger(__name__)
70
+ TagKey = usage_pb2.TagKey
71
+
72
+ #################
73
+ # Internal APIs #
74
+ #################
75
+
76
+
77
@dataclass(init=True)
class ClusterConfigToReport:
    # Cluster-launcher configuration extracted from the cluster.yaml file.
    # All fields are optional because the file may be absent or only
    # partially specified.
    cloud_provider: Optional[str] = None  # e.g. "aws", "gcp".
    min_workers: Optional[int] = None
    max_workers: Optional[int] = None
    head_node_instance_type: Optional[str] = None
    worker_node_instance_types: Optional[List[str]] = None
84
+
85
+
86
@dataclass(init=True)
class ClusterStatusToReport:
    # Aggregate cluster resource totals; None until/unless known.
    total_num_cpus: Optional[int] = None
    total_num_gpus: Optional[int] = None
    # Memory sizes are expressed in gigabytes.
    total_memory_gb: Optional[float] = None
    total_object_store_memory_gb: Optional[float] = None
92
+
93
+
94
@dataclass(init=True)
class UsageStatsToReport:
    """Usage stats to report.

    The first three fields are required report headers; every other field
    is best-effort and defaults to None when unavailable.
    """

    #: The schema version of the report.
    schema_version: str
    #: The source of the data (i.e. OSS).
    source: str
    #: When the data is collected and reported.
    collect_timestamp_ms: int
    #: The total number of successful reports for the lifetime of the cluster.
    total_success: Optional[int] = None
    #: The total number of failed reports for the lifetime of the cluster.
    total_failed: Optional[int] = None
    #: The sequence number of the report.
    seq_number: Optional[int] = None
    #: The Ray version in use.
    ray_version: Optional[str] = None
    #: The Python version in use.
    python_version: Optional[str] = None
    #: A random id of the cluster session.
    session_id: Optional[str] = None
    #: The git commit hash of Ray (i.e. ray.__commit__).
    git_commit: Optional[str] = None
    #: The operating system in use.
    os: Optional[str] = None
    #: When the cluster is started.
    session_start_timestamp_ms: Optional[int] = None
    #: The cloud provider found in the cluster.yaml file (e.g., aws).
    cloud_provider: Optional[str] = None
    #: The min_workers found in the cluster.yaml file.
    min_workers: Optional[int] = None
    #: The max_workers found in the cluster.yaml file.
    max_workers: Optional[int] = None
    #: The head node instance type found in the cluster.yaml file (e.g., i3.8xlarge).
    head_node_instance_type: Optional[str] = None
    #: The worker node instance types found in the cluster.yaml file (e.g., i3.8xlarge).
    worker_node_instance_types: Optional[List[str]] = None
    #: The total num of cpus in the cluster.
    total_num_cpus: Optional[int] = None
    #: The total num of gpus in the cluster.
    total_num_gpus: Optional[int] = None
    #: The total size of memory in the cluster.
    total_memory_gb: Optional[float] = None
    #: The total size of object store memory in the cluster.
    total_object_store_memory_gb: Optional[float] = None
    #: The Ray libraries that are used (e.g., rllib).
    library_usages: Optional[List[str]] = None
    #: The extra tags to report when specified by an
    # environment variable RAY_USAGE_STATS_EXTRA_TAGS
    extra_usage_tags: Optional[Dict[str, str]] = None
    #: The number of alive nodes when the report is generated.
    total_num_nodes: Optional[int] = None
    #: The total number of running jobs excluding internal ones
    # when the report is generated.
    total_num_running_jobs: Optional[int] = None
    #: The libc version in the OS.
    libc_version: Optional[str] = None
    #: The hardwares that are used (e.g. Intel Xeon).
    hardware_usages: Optional[List[str]] = None
154
+
155
+
156
@dataclass(init=True)
class UsageStatsToWrite:
    """Usage stats to write to `USAGE_STATS_FILE`

    We are writing extra metadata such as the status of report
    to this file.
    """

    # The stats payload that was (attempted to be) reported.
    usage_stats: UsageStatsToReport
    # Whether or not the last report succeeded.
    success: bool
    # The error message of the last report if it happens.
    error: str
169
+
170
+
171
class UsageStatsEnabledness(Enum):
    # Tri-state: the user explicitly enabled/disabled collection, or no
    # preference was expressed and the default (enabled) applies.
    ENABLED_EXPLICITLY = auto()
    DISABLED_EXPLICITLY = auto()
    ENABLED_BY_DEFAULT = auto()
175
+
176
+
177
# Library usages already recorded this session (avoids duplicate KV writes);
# guarded by the lock below.
_recorded_library_usages = set()
_recorded_library_usages_lock = threading.Lock()
# Extra usage-tag key -> value pairs already recorded this session;
# guarded by the lock below.
_recorded_extra_usage_tags = dict()
_recorded_extra_usage_tags_lock = threading.Lock()
181
+
182
+
183
def _add_to_usage_set(set_name: str, value: str):
    """Record `value` into the GCS-backed usage set named `set_name`.

    The set is modeled as internal-KV keys of the form
    f"{set_name}{value}" with empty payloads. Best-effort: failures are
    logged at debug level and swallowed so telemetry never breaks callers.
    """
    assert _internal_kv_initialized()
    try:
        _internal_kv_put(
            f"{set_name}{value}".encode(),
            b"",
            namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(),
        )
    except Exception as e:
        # Lazy %-style args avoid formatting unless debug logging is on.
        logger.debug("Failed to add %s to usage set %s, %s", value, set_name, e)
193
+
194
+
195
def _get_usage_set(gcs_client, set_name: str) -> Set[str]:
    """Read back the members of the usage set `set_name` from the GCS.

    Inverse of `_add_to_usage_set`: lists all KV keys with the set-name
    prefix and strips that prefix to recover the member values. Returns
    an empty set on any error (best-effort).
    """
    try:
        result = set()
        usages = gcs_client.internal_kv_keys(
            set_name.encode(),
            namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(),
        )
        for usage in usages:
            usage = usage.decode("utf-8")
            # Drop the set-name prefix to recover the recorded value.
            result.add(usage[len(set_name) :])

        return result
    except Exception as e:
        logger.debug(f"Failed to get usage set {set_name}, {e}")
        return set()
210
+
211
+
212
def _put_library_usage(library_usage: str):
    """Record one library usage into the GCS-backed library usage set."""
    _add_to_usage_set(usage_constant.LIBRARY_USAGE_SET_NAME, library_usage)
214
+
215
+
216
def _put_hardware_usage(hardware_usage: str):
    """Record one hardware usage into the GCS-backed hardware usage set."""
    _add_to_usage_set(usage_constant.HARDWARE_USAGE_SET_NAME, hardware_usage)
218
+
219
+
220
def record_extra_usage_tag(
    key: TagKey, value: str, gcs_client: Optional[GcsClient] = None
):
    """Record extra kv usage tag.

    If the key already exists, the value will be overwritten.

    To record an extra tag, first add the key to the TagKey enum and
    then call this function.
    It will make a synchronous call to the internal kv store if the tag is updated.

    Args:
        key: The key of the tag.
        value: The value of the tag.
        gcs_client: The GCS client to perform KV operation PUT. Defaults to None.
            When None, it will try to get the global client from the internal_kv.
    """
    key = TagKey.Name(key).lower()
    # Record locally first (and dedupe) under the lock; the KV write below is
    # deliberately done outside the lock.
    with _recorded_extra_usage_tags_lock:
        if _recorded_extra_usage_tags.get(key) == value:
            return
        _recorded_extra_usage_tags[key] = value

    if not _internal_kv_initialized() and gcs_client is None:
        # This happens if the record is before ray.init and
        # no GCS client is used for recording explicitly.
        # The tag stays in _recorded_extra_usage_tags and is flushed later
        # by the post-init hook.
        return

    _put_extra_usage_tag(key, value, gcs_client)
249
+
250
+
251
def _put_extra_usage_tag(key: str, value: str, gcs_client: Optional[GcsClient] = None):
    """Write one extra usage tag to the GCS KV store.

    Best-effort: failures are logged at debug level and swallowed.

    Args:
        key: Lower-cased TagKey name (prefixed before storage).
        value: The tag value.
        gcs_client: Explicit GCS client to use; when None, the global
            internal kv (which must be initialized) is used instead.
    """
    try:
        key = f"{usage_constant.EXTRA_USAGE_TAG_PREFIX}{key}".encode()
        val = value.encode()
        namespace = usage_constant.USAGE_STATS_NAMESPACE.encode()
        if gcs_client is not None:
            # Use the GCS client.
            gcs_client.internal_kv_put(key, val, namespace=namespace)
        else:
            # Use internal kv.
            assert _internal_kv_initialized()
            _internal_kv_put(key, val, namespace=namespace)
    except Exception as e:
        logger.debug(f"Failed to put extra usage tag, {e}")
265
+
266
+
267
def record_hardware_usage(hardware_usage: str):
    """Record hardware usage (e.g. which CPU model is used).

    Requires the internal kv to already be initialized (i.e. after ray.init).
    """
    assert _internal_kv_initialized()
    _put_hardware_usage(hardware_usage)
271
+
272
+
273
def record_library_usage(library_usage: str):
    """Record library usage (e.g. which library is used).

    Deduplicates per process; when called before ray.init() the usage is
    kept locally and flushed later by the post-init hook.
    """
    with _recorded_library_usages_lock:
        if library_usage in _recorded_library_usages:
            return
        _recorded_library_usages.add(library_usage)

    if not _internal_kv_initialized():
        # This happens if the library is imported before ray.init
        return

    # Only report lib usage for driver / ray client / workers. Otherwise,
    # it can be reported if the library is imported from
    # e.g., API server.
    if (
        ray._private.worker.global_worker.mode == ray.SCRIPT_MODE
        or ray._private.worker.global_worker.mode == ray.WORKER_MODE
        or ray.util.client.ray.is_connected()
    ):
        _put_library_usage(library_usage)
293
+
294
+
295
def _put_pre_init_library_usages():
    """Flush library usages recorded before ray.init() to the GCS."""
    assert _internal_kv_initialized()
    # NOTE: When the lib is imported from a worker, ray should
    # always be initialized, so there's no need to register the
    # pre init hook.
    if not (
        ray._private.worker.global_worker.mode == ray.SCRIPT_MODE
        or ray.util.client.ray.is_connected()
    ):
        return

    for library_usage in _recorded_library_usages:
        _put_library_usage(library_usage)
308
+
309
+
310
def _put_pre_init_extra_usage_tags():
    """Flush extra usage tags recorded before ray.init() to the GCS."""
    assert _internal_kv_initialized()
    for k, v in _recorded_extra_usage_tags.items():
        _put_extra_usage_tag(k, v)
314
+
315
+
316
def put_pre_init_usage_stats():
    """Flush all usage data recorded before ray.init() (runs as a post-init hook)."""
    _put_pre_init_library_usages()
    _put_pre_init_extra_usage_tags()
319
+
320
+
321
def reset_global_state():
    """Reset the module-level recorded usage state.

    Rebinds (rather than clears) the containers, each under its lock.
    """
    global _recorded_library_usages, _recorded_extra_usage_tags

    with _recorded_library_usages_lock:
        _recorded_library_usages = set()
    with _recorded_extra_usage_tags_lock:
        _recorded_extra_usage_tags = dict()
328
+
329
+
330
+ ray._private.worker._post_init_hooks.append(put_pre_init_usage_stats)
331
+
332
+
333
def _usage_stats_report_url():
    """Return the usage collection server URL.

    The environment variable override exists for testing purposes only.
    """
    default_url = "https://usage-stats.ray.io/"
    return os.getenv("RAY_USAGE_STATS_REPORT_URL", default_url)
337
+
338
+
339
def _usage_stats_report_interval_s():
    """Return how often, in seconds, usage stats are reported (default 3600)."""
    interval = os.getenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", 3600)
    return int(interval)
341
+
342
+
343
def _usage_stats_config_path():
    """Return the path of the JSON config file storing the usage-stats choice."""
    default_path = os.path.expanduser("~/.ray/config.json")
    return os.getenv("RAY_USAGE_STATS_CONFIG_PATH", default_path)
347
+
348
+
349
def _usage_stats_enabledness() -> UsageStatsEnabledness:
    """Resolve whether usage stats are enabled.

    Precedence: the env var (explicit "0"/"1") wins over the "usage_stats"
    boolean in the config file; if neither is set, stats are enabled by
    default.

    Raises:
        ValueError: if the env var or the config value is set but not a
            valid boolean.
    """
    # Env var has higher priority than config file.
    usage_stats_enabled_env_var = os.getenv(usage_constant.USAGE_STATS_ENABLED_ENV_VAR)
    if usage_stats_enabled_env_var == "0":
        return UsageStatsEnabledness.DISABLED_EXPLICITLY
    elif usage_stats_enabled_env_var == "1":
        return UsageStatsEnabledness.ENABLED_EXPLICITLY
    elif usage_stats_enabled_env_var is not None:
        raise ValueError(
            f"Valid value for {usage_constant.USAGE_STATS_ENABLED_ENV_VAR} "
            f"env var is 0 or 1, but got {usage_stats_enabled_env_var}"
        )

    usage_stats_enabled_config_var = None
    try:
        with open(_usage_stats_config_path()) as f:
            config = json.load(f)
            usage_stats_enabled_config_var = config.get("usage_stats")
    except FileNotFoundError:
        # No config file: fall through to the default.
        pass
    except Exception as e:
        logger.debug(f"Failed to load usage stats config {e}")

    # `is` checks so only real booleans count; other truthy values error out.
    if usage_stats_enabled_config_var is False:
        return UsageStatsEnabledness.DISABLED_EXPLICITLY
    elif usage_stats_enabled_config_var is True:
        return UsageStatsEnabledness.ENABLED_EXPLICITLY
    elif usage_stats_enabled_config_var is not None:
        raise ValueError(
            f"Valid value for 'usage_stats' in {_usage_stats_config_path()}"
            f" is true or false, but got {usage_stats_enabled_config_var}"
        )

    # Usage stats is enabled by default.
    return UsageStatsEnabledness.ENABLED_BY_DEFAULT
384
+
385
+
386
def is_nightly_wheel() -> bool:
    """Return True when running from a nightly wheel.

    A built wheel has the commit template substituted and carries a "dev"
    marker in its version string.
    """
    return ray.__commit__ != "{{RAY_COMMIT_SHA}}" and "dev" in ray.__version__
388
+
389
+
390
def usage_stats_enabled() -> bool:
    """Return True unless usage stats were explicitly disabled."""
    return _usage_stats_enabledness() is not UsageStatsEnabledness.DISABLED_EXPLICITLY
392
+
393
+
394
def usage_stats_prompt_enabled():
    """Return True unless the user disabled the usage-stats prompt via env var."""
    raw = os.getenv("RAY_USAGE_STATS_PROMPT_ENABLED", "1")
    return int(raw) == 1
396
+
397
+
398
def _generate_cluster_metadata(*, ray_init_cluster: bool):
    """Return a dictionary of cluster metadata.

    Params:
        ray_init_cluster: Whether the cluster is started by ray.init()
    """
    ray_version, python_version = ray._private.utils.compute_version_info()
    # These two metadata is necessary although usage report is not enabled
    # to check version compatibility.
    metadata = {
        "ray_version": ray_version,
        "python_version": python_version,
        "ray_init_cluster": ray_init_cluster,
    }
    # Additional metadata is recorded only when usage stats are enabled.
    if usage_stats_enabled():
        metadata.update(
            {
                "git_commit": ray.__commit__,
                "os": sys.platform,
                "session_start_timestamp_ms": int(time.time() * 1000),
            }
        )
        if sys.platform == "linux":
            # Record libc version.
            (lib, ver) = platform.libc_ver()
            if not lib:
                # libc could not be detected.
                metadata.update({"libc_version": "NA"})
            else:
                metadata.update({"libc_version": f"{lib}:{ver}"})
    return metadata
429
+
430
+
431
def show_usage_stats_prompt(cli: bool) -> None:
    """Print the usage-stats notice, prompting interactive CLI users.

    Params:
        cli: True when invoked from the CLI (uses cli_logger and, when
            interactive, asks for confirmation); False for ray.init()
            (plain print, no prompting).
    """
    if not usage_stats_prompt_enabled():
        return

    from ray.autoscaler._private.cli_logger import cli_logger

    prompt_print = cli_logger.print if cli else print

    usage_stats_enabledness = _usage_stats_enabledness()
    if usage_stats_enabledness is UsageStatsEnabledness.DISABLED_EXPLICITLY:
        prompt_print(usage_constant.USAGE_STATS_DISABLED_MESSAGE)
    elif usage_stats_enabledness is UsageStatsEnabledness.ENABLED_BY_DEFAULT:
        if not cli:
            prompt_print(
                usage_constant.USAGE_STATS_ENABLED_BY_DEFAULT_FOR_RAY_INIT_MESSAGE
            )
        elif cli_logger.interactive:
            # Ask the user; defaults to enabled after a 10s timeout.
            enabled = cli_logger.confirm(
                False,
                usage_constant.USAGE_STATS_CONFIRMATION_MESSAGE,
                _default=True,
                _timeout_s=10,
            )
            set_usage_stats_enabled_via_env_var(enabled)
            # Remember user's choice.
            try:
                set_usage_stats_enabled_via_config(enabled)
            except Exception as e:
                logger.debug(
                    f"Failed to persist usage stats choice for future clusters: {e}"
                )
            if enabled:
                prompt_print(usage_constant.USAGE_STATS_ENABLED_FOR_CLI_MESSAGE)
            else:
                prompt_print(usage_constant.USAGE_STATS_DISABLED_MESSAGE)
        else:
            # Non-interactive CLI: inform only, keep the default.
            prompt_print(
                usage_constant.USAGE_STATS_ENABLED_BY_DEFAULT_FOR_CLI_MESSAGE,
            )
    else:
        assert usage_stats_enabledness is UsageStatsEnabledness.ENABLED_EXPLICITLY
        prompt_print(
            usage_constant.USAGE_STATS_ENABLED_FOR_CLI_MESSAGE
            if cli
            else usage_constant.USAGE_STATS_ENABLED_FOR_RAY_INIT_MESSAGE
        )
477
+
478
+
479
def set_usage_stats_enabled_via_config(enabled) -> None:
    """Persist the usage-stats choice into the user's config file.

    Existing config content is preserved; only the "usage_stats" key is
    (over)written.

    Raises:
        Exception: if the config file cannot be written.
    """
    config = {}
    try:
        with open(_usage_stats_config_path()) as f:
            config = json.load(f)
            if not isinstance(config, dict):
                logger.debug(
                    f"Invalid ray config file, should be a json dict but got {type(config)}"
                )
                config = {}
    except FileNotFoundError:
        # No existing config: start from an empty dict.
        pass
    except Exception as e:
        logger.debug(f"Failed to load ray config file {e}")

    config["usage_stats"] = enabled

    try:
        os.makedirs(os.path.dirname(_usage_stats_config_path()), exist_ok=True)
        with open(_usage_stats_config_path(), "w") as f:
            json.dump(config, f)
    except Exception as e:
        raise Exception(
            "Failed to "
            f'{"enable" if enabled else "disable"}'
            ' usage stats by writing {"usage_stats": '
            f'{"true" if enabled else "false"}'
            "} to "
            f"{_usage_stats_config_path()}"
        ) from e
509
+
510
+
511
def set_usage_stats_enabled_via_env_var(enabled) -> None:
    """Persist the usage-stats choice for this process via the env var."""
    value = "1" if enabled else "0"
    os.environ[usage_constant.USAGE_STATS_ENABLED_ENV_VAR] = value
513
+
514
+
515
def put_cluster_metadata(gcs_client, *, ray_init_cluster) -> dict:
    """Generate the cluster metadata and store it to GCS.

    It is a blocking API.

    Params:
        gcs_client: The GCS client to perform KV operation PUT.
        ray_init_cluster: Whether the cluster is started by ray.init()

    Returns:
        The cluster metadata dict that was stored.

    Raises:
        gRPC exceptions if PUT fails.
    """
    metadata = _generate_cluster_metadata(ray_init_cluster=ray_init_cluster)
    gcs_client.internal_kv_put(
        usage_constant.CLUSTER_METADATA_KEY,
        json.dumps(metadata).encode(),
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_CLUSTER,
    )
    # NOTE: the original annotated this function `-> None` although it has
    # always returned the metadata; the annotation now matches the behavior.
    return metadata
535
+
536
+
537
def get_total_num_running_jobs_to_report(gcs_client) -> Optional[int]:
    """Return the total number of running jobs in the cluster excluding internal ones.

    Params:
        gcs_client: The GCS client used to query job info.

    Returns:
        The count of alive, non-internal jobs, or None if the query fails.
    """
    try:
        result = gcs_client.get_all_job_info(
            skip_submission_job_info_field=True, skip_is_running_tasks_field=True
        )
        total_num_running_jobs = 0
        for job_info in result.values():
            # Skip dead jobs and Ray-internal jobs (namespace "_ray_internal*").
            if not job_info.is_dead and not job_info.config.ray_namespace.startswith(
                "_ray_internal"
            ):
                total_num_running_jobs += 1
        return total_num_running_jobs
    except Exception as e:
        # Fixed typo in the log message ("Faile" -> "Failed").
        logger.info(f"Failed to query number of running jobs in the cluster: {e}")
        return None
553
+
554
+
555
def get_total_num_nodes_to_report(gcs_client, timeout=None) -> Optional[int]:
    """Return the total number of alive nodes in the cluster.

    Params:
        gcs_client: The GCS client used to query node info.
        timeout: Optional timeout forwarded to the GCS query.

    Returns:
        The count of ALIVE nodes, or None if the query fails.
    """
    try:
        result = gcs_client.get_all_node_info(timeout=timeout)
        total_num_nodes = 0
        # Iterate values directly; the node id keys were unused.
        for node_info in result.values():
            if node_info.state == gcs_pb2.GcsNodeInfo.GcsNodeState.ALIVE:
                total_num_nodes += 1
        return total_num_nodes
    except Exception as e:
        # Fixed typo in the log message ("Faile" -> "Failed").
        logger.info(f"Failed to query number of nodes in the cluster: {e}")
        return None
567
+
568
+
569
def get_library_usages_to_report(gcs_client) -> List[str]:
    """Return all library usages recorded in the GCS usage set."""
    return list(_get_usage_set(gcs_client, usage_constant.LIBRARY_USAGE_SET_NAME))
571
+
572
+
573
def get_hardware_usages_to_report(gcs_client) -> List[str]:
    """Return all hardware usages recorded in the GCS usage set."""
    return list(_get_usage_set(gcs_client, usage_constant.HARDWARE_USAGE_SET_NAME))
575
+
576
+
577
def get_extra_usage_tags_to_report(gcs_client) -> Dict[str, str]:
    """Get the extra usage tags from env var and gcs kv store.

    The env var should be given this way; key=value;key=value.
    If parsing is failed, it will return the empty data.

    Tags read from the KV store overwrite env-var tags with the same key.

    Returns:
        Extra usage tags as kv pairs.
    """
    extra_usage_tags = dict()

    extra_usage_tags_env_var = os.getenv("RAY_USAGE_STATS_EXTRA_TAGS", None)
    if extra_usage_tags_env_var:
        try:
            # Tolerate a trailing/leading ';' before splitting pairs.
            kvs = extra_usage_tags_env_var.strip(";").split(";")
            for kv in kvs:
                k, v = kv.split("=")
                extra_usage_tags[k] = v
        except Exception as e:
            logger.info(f"Failed to parse extra usage tags env var. Error: {e}")

    valid_tag_keys = [tag_key.lower() for tag_key in TagKey.keys()]
    try:
        keys = gcs_client.internal_kv_keys(
            usage_constant.EXTRA_USAGE_TAG_PREFIX.encode(),
            namespace=usage_constant.USAGE_STATS_NAMESPACE.encode(),
        )
        for key in keys:
            value = gcs_client.internal_kv_get(
                key, namespace=usage_constant.USAGE_STATS_NAMESPACE.encode()
            )
            # Strip the storage prefix to recover the tag name.
            key = key.decode("utf-8")
            key = key[len(usage_constant.EXTRA_USAGE_TAG_PREFIX) :]
            assert key in valid_tag_keys
            extra_usage_tags[key] = value.decode("utf-8")
    except Exception as e:
        logger.info(f"Failed to get extra usage tags from kv store {e}")
    return extra_usage_tags
615
+
616
+
617
def _get_cluster_status_to_report_v2(gcs_client) -> ClusterStatusToReport:
    """
    Get the current status of this cluster. A temporary proxy for the
    autoscaler v2 API.

    It is a blocking API.

    Params:
        gcs_client: The GCS client.

    Returns:
        The current cluster status or empty ClusterStatusToReport
        if it fails to get that information.
    """
    from ray.autoscaler.v2.sdk import get_cluster_status

    result = ClusterStatusToReport()
    try:
        cluster_status = get_cluster_status(gcs_client.address)
        total_resources = cluster_status.total_resources()
        result.total_num_cpus = int(total_resources.get("CPU", 0))
        result.total_num_gpus = int(total_resources.get("GPU", 0))

        # Convert bytes to GiB.
        to_GiB = 1 / 2**30
        result.total_memory_gb = total_resources.get("memory", 0) * to_GiB
        result.total_object_store_memory_gb = (
            total_resources.get("object_store_memory", 0) * to_GiB
        )
    except Exception as e:
        logger.info(f"Failed to get cluster status to report {e}")
    # BUGFIX: the `return` previously lived in a `finally:` block, which
    # silently swallowed even BaseExceptions (e.g. KeyboardInterrupt /
    # SystemExit) raised inside the try body. A plain return after the
    # try/except preserves the best-effort Exception handling without
    # suppressing everything else.
    return result
649
+
650
+
651
def get_cluster_status_to_report(gcs_client) -> ClusterStatusToReport:
    """Get the current status of this cluster.

    It is a blocking API.

    Params:
        gcs_client: The GCS client to perform KV operation GET.

    Returns:
        The current cluster status or empty if it fails to get that information.
    """
    try:

        from ray.autoscaler.v2.utils import is_autoscaler_v2

        if is_autoscaler_v2():
            # Autoscaler v2 exposes status through its SDK rather than the
            # debug KV entry read below.
            return _get_cluster_status_to_report_v2(gcs_client)

        cluster_status = gcs_client.internal_kv_get(
            ray._private.ray_constants.DEBUG_AUTOSCALING_STATUS.encode(),
            namespace=None,
        )
        if not cluster_status:
            return ClusterStatusToReport()

        result = ClusterStatusToReport()
        # Convert bytes to GiB.
        to_GiB = 1 / 2**30
        cluster_status = json.loads(cluster_status.decode("utf-8"))
        if (
            "load_metrics_report" not in cluster_status
            or "usage" not in cluster_status["load_metrics_report"]
        ):
            return ClusterStatusToReport()

        usage = cluster_status["load_metrics_report"]["usage"]
        # usage is a map from resource to (used, total) pair
        if "CPU" in usage:
            result.total_num_cpus = int(usage["CPU"][1])
        if "GPU" in usage:
            result.total_num_gpus = int(usage["GPU"][1])
        if "memory" in usage:
            result.total_memory_gb = usage["memory"][1] * to_GiB
        if "object_store_memory" in usage:
            result.total_object_store_memory_gb = (
                usage["object_store_memory"][1] * to_GiB
            )
        return result
    except Exception as e:
        logger.info(f"Failed to get cluster status to report {e}")
        return ClusterStatusToReport()
701
+
702
+
703
def get_cluster_config_to_report(
    cluster_config_file_path: str,
) -> ClusterConfigToReport:
    """Get the static cluster (autoscaler) config used to launch this cluster.

    Params:
        cluster_config_file_path: The file path to the cluster config file.

    Returns:
        The cluster (autoscaler) config or empty if it fails to get that information.
    """

    def get_instance_type(node_config):
        # Extract the cloud instance type from a provider-specific node_config.
        if not node_config:
            return None
        if "InstanceType" in node_config:
            # aws
            return node_config["InstanceType"]
        if "machineType" in node_config:
            # gcp
            return node_config["machineType"]
        if (
            "azure_arm_parameters" in node_config
            and "vmSize" in node_config["azure_arm_parameters"]
        ):
            return node_config["azure_arm_parameters"]["vmSize"]
        return None

    try:
        with open(cluster_config_file_path) as f:
            config = yaml.safe_load(f)
            result = ClusterConfigToReport()
            if "min_workers" in config:
                result.min_workers = config["min_workers"]
            if "max_workers" in config:
                result.max_workers = config["max_workers"]

            if "provider" in config and "type" in config["provider"]:
                result.cloud_provider = config["provider"]["type"]

            if "head_node_type" not in config:
                return result
            if "available_node_types" not in config:
                return result
            head_node_type = config["head_node_type"]
            available_node_types = config["available_node_types"]
            for available_node_type in available_node_types:
                if available_node_type == head_node_type:
                    head_node_instance_type = get_instance_type(
                        available_node_types[available_node_type].get("node_config")
                    )
                    if head_node_instance_type:
                        result.head_node_instance_type = head_node_instance_type
                else:
                    worker_node_instance_type = get_instance_type(
                        available_node_types[available_node_type].get("node_config")
                    )
                    if worker_node_instance_type:
                        # Collect distinct worker instance types in a set first.
                        result.worker_node_instance_types = (
                            result.worker_node_instance_types or set()
                        )
                        result.worker_node_instance_types.add(worker_node_instance_type)
            if result.worker_node_instance_types:
                # Convert the set to a list for serialization.
                result.worker_node_instance_types = list(
                    result.worker_node_instance_types
                )
            return result
    except FileNotFoundError:
        # It's a manually started cluster or k8s cluster
        result = ClusterConfigToReport()
        # Check if we're on Kubernetes
        if usage_constant.KUBERNETES_SERVICE_HOST_ENV in os.environ:
            # Check if we're using KubeRay >= 0.4.0.
            if usage_constant.KUBERAY_ENV in os.environ:
                result.cloud_provider = usage_constant.PROVIDER_KUBERAY
            # Else, we're on Kubernetes but not in either of the above categories.
            else:
                result.cloud_provider = usage_constant.PROVIDER_KUBERNETES_GENERIC
        return result
    except Exception as e:
        logger.info(f"Failed to get cluster config to report {e}")
        return ClusterConfigToReport()
785
+
786
+
787
def get_cluster_metadata(gcs_client) -> dict:
    """Get the cluster metadata from GCS.

    It is a blocking API.

    NOTE(review): if `put_cluster_metadata` was never called, the KV GET
    returns None and the `.decode` below raises — this function does not
    actually return None in that case; confirm against callers.

    Params:
        gcs_client: The GCS client to perform KV operation GET.

    Returns:
        The cluster metadata in a dictionary.

    Raises:
        RuntimeError if it fails to obtain cluster metadata from GCS.
    """
    return json.loads(
        gcs_client.internal_kv_get(
            usage_constant.CLUSTER_METADATA_KEY,
            namespace=ray_constants.KV_NAMESPACE_CLUSTER,
        ).decode("utf-8")
    )
809
+
810
+
811
def is_ray_init_cluster(gcs_client: ray._raylet.GcsClient) -> bool:
    """Return whether the cluster is started by ray.init().

    Reads the "ray_init_cluster" flag stored by `put_cluster_metadata`.
    """
    cluster_metadata = get_cluster_metadata(gcs_client)
    return cluster_metadata["ray_init_cluster"]
815
+
816
+
817
def generate_disabled_report_data() -> UsageStatsToReport:
    """Generate the report data indicating usage stats is disabled."""
    source = os.getenv(
        usage_constant.USAGE_STATS_SOURCE_ENV_VAR,
        usage_constant.USAGE_STATS_SOURCE_OSS,
    )
    # Only the bare minimum fields are filled in; everything else stays None.
    return UsageStatsToReport(
        schema_version=usage_constant.SCHEMA_VERSION,
        source=source,
        collect_timestamp_ms=int(time.time() * 1000),
    )
828
+
829
+
830
def generate_report_data(
    cluster_config_to_report: ClusterConfigToReport,
    total_success: int,
    total_failed: int,
    seq_number: int,
    gcs_address: str,
    cluster_id: str,
) -> UsageStatsToReport:
    """Generate the report data.

    Params:
        cluster_config_to_report: The cluster (autoscaler)
            config generated by `get_cluster_config_to_report`.
        total_success: The total number of successful report
            for the lifetime of the cluster.
        total_failed: The total number of failed report
            for the lifetime of the cluster.
        seq_number: The sequence number that's incremented whenever
            a new report is sent.
        gcs_address: the address of gcs to get data to report.
        cluster_id: hex id of the cluster.

    Returns:
        UsageStats
    """
    assert cluster_id

    gcs_client = ray._raylet.GcsClient(
        address=gcs_address, nums_reconnect_retry=20, cluster_id=cluster_id
    )

    # Static metadata stored at cluster start + a live status snapshot.
    cluster_metadata = get_cluster_metadata(gcs_client)
    cluster_status_to_report = get_cluster_status_to_report(gcs_client)

    data = UsageStatsToReport(
        schema_version=usage_constant.SCHEMA_VERSION,
        source=os.getenv(
            usage_constant.USAGE_STATS_SOURCE_ENV_VAR,
            usage_constant.USAGE_STATS_SOURCE_OSS,
        ),
        collect_timestamp_ms=int(time.time() * 1000),
        total_success=total_success,
        total_failed=total_failed,
        seq_number=seq_number,
        ray_version=cluster_metadata["ray_version"],
        python_version=cluster_metadata["python_version"],
        session_id=cluster_id,
        git_commit=cluster_metadata["git_commit"],
        os=cluster_metadata["os"],
        session_start_timestamp_ms=cluster_metadata["session_start_timestamp_ms"],
        cloud_provider=cluster_config_to_report.cloud_provider,
        min_workers=cluster_config_to_report.min_workers,
        max_workers=cluster_config_to_report.max_workers,
        head_node_instance_type=cluster_config_to_report.head_node_instance_type,
        worker_node_instance_types=cluster_config_to_report.worker_node_instance_types,
        total_num_cpus=cluster_status_to_report.total_num_cpus,
        total_num_gpus=cluster_status_to_report.total_num_gpus,
        total_memory_gb=cluster_status_to_report.total_memory_gb,
        total_object_store_memory_gb=cluster_status_to_report.total_object_store_memory_gb,  # noqa: E501
        library_usages=get_library_usages_to_report(gcs_client),
        extra_usage_tags=get_extra_usage_tags_to_report(gcs_client),
        total_num_nodes=get_total_num_nodes_to_report(gcs_client),
        total_num_running_jobs=get_total_num_running_jobs_to_report(gcs_client),
        libc_version=cluster_metadata.get("libc_version"),
        hardware_usages=get_hardware_usages_to_report(gcs_client),
    )
    return data
897
+
898
+
899
def generate_write_data(
    usage_stats: UsageStatsToReport,
    error: str,
) -> UsageStatsToWrite:
    """Generate the data to write to `USAGE_STATS_FILE`.

    Params:
        usage_stats: The usage stats that were reported.
        error: The error message of failed reports (None when the report
            succeeded; `success` is derived from it).

    Returns:
        UsageStatsToWrite
    """
    data = UsageStatsToWrite(
        usage_stats=usage_stats,
        success=error is None,
        error=error,
    )
    return data
918
+
919
+
920
class UsageReportClient:
    """The client implementation for usage report.

    It is in charge of writing usage stats to the directory
    and report usage stats.
    """

    def write_usage_data(self, data: UsageStatsToWrite, dir_path: str) -> None:
        """Write the usage data to the directory.

        The update is atomic: data is written to a temp file which is
        then renamed over the destination.

        Params:
            data: Data to report
            dir_path: The path to the directory to write usage data.
        """
        # Atomically update the file.
        dir_path = Path(dir_path)
        destination = dir_path / usage_constant.USAGE_STATS_FILE
        temp = dir_path / f"{usage_constant.USAGE_STATS_FILE}.tmp"
        with temp.open(mode="w") as json_file:
            json_file.write(json.dumps(asdict(data)))
        if sys.platform == "win32":
            # Windows 32 doesn't support atomic renaming, so we should delete
            # the file first.
            destination.unlink(missing_ok=True)
        temp.rename(destination)

    def report_usage_data(self, url: str, data: UsageStatsToReport) -> "requests.Response":
        """Report the usage data to the usage server.

        Params:
            url: The URL to update resource usage.
            data: Data to report.

        Returns:
            The HTTP response (the original annotated `-> None` but has
            always returned the response object).

        Raises:
            requests.HTTPError if requests fails.
        """
        r = requests.request(
            "POST",
            url,
            headers={"Content-Type": "application/json"},
            json=asdict(data),
            timeout=10,
        )
        r.raise_for_status()
        return r
.venv/lib/python3.11/site-packages/ray/_private/workers/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/default_worker.cpython-311.pyc ADDED
Binary file (10.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/workers/__pycache__/setup_worker.cpython-311.pyc ADDED
Binary file (1.66 kB). View file
 
.venv/lib/python3.11/site-packages/ray/_private/workers/default_worker.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import base64
4
+ import json
5
+ import time
6
+
7
+ import ray
8
+ import ray._private.node
9
+ import ray._private.ray_constants as ray_constants
10
+ import ray._private.utils
11
+ import ray.actor
12
+ from ray._private.async_compat import try_install_uvloop
13
+ from ray._private.parameter import RayParams
14
+ from ray._private.ray_logging import configure_log_file, get_worker_log_file_name
15
+ from ray._private.runtime_env.setup_hook import load_and_execute_setup_hook
16
+
17
# Command-line interface of the default worker process. The raylet launches
# this script and passes every connection endpoint and logging option as a
# flag; the parsed values are consumed by the `__main__` block below.
parser = argparse.ArgumentParser(
    description=("Parse addresses for the worker to connect to.")
)
# --- Cluster / node identity -------------------------------------------------
parser.add_argument(
    "--cluster-id",
    required=True,
    type=str,
    help="the auto-generated ID of the cluster",
)
parser.add_argument(
    "--node-id",
    required=True,
    type=str,
    help="the auto-generated ID of the node",
)
parser.add_argument(
    "--node-ip-address",
    required=True,
    type=str,
    help="the ip address of the worker's node",
)
parser.add_argument(
    "--node-manager-port", required=True, type=int, help="the port of the worker's node"
)
parser.add_argument(
    "--raylet-ip-address",
    required=False,
    type=str,
    default=None,
    help="the ip address of the worker's raylet",
)
# --- Backend service addresses ----------------------------------------------
parser.add_argument(
    "--redis-address", required=True, type=str, help="the address to use for Redis"
)
parser.add_argument(
    "--gcs-address", required=True, type=str, help="the address to use for GCS"
)
parser.add_argument(
    "--redis-username",
    required=False,
    type=str,
    default=None,
    help="the username to use for Redis",
)
parser.add_argument(
    "--redis-password",
    required=False,
    type=str,
    default=None,
    help="the password to use for Redis",
)
parser.add_argument(
    "--object-store-name", required=True, type=str, help="the object store's name"
)
parser.add_argument("--raylet-name", required=False, type=str, help="the raylet's name")
# --- Logging ----------------------------------------------------------------
parser.add_argument(
    "--logging-level",
    required=False,
    type=str,
    default=ray_constants.LOGGER_LEVEL,
    choices=ray_constants.LOGGER_LEVEL_CHOICES,
    help=ray_constants.LOGGER_LEVEL_HELP,
)
parser.add_argument(
    "--logging-format",
    required=False,
    type=str,
    default=ray_constants.LOGGER_FORMAT,
    help=ray_constants.LOGGER_FORMAT_HELP,
)
# --- Paths and worker behavior ----------------------------------------------
parser.add_argument(
    "--temp-dir",
    required=False,
    type=str,
    default=None,
    help="Specify the path of the temporary directory use by Ray process.",
)
parser.add_argument(
    "--storage",
    required=False,
    type=str,
    default=None,
    help="Specify the persistent storage path.",
)
parser.add_argument(
    "--load-code-from-local",
    default=False,
    action="store_true",
    help="True if code is loaded from local files, as opposed to the GCS.",
)
parser.add_argument(
    "--worker-type",
    required=False,
    type=str,
    default="WORKER",
    help="Specify the type of the worker process",
)
# --- Agent ports ------------------------------------------------------------
parser.add_argument(
    "--metrics-agent-port",
    required=True,
    type=int,
    help="the port of the node's metric agent.",
)
parser.add_argument(
    "--runtime-env-agent-port",
    required=True,
    type=int,
    default=None,
    help="The port on which the runtime env agent receives HTTP requests.",
)
parser.add_argument(
    "--object-spilling-config",
    required=False,
    type=str,
    default="",
    help="The configuration of object spilling. Only used by I/O workers.",
)
# --- Log rotation -----------------------------------------------------------
parser.add_argument(
    "--logging-rotate-bytes",
    required=False,
    type=int,
    default=ray_constants.LOGGING_ROTATE_BYTES,
    help="Specify the max bytes for rotating "
    "log file, default is "
    f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
)
parser.add_argument(
    "--logging-rotate-backup-count",
    required=False,
    type=int,
    default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
    help="Specify the backup count of rotated log file, default is "
    f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
)
# --- Runtime env / startup handshake ----------------------------------------
parser.add_argument(
    "--runtime-env-hash",
    required=False,
    type=int,
    default=0,
    help="The computed hash of the runtime env for this worker.",
)
parser.add_argument(
    "--startup-token",
    required=True,
    type=int,
    help="The startup token assigned to this worker process by the raylet.",
)
parser.add_argument(
    "--ray-debugger-external",
    default=False,
    action="store_true",
    help="True if Ray debugger is made available externally.",
)
parser.add_argument("--session-name", required=False, help="The current session name")
parser.add_argument(
    "--webui",
    required=False,
    help="The address of web ui",
)
parser.add_argument(
    "--worker-launch-time-ms",
    required=True,
    type=int,
    help="The time when raylet starts to launch the worker process.",
)

parser.add_argument(
    "--worker-preload-modules",
    type=str,
    required=False,
    help=(
        "A comma-separated list of Python module names "
        "to import before accepting work."
    ),
)
192
+
193
+ if __name__ == "__main__":
194
+ # NOTE(sang): For some reason, if we move the code below
195
+ # to a separate function, tensorflow will capture that method
196
+ # as a step function. For more details, check out
197
+ # https://github.com/ray-project/ray/pull/12225#issue-525059663.
198
+ args = parser.parse_args()
199
+ ray._private.ray_logging.setup_logger(args.logging_level, args.logging_format)
200
+ worker_launched_time_ms = time.time_ns() // 1e6
201
+ if args.worker_type == "WORKER":
202
+ mode = ray.WORKER_MODE
203
+ elif args.worker_type == "SPILL_WORKER":
204
+ mode = ray.SPILL_WORKER_MODE
205
+ elif args.worker_type == "RESTORE_WORKER":
206
+ mode = ray.RESTORE_WORKER_MODE
207
+ else:
208
+ raise ValueError("Unknown worker type: " + args.worker_type)
209
+
210
+ # Try installing uvloop as default event-loop implementation
211
+ # for asyncio
212
+ try_install_uvloop()
213
+
214
+ raylet_ip_address = args.raylet_ip_address
215
+ if raylet_ip_address is None:
216
+ raylet_ip_address = args.node_ip_address
217
+ ray_params = RayParams(
218
+ node_ip_address=args.node_ip_address,
219
+ raylet_ip_address=raylet_ip_address,
220
+ node_manager_port=args.node_manager_port,
221
+ redis_address=args.redis_address,
222
+ redis_username=args.redis_username,
223
+ redis_password=args.redis_password,
224
+ plasma_store_socket_name=args.object_store_name,
225
+ raylet_socket_name=args.raylet_name,
226
+ temp_dir=args.temp_dir,
227
+ storage=args.storage,
228
+ metrics_agent_port=args.metrics_agent_port,
229
+ runtime_env_agent_port=args.runtime_env_agent_port,
230
+ gcs_address=args.gcs_address,
231
+ session_name=args.session_name,
232
+ webui=args.webui,
233
+ cluster_id=args.cluster_id,
234
+ node_id=args.node_id,
235
+ )
236
+ node = ray._private.node.Node(
237
+ ray_params,
238
+ head=False,
239
+ shutdown_at_exit=False,
240
+ spawn_reaper=False,
241
+ connect_only=True,
242
+ default_worker=True,
243
+ )
244
+
245
+ # NOTE(suquark): We must initialize the external storage before we
246
+ # connect to raylet. Otherwise we may receive requests before the
247
+ # external storage is intialized.
248
+ if mode == ray.RESTORE_WORKER_MODE or mode == ray.SPILL_WORKER_MODE:
249
+ from ray._private import external_storage, storage
250
+
251
+ storage._init_storage(args.storage, is_head=False)
252
+ if args.object_spilling_config:
253
+ object_spilling_config = base64.b64decode(args.object_spilling_config)
254
+ object_spilling_config = json.loads(object_spilling_config)
255
+ else:
256
+ object_spilling_config = {}
257
+ external_storage.setup_external_storage(
258
+ object_spilling_config, node.node_id, node.session_name
259
+ )
260
+
261
+ ray._private.worker._global_node = node
262
+ ray._private.worker.connect(
263
+ node,
264
+ node.session_name,
265
+ mode=mode,
266
+ runtime_env_hash=args.runtime_env_hash,
267
+ startup_token=args.startup_token,
268
+ ray_debugger_external=args.ray_debugger_external,
269
+ worker_launch_time_ms=args.worker_launch_time_ms,
270
+ worker_launched_time_ms=worker_launched_time_ms,
271
+ )
272
+
273
+ worker = ray._private.worker.global_worker
274
+
275
+ # Setup log file.
276
+ out_file, err_file = node.get_log_file_handles(
277
+ get_worker_log_file_name(args.worker_type)
278
+ )
279
+ configure_log_file(out_file, err_file)
280
+ worker.set_out_file(out_file)
281
+ worker.set_err_file(err_file)
282
+
283
+ if mode == ray.WORKER_MODE and args.worker_preload_modules:
284
+ module_names_to_import = args.worker_preload_modules.split(",")
285
+ ray._private.utils.try_import_each_module(module_names_to_import)
286
+
287
+ # If the worker setup function is configured, run it.
288
+ worker_process_setup_hook_key = os.getenv(
289
+ ray_constants.WORKER_PROCESS_SETUP_HOOK_ENV_VAR
290
+ )
291
+ if worker_process_setup_hook_key:
292
+ error = load_and_execute_setup_hook(worker_process_setup_hook_key)
293
+ if error is not None:
294
+ worker.core_worker.drain_and_exit_worker("system", error)
295
+
296
+ if mode == ray.WORKER_MODE:
297
+ worker.main_loop()
298
+ elif mode in [ray.RESTORE_WORKER_MODE, ray.SPILL_WORKER_MODE]:
299
+ # It is handled by another thread in the C++ core worker.
300
+ # We just need to keep the worker alive.
301
+ while True:
302
+ time.sleep(100000)
303
+ else:
304
+ raise ValueError(f"Unexcepted worker mode: {mode}")
.venv/lib/python3.11/site-packages/ray/_private/workers/setup_worker.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+
4
+ from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
5
+ from ray._private.ray_logging import setup_logger
6
+ from ray._private.runtime_env.context import RuntimeEnvContext
7
+ from ray.core.generated.common_pb2 import Language
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ parser = argparse.ArgumentParser(
12
+ description=("Set up the environment for a Ray worker and launch the worker.")
13
+ )
14
+
15
+ parser.add_argument(
16
+ "--serialized-runtime-env-context",
17
+ type=str,
18
+ help="the serialized runtime env context",
19
+ )
20
+
21
+ parser.add_argument("--language", type=str, help="the language type of the worker")
22
+
23
+
24
+ if __name__ == "__main__":
25
+ setup_logger(LOGGER_LEVEL, LOGGER_FORMAT)
26
+ args, remaining_args = parser.parse_known_args()
27
+ # NOTE(edoakes): args.serialized_runtime_env_context is only None when
28
+ # we're starting the main Ray client proxy server. That case should
29
+ # probably not even go through this codepath.
30
+ runtime_env_context = RuntimeEnvContext.deserialize(
31
+ args.serialized_runtime_env_context or "{}"
32
+ )
33
+ runtime_env_context.exec_worker(remaining_args, Language.Value(args.language))
.venv/lib/python3.11/site-packages/ray/jars/ray_dist.jar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f3835fe29f363a67c05160a5c60634942abbd46720e587faad488cadebd2e8a
3
+ size 32364530
.venv/lib/python3.11/site-packages/ray/rllib/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from ray._private.usage import usage_lib
4
+
5
+ # Note: do not introduce unnecessary library dependencies here, e.g. gym.
6
+ # This file is imported from the tune module in order to register RLlib agents.
7
+ from ray.rllib.env.base_env import BaseEnv
8
+ from ray.rllib.env.external_env import ExternalEnv
9
+ from ray.rllib.env.multi_agent_env import MultiAgentEnv
10
+ from ray.rllib.env.vector_env import VectorEnv
11
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
12
+ from ray.rllib.policy.policy import Policy
13
+ from ray.rllib.policy.sample_batch import SampleBatch
14
+ from ray.rllib.policy.tf_policy import TFPolicy
15
+ from ray.rllib.policy.torch_policy import TorchPolicy
16
+ from ray.tune.registry import register_trainable
17
+
18
+
19
+ def _setup_logger():
20
+ logger = logging.getLogger("ray.rllib")
21
+ handler = logging.StreamHandler()
22
+ handler.setFormatter(
23
+ logging.Formatter(
24
+ "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
25
+ )
26
+ )
27
+ logger.addHandler(handler)
28
+ logger.propagate = False
29
+
30
+
31
+ def _register_all():
32
+ from ray.rllib.algorithms.registry import ALGORITHMS, _get_algorithm_class
33
+
34
+ for key, get_trainable_class_and_config in ALGORITHMS.items():
35
+ register_trainable(key, get_trainable_class_and_config()[0])
36
+
37
+ for key in ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
38
+ register_trainable(key, _get_algorithm_class(key))
39
+
40
+
41
+ _setup_logger()
42
+
43
+ usage_lib.record_library_usage("rllib")
44
+
45
+ __all__ = [
46
+ "Policy",
47
+ "TFPolicy",
48
+ "TorchPolicy",
49
+ "RolloutWorker",
50
+ "SampleBatch",
51
+ "BaseEnv",
52
+ "MultiAgentEnv",
53
+ "VectorEnv",
54
+ "ExternalEnv",
55
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/execution/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.rllib.execution.learner_thread import LearnerThread
2
+ from ray.rllib.execution.multi_gpu_learner_thread import MultiGPULearnerThread
3
+ from ray.rllib.execution.minibatch_buffer import MinibatchBuffer
4
+ from ray.rllib.execution.replay_ops import SimpleReplayBuffer
5
+ from ray.rllib.execution.rollout_ops import (
6
+ standardize_fields,
7
+ synchronous_parallel_sample,
8
+ )
9
+ from ray.rllib.execution.train_ops import (
10
+ train_one_step,
11
+ multi_gpu_train_one_step,
12
+ )
13
+
14
+ __all__ = [
15
+ "multi_gpu_train_one_step",
16
+ "standardize_fields",
17
+ "synchronous_parallel_sample",
18
+ "train_one_step",
19
+ "LearnerThread",
20
+ "MultiGPULearnerThread",
21
+ "SimpleReplayBuffer",
22
+ "MinibatchBuffer",
23
+ ]
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (930 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/learner_thread.cpython-311.pyc ADDED
Binary file (8.03 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/minibatch_buffer.cpython-311.pyc ADDED
Binary file (3.05 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/multi_gpu_learner_thread.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/replay_ops.cpython-311.pyc ADDED
Binary file (2.49 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/rollout_ops.cpython-311.pyc ADDED
Binary file (9.45 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/segment_tree.cpython-311.pyc ADDED
Binary file (9.23 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/__pycache__/train_ops.cpython-311.pyc ADDED
Binary file (9.24 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/rllib/execution/buffers/__pycache__/mixin_replay_buffer.cpython-311.pyc ADDED
Binary file (8.58 kB). View file
 
.venv/lib/python3.11/site-packages/ray/rllib/execution/learner_thread.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import queue
3
+ import threading
4
+ from typing import Dict, Optional
5
+
6
+ from ray.util.timer import _Timer
7
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
8
+ from ray.rllib.execution.minibatch_buffer import MinibatchBuffer
9
+ from ray.rllib.utils.annotations import OldAPIStack
10
+ from ray.rllib.utils.framework import try_import_tf
11
+ from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder, LEARNER_INFO
12
+ from ray.rllib.utils.metrics.window_stat import WindowStat
13
+ from ray.util.iter import _NextValueNotReady
14
+
15
+ tf1, tf, tfv = try_import_tf()
16
+
17
+
18
+ @OldAPIStack
19
+ class LearnerThread(threading.Thread):
20
+ """Background thread that updates the local model from sample trajectories.
21
+
22
+ The learner thread communicates with the main thread through Queues. This
23
+ is needed since Ray operations can only be run on the main thread. In
24
+ addition, moving heavyweight gradient ops session runs off the main thread
25
+ improves overall throughput.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ local_worker: RolloutWorker,
31
+ minibatch_buffer_size: int,
32
+ num_sgd_iter: int,
33
+ learner_queue_size: int,
34
+ learner_queue_timeout: int,
35
+ ):
36
+ """Initialize the learner thread.
37
+
38
+ Args:
39
+ local_worker: process local rollout worker holding
40
+ policies this thread will call learn_on_batch() on
41
+ minibatch_buffer_size: max number of train batches to store
42
+ in the minibatching buffer
43
+ num_sgd_iter: number of passes to learn on per train batch
44
+ learner_queue_size: max size of queue of inbound
45
+ train batches to this thread
46
+ learner_queue_timeout: raise an exception if the queue has
47
+ been empty for this long in seconds
48
+ """
49
+ threading.Thread.__init__(self)
50
+ self.learner_queue_size = WindowStat("size", 50)
51
+ self.local_worker = local_worker
52
+ self.inqueue = queue.Queue(maxsize=learner_queue_size)
53
+ self.outqueue = queue.Queue()
54
+ self.minibatch_buffer = MinibatchBuffer(
55
+ inqueue=self.inqueue,
56
+ size=minibatch_buffer_size,
57
+ timeout=learner_queue_timeout,
58
+ num_passes=num_sgd_iter,
59
+ init_num_passes=num_sgd_iter,
60
+ )
61
+ self.queue_timer = _Timer()
62
+ self.grad_timer = _Timer()
63
+ self.load_timer = _Timer()
64
+ self.load_wait_timer = _Timer()
65
+ self.daemon = True
66
+ self.policy_ids_updated = []
67
+ self.learner_info = {}
68
+ self.stopped = False
69
+ self.num_steps = 0
70
+
71
+ def run(self) -> None:
72
+ # Switch on eager mode if configured.
73
+ if self.local_worker.config.framework_str == "tf2":
74
+ tf1.enable_eager_execution()
75
+ while not self.stopped:
76
+ self.step()
77
+
78
+ def step(self) -> Optional[_NextValueNotReady]:
79
+ with self.queue_timer:
80
+ try:
81
+ batch, _ = self.minibatch_buffer.get()
82
+ except queue.Empty:
83
+ return _NextValueNotReady()
84
+ with self.grad_timer:
85
+ # Use LearnerInfoBuilder as a unified way to build the final
86
+ # results dict from `learn_on_loaded_batch` call(s).
87
+ # This makes sure results dicts always have the same structure
88
+ # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
89
+ # tf vs torch).
90
+ learner_info_builder = LearnerInfoBuilder(num_devices=1)
91
+ if self.local_worker.config.policy_states_are_swappable:
92
+ self.local_worker.lock()
93
+ multi_agent_results = self.local_worker.learn_on_batch(batch)
94
+ if self.local_worker.config.policy_states_are_swappable:
95
+ self.local_worker.unlock()
96
+ self.policy_ids_updated.extend(list(multi_agent_results.keys()))
97
+ for pid, results in multi_agent_results.items():
98
+ learner_info_builder.add_learn_on_batch_results(results, pid)
99
+ self.learner_info = learner_info_builder.finalize()
100
+
101
+ self.num_steps += 1
102
+ # Put tuple: env-steps, agent-steps, and learner info into the queue.
103
+ self.outqueue.put((batch.count, batch.agent_steps(), self.learner_info))
104
+ self.learner_queue_size.push(self.inqueue.qsize())
105
+
106
+ def add_learner_metrics(self, result: Dict, overwrite_learner_info=True) -> Dict:
107
+ """Add internal metrics to a result dict."""
108
+
109
+ def timer_to_ms(timer):
110
+ return round(1000 * timer.mean, 3)
111
+
112
+ if overwrite_learner_info:
113
+ result["info"].update(
114
+ {
115
+ "learner_queue": self.learner_queue_size.stats(),
116
+ LEARNER_INFO: copy.deepcopy(self.learner_info),
117
+ "timing_breakdown": {
118
+ "learner_grad_time_ms": timer_to_ms(self.grad_timer),
119
+ "learner_load_time_ms": timer_to_ms(self.load_timer),
120
+ "learner_load_wait_time_ms": timer_to_ms(self.load_wait_timer),
121
+ "learner_dequeue_time_ms": timer_to_ms(self.queue_timer),
122
+ },
123
+ }
124
+ )
125
+ else:
126
+ result["info"].update(
127
+ {
128
+ "learner_queue": self.learner_queue_size.stats(),
129
+ "timing_breakdown": {
130
+ "learner_grad_time_ms": timer_to_ms(self.grad_timer),
131
+ "learner_load_time_ms": timer_to_ms(self.load_timer),
132
+ "learner_load_wait_time_ms": timer_to_ms(self.load_wait_timer),
133
+ "learner_dequeue_time_ms": timer_to_ms(self.queue_timer),
134
+ },
135
+ }
136
+ )
137
+ return result
.venv/lib/python3.11/site-packages/ray/rllib/execution/minibatch_buffer.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Tuple
2
+ import queue
3
+
4
+ from ray.rllib.utils.annotations import OldAPIStack
5
+
6
+
7
+ @OldAPIStack
8
+ class MinibatchBuffer:
9
+ """Ring buffer of recent data batches for minibatch SGD.
10
+
11
+ This is for use with AsyncSamplesOptimizer.
12
+ """
13
+
14
+ def __init__(
15
+ self,
16
+ inqueue: queue.Queue,
17
+ size: int,
18
+ timeout: float,
19
+ num_passes: int,
20
+ init_num_passes: int = 1,
21
+ ):
22
+ """Initialize a minibatch buffer.
23
+
24
+ Args:
25
+ inqueue (queue.Queue): Queue to populate the internal ring buffer
26
+ from.
27
+ size: Max number of data items to buffer.
28
+ timeout: Queue timeout
29
+ num_passes: Max num times each data item should be emitted.
30
+ init_num_passes: Initial passes for each data item.
31
+ Maxiumum number of passes per item are increased to num_passes over
32
+ time.
33
+ """
34
+ self.inqueue = inqueue
35
+ self.size = size
36
+ self.timeout = timeout
37
+ self.max_initial_ttl = num_passes
38
+ self.cur_initial_ttl = init_num_passes
39
+ self.buffers = [None] * size
40
+ self.ttl = [0] * size
41
+ self.idx = 0
42
+
43
+ def get(self) -> Tuple[Any, bool]:
44
+ """Get a new batch from the internal ring buffer.
45
+
46
+ Returns:
47
+ buf: Data item saved from inqueue.
48
+ released: True if the item is now removed from the ring buffer.
49
+ """
50
+ if self.ttl[self.idx] <= 0:
51
+ self.buffers[self.idx] = self.inqueue.get(timeout=self.timeout)
52
+ self.ttl[self.idx] = self.cur_initial_ttl
53
+ if self.cur_initial_ttl < self.max_initial_ttl:
54
+ self.cur_initial_ttl += 1
55
+ buf = self.buffers[self.idx]
56
+ self.ttl[self.idx] -= 1
57
+ released = self.ttl[self.idx] <= 0
58
+ if released:
59
+ self.buffers[self.idx] = None
60
+ self.idx = (self.idx + 1) % len(self.buffers)
61
+ return buf, released
.venv/lib/python3.11/site-packages/ray/rllib/execution/multi_gpu_learner_thread.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import queue
3
+ import threading
4
+
5
+ from ray.util.timer import _Timer
6
+ from ray.rllib.execution.learner_thread import LearnerThread
7
+ from ray.rllib.execution.minibatch_buffer import MinibatchBuffer
8
+ from ray.rllib.policy.sample_batch import SampleBatch
9
+ from ray.rllib.utils.annotations import OldAPIStack, override
10
+ from ray.rllib.utils.deprecation import deprecation_warning
11
+ from ray.rllib.utils.framework import try_import_tf
12
+ from ray.rllib.utils.metrics.learner_info import LearnerInfoBuilder
13
+ from ray.rllib.evaluation.rollout_worker import RolloutWorker
14
+
15
+ tf1, tf, tfv = try_import_tf()
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ @OldAPIStack
21
+ class MultiGPULearnerThread(LearnerThread):
22
+ """Learner that can use multiple GPUs and parallel loading.
23
+
24
+ This class is used for async sampling algorithms.
25
+
26
+ Example workflow: 2 GPUs and 3 multi-GPU tower stacks.
27
+ -> On each GPU, there are 3 slots for batches, indexed 0, 1, and 2.
28
+
29
+ Workers collect data from env and push it into inqueue:
30
+ Workers -> (data) -> self.inqueue
31
+
32
+ We also have two queues, indicating, which stacks are loaded and which
33
+ are not.
34
+ - idle_tower_stacks = [0, 1, 2] <- all 3 stacks are free at first.
35
+ - ready_tower_stacks = [] <- None of the 3 stacks is loaded with data.
36
+
37
+ `ready_tower_stacks` is managed by `ready_tower_stacks_buffer` for
38
+ possible minibatch-SGD iterations per loaded batch (this avoids a reload
39
+ from CPU to GPU for each SGD iter).
40
+
41
+ n _MultiGPULoaderThreads: self.inqueue -get()->
42
+ policy.load_batch_into_buffer() -> ready_stacks = [0 ...]
43
+
44
+ This thread: self.ready_tower_stacks_buffer -get()->
45
+ policy.learn_on_loaded_batch() -> if SGD-iters done,
46
+ put stack index back in idle_tower_stacks queue.
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ local_worker: RolloutWorker,
52
+ num_gpus: int = 1,
53
+ lr=None, # deprecated.
54
+ train_batch_size: int = 500,
55
+ num_multi_gpu_tower_stacks: int = 1,
56
+ num_sgd_iter: int = 1,
57
+ learner_queue_size: int = 16,
58
+ learner_queue_timeout: int = 300,
59
+ num_data_load_threads: int = 16,
60
+ _fake_gpus: bool = False,
61
+ # Deprecated arg, use
62
+ minibatch_buffer_size=None,
63
+ ):
64
+ """Initializes a MultiGPULearnerThread instance.
65
+
66
+ Args:
67
+ local_worker: Local RolloutWorker holding
68
+ policies this thread will call `load_batch_into_buffer` and
69
+ `learn_on_loaded_batch` on.
70
+ num_gpus: Number of GPUs to use for data-parallel SGD.
71
+ train_batch_size: Size of batches (minibatches if
72
+ `num_sgd_iter` > 1) to learn on.
73
+ num_multi_gpu_tower_stacks: Number of buffers to parallelly
74
+ load data into on one device. Each buffer is of size of
75
+ `train_batch_size` and hence increases GPU memory usage
76
+ accordingly.
77
+ num_sgd_iter: Number of passes to learn on per train batch
78
+ (minibatch if `num_sgd_iter` > 1).
79
+ learner_queue_size: Max size of queue of inbound
80
+ train batches to this thread.
81
+ num_data_load_threads: Number of threads to use to load
82
+ data into GPU memory in parallel.
83
+ """
84
+ # Deprecated: No need to specify as we don't need the actual
85
+ # minibatch-buffer anyways.
86
+ if minibatch_buffer_size:
87
+ deprecation_warning(
88
+ old="MultiGPULearnerThread.minibatch_buffer_size",
89
+ error=True,
90
+ )
91
+ super().__init__(
92
+ local_worker=local_worker,
93
+ minibatch_buffer_size=0,
94
+ num_sgd_iter=num_sgd_iter,
95
+ learner_queue_size=learner_queue_size,
96
+ learner_queue_timeout=learner_queue_timeout,
97
+ )
98
+ # Delete reference to parent's minibatch_buffer, which is not needed.
99
+ # Instead, in multi-GPU mode, we pull tower stack indices from the
100
+ # `self.ready_tower_stacks_buffer` buffer, whose size is exactly
101
+ # `num_multi_gpu_tower_stacks`.
102
+ self.minibatch_buffer = None
103
+
104
+ self.train_batch_size = train_batch_size
105
+
106
+ self.policy_map = self.local_worker.policy_map
107
+ self.devices = next(iter(self.policy_map.values())).devices
108
+
109
+ logger.info("MultiGPULearnerThread devices {}".format(self.devices))
110
+ assert self.train_batch_size % len(self.devices) == 0
111
+ assert self.train_batch_size >= len(self.devices), "batch too small"
112
+
113
+ self.tower_stack_indices = list(range(num_multi_gpu_tower_stacks))
114
+
115
+ # Two queues for tower stacks:
116
+ # a) Those that are loaded with data ("ready")
117
+ # b) Those that are ready to be loaded with new data ("idle").
118
+ self.idle_tower_stacks = queue.Queue()
119
+ self.ready_tower_stacks = queue.Queue()
120
+ # In the beginning, all stacks are idle (no loading has taken place
121
+ # yet).
122
+ for idx in self.tower_stack_indices:
123
+ self.idle_tower_stacks.put(idx)
124
+ # Start n threads that are responsible for loading data into the
125
+ # different (idle) stacks.
126
+ for i in range(num_data_load_threads):
127
+ self.loader_thread = _MultiGPULoaderThread(self, share_stats=(i == 0))
128
+ self.loader_thread.start()
129
+
130
+ # Create a buffer that holds stack indices that are "ready"
131
+ # (loaded with data). Those are stacks that we can call
132
+ # "learn_on_loaded_batch" on.
133
+ self.ready_tower_stacks_buffer = MinibatchBuffer(
134
+ self.ready_tower_stacks,
135
+ num_multi_gpu_tower_stacks,
136
+ learner_queue_timeout,
137
+ num_sgd_iter,
138
+ )
139
+
140
+ @override(LearnerThread)
141
+ def step(self) -> None:
142
+ if not self.loader_thread.is_alive():
143
+ raise RuntimeError(
144
+ "The `_MultiGPULoaderThread` has died! Will therefore also terminate "
145
+ "the `MultiGPULearnerThread`."
146
+ )
147
+
148
+ with self.load_wait_timer:
149
+ buffer_idx, released = self.ready_tower_stacks_buffer.get()
150
+
151
+ get_num_samples_loaded_into_buffer = 0
152
+ with self.grad_timer:
153
+ # Use LearnerInfoBuilder as a unified way to build the final
154
+ # results dict from `learn_on_loaded_batch` call(s).
155
+ # This makes sure results dicts always have the same structure
156
+ # no matter the setup (multi-GPU, multi-agent, minibatch SGD,
157
+ # tf vs torch).
158
+ learner_info_builder = LearnerInfoBuilder(num_devices=len(self.devices))
159
+
160
+ for pid in self.policy_map.keys():
161
+ # Not a policy-to-train.
162
+ if (
163
+ self.local_worker.is_policy_to_train is not None
164
+ and not self.local_worker.is_policy_to_train(pid)
165
+ ):
166
+ continue
167
+ policy = self.policy_map[pid]
168
+ default_policy_results = policy.learn_on_loaded_batch(
169
+ offset=0, buffer_index=buffer_idx
170
+ )
171
+ learner_info_builder.add_learn_on_batch_results(
172
+ default_policy_results, policy_id=pid
173
+ )
174
+ self.policy_ids_updated.append(pid)
175
+ get_num_samples_loaded_into_buffer += (
176
+ policy.get_num_samples_loaded_into_buffer(buffer_idx)
177
+ )
178
+
179
+ self.learner_info = learner_info_builder.finalize()
180
+
181
+ if released:
182
+ self.idle_tower_stacks.put(buffer_idx)
183
+
184
+ # Put tuple: env-steps, agent-steps, and learner info into the queue.
185
+ self.outqueue.put(
186
+ (
187
+ get_num_samples_loaded_into_buffer,
188
+ get_num_samples_loaded_into_buffer,
189
+ self.learner_info,
190
+ )
191
+ )
192
+ self.learner_queue_size.push(self.inqueue.qsize())
193
+
194
+
195
+ class _MultiGPULoaderThread(threading.Thread):
196
+ def __init__(
197
+ self, multi_gpu_learner_thread: MultiGPULearnerThread, share_stats: bool
198
+ ):
199
+ threading.Thread.__init__(self)
200
+ self.multi_gpu_learner_thread = multi_gpu_learner_thread
201
+ self.daemon = True
202
+ if share_stats:
203
+ self.queue_timer = multi_gpu_learner_thread.queue_timer
204
+ self.load_timer = multi_gpu_learner_thread.load_timer
205
+ else:
206
+ self.queue_timer = _Timer()
207
+ self.load_timer = _Timer()
208
+
209
+ def run(self) -> None:
210
+ while True:
211
+ self._step()
212
+
213
+ def _step(self) -> None:
214
+ s = self.multi_gpu_learner_thread
215
+ policy_map = s.policy_map
216
+
217
+ # Get a new batch from the data (inqueue).
218
+ with self.queue_timer:
219
+ batch = s.inqueue.get()
220
+
221
+ # Get next idle stack for loading.
222
+ buffer_idx = s.idle_tower_stacks.get()
223
+
224
+ # Load the batch into the idle stack.
225
+ with self.load_timer:
226
+ for pid in policy_map.keys():
227
+ if (
228
+ s.local_worker.is_policy_to_train is not None
229
+ and not s.local_worker.is_policy_to_train(pid, batch)
230
+ ):
231
+ continue
232
+ policy = policy_map[pid]
233
+ if isinstance(batch, SampleBatch):
234
+ policy.load_batch_into_buffer(
235
+ batch=batch,
236
+ buffer_index=buffer_idx,
237
+ )
238
+ elif pid in batch.policy_batches:
239
+ policy.load_batch_into_buffer(
240
+ batch=batch.policy_batches[pid],
241
+ buffer_index=buffer_idx,
242
+ )
243
+
244
+ # Tag just-loaded stack as "ready".
245
+ s.ready_tower_stacks.put(buffer_idx)
.venv/lib/python3.11/site-packages/ray/rllib/execution/replay_ops.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import random
3
+
4
+ from ray.rllib.utils.annotations import OldAPIStack
5
+ from ray.rllib.utils.replay_buffers.replay_buffer import warn_replay_capacity
6
+ from ray.rllib.utils.typing import SampleBatchType
7
+
8
+
9
+ @OldAPIStack
10
+ class SimpleReplayBuffer:
11
+ """Simple replay buffer that operates over batches."""
12
+
13
+ def __init__(self, num_slots: int, replay_proportion: Optional[float] = None):
14
+ """Initialize SimpleReplayBuffer.
15
+
16
+ Args:
17
+ num_slots: Number of batches to store in total.
18
+ """
19
+ self.num_slots = num_slots
20
+ self.replay_batches = []
21
+ self.replay_index = 0
22
+
23
+ def add_batch(self, sample_batch: SampleBatchType) -> None:
24
+ warn_replay_capacity(item=sample_batch, num_items=self.num_slots)
25
+ if self.num_slots > 0:
26
+ if len(self.replay_batches) < self.num_slots:
27
+ self.replay_batches.append(sample_batch)
28
+ else:
29
+ self.replay_batches[self.replay_index] = sample_batch
30
+ self.replay_index += 1
31
+ self.replay_index %= self.num_slots
32
+
33
+ def replay(self) -> SampleBatchType:
34
+ return random.choice(self.replay_batches)
35
+
36
+ def __len__(self):
37
+ return len(self.replay_batches)
.venv/lib/python3.11/site-packages/ray/rllib/execution/rollout_ops.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Optional, Union
3
+ import tree
4
+
5
+ from ray.rllib.env.env_runner_group import EnvRunnerGroup
6
+ from ray.rllib.policy.sample_batch import (
7
+ SampleBatch,
8
+ DEFAULT_POLICY_ID,
9
+ concat_samples,
10
+ )
11
+ from ray.rllib.utils.annotations import ExperimentalAPI, OldAPIStack
12
+ from ray.rllib.utils.metrics import NUM_AGENT_STEPS_SAMPLED, NUM_ENV_STEPS_SAMPLED
13
+ from ray.rllib.utils.sgd import standardized
14
+ from ray.rllib.utils.typing import EpisodeType, SampleBatchType
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@ExperimentalAPI
def synchronous_parallel_sample(
    *,
    worker_set: EnvRunnerGroup,
    max_agent_steps: Optional[int] = None,
    max_env_steps: Optional[int] = None,
    concat: bool = True,
    sample_timeout_s: Optional[float] = None,
    random_actions: bool = False,
    _uses_new_env_runners: bool = False,
    _return_metrics: bool = False,
) -> Union[List[SampleBatchType], SampleBatchType, List[EpisodeType], EpisodeType]:
    """Runs parallel and synchronous rollouts on all remote workers.

    Waits for all workers to return from the remote calls.

    If no remote workers exist (num_workers == 0), use the local worker
    for sampling.

    Args:
        worker_set: The EnvRunnerGroup to use for sampling.
        max_agent_steps: Optional number of agent steps to be included in the
            final batch or list of episodes. Only one of `max_agent_steps` or
            `max_env_steps` may be provided.
        max_env_steps: Optional number of environment steps to be included in the
            final batch or list of episodes.
        concat: Whether to aggregate all resulting batches or episodes. In case of
            batches the list of batches is concatenated at the end. In case of
            episodes all episode lists from workers are flattened into a single
            list.
        sample_timeout_s: The timeout in sec to use on the `foreach_env_runner` call.
            After this time, the call will return with a result (or not if all
            EnvRunners are stalling). If None, will block indefinitely and not
            timeout.
        random_actions: If True, forwards `random_actions=True` to each
            EnvRunner's `sample()` call (i.e. actions are sampled randomly).
        _uses_new_env_runners: Whether the new `EnvRunner API` is used. In this case
            episodes instead of `SampleBatch` objects are returned.
        _return_metrics: If True, additionally collects each EnvRunner's
            `get_metrics()` result and returns a 2-tuple of
            (samples, list of stats dicts) instead of just the samples.

    Returns:
        The list of collected sample batch types or episode types (one for each
        parallel rollout worker in the given `worker_set`), or - if
        `_return_metrics` is True - a 2-tuple of that and the per-worker stats
        dicts.

    .. testcode::

        # Define an RLlib Algorithm.
        from ray.rllib.algorithms.ppo import PPO, PPOConfig
        config = (
            PPOConfig()
            .environment("CartPole-v1")
        )
        algorithm = config.build()
        # 2 remote EnvRunners (num_env_runners=2):
        episodes = synchronous_parallel_sample(
            worker_set=algorithm.env_runner_group,
            _uses_new_env_runners=True,
            concat=False,
        )
        print(len(episodes))

    .. testoutput::

        2
    """
    # Only allow one of `max_agent_steps` or `max_env_steps` to be defined.
    assert not (max_agent_steps is not None and max_env_steps is not None)

    agent_or_env_steps = 0
    # NOTE(review): `or`-chaining collapses an explicit 0 to None (sample
    # exactly once) - presumably intended, but confirm callers never pass 0
    # expecting "no sampling".
    max_agent_or_env_steps = max_agent_steps or max_env_steps or None
    sample_batches_or_episodes = []
    all_stats_dicts = []

    random_action_kwargs = {} if not random_actions else {"random_actions": True}

    # Stop collecting batches as soon as one criterion is met. With no step
    # limit given, the loop body runs exactly once (agent_or_env_steps == 0
    # only on the first iteration).
    while (max_agent_or_env_steps is None and agent_or_env_steps == 0) or (
        max_agent_or_env_steps is not None
        and agent_or_env_steps < max_agent_or_env_steps
    ):
        # No remote workers in the set -> Use local worker for collecting
        # samples.
        if worker_set.num_remote_workers() <= 0:
            sampled_data = [worker_set.local_env_runner.sample(**random_action_kwargs)]
            if _return_metrics:
                stats_dicts = [worker_set.local_env_runner.get_metrics()]
        # Loop over remote workers' `sample()` method in parallel.
        else:
            sampled_data = worker_set.foreach_env_runner(
                (
                    (lambda w: w.sample(**random_action_kwargs))
                    if not _return_metrics
                    else (lambda w: (w.sample(**random_action_kwargs), w.get_metrics()))
                ),
                local_env_runner=False,
                timeout_seconds=sample_timeout_s,
            )
            # Nothing was returned (maybe all workers are stalling) or no healthy
            # remote workers left: Break.
            # There is no point staying in this loop, since we will not be able to
            # get any new samples if we don't have any healthy remote workers left.
            if not sampled_data or worker_set.num_healthy_remote_workers() <= 0:
                if not sampled_data:
                    logger.warning(
                        "No samples returned from remote workers. If you have a "
                        "slow environment or model, consider increasing the "
                        "`sample_timeout_s` or decreasing the "
                        "`rollout_fragment_length` in `AlgorithmConfig.env_runners()."
                    )
                elif worker_set.num_healthy_remote_workers() <= 0:
                    logger.warning(
                        "No healthy remote workers left. Trying to restore workers ..."
                    )
                break

            # Remote results are (sample, metrics) tuples here -> unzip them.
            if _return_metrics:
                stats_dicts = [s[1] for s in sampled_data]
                sampled_data = [s[0] for s in sampled_data]

        # Update our counters for the stopping criterion of the while loop.
        if _return_metrics:
            # Count steps from the workers' metrics rather than the batches.
            if max_agent_steps:
                agent_or_env_steps += sum(
                    int(agent_stat)
                    for stat_dict in stats_dicts
                    for agent_stat in stat_dict[NUM_AGENT_STEPS_SAMPLED].values()
                )
            else:
                agent_or_env_steps += sum(
                    int(stat_dict[NUM_ENV_STEPS_SAMPLED]) for stat_dict in stats_dicts
                )
            sample_batches_or_episodes.extend(sampled_data)
            all_stats_dicts.extend(stats_dicts)
        else:
            # Count steps from the returned batches/episode-lists themselves.
            for batch_or_episode in sampled_data:
                if max_agent_steps:
                    agent_or_env_steps += (
                        sum(e.agent_steps() for e in batch_or_episode)
                        if _uses_new_env_runners
                        else batch_or_episode.agent_steps()
                    )
                else:
                    agent_or_env_steps += (
                        sum(e.env_steps() for e in batch_or_episode)
                        if _uses_new_env_runners
                        else batch_or_episode.env_steps()
                    )
                sample_batches_or_episodes.append(batch_or_episode)
                # Break out (and ignore the remaining samples) if max timesteps (batch
                # size) reached. We want to avoid collecting batches that are too large
                # only because of a failed/restarted worker causing a second iteration
                # of the main loop.
                if (
                    max_agent_or_env_steps is not None
                    and agent_or_env_steps >= max_agent_or_env_steps
                ):
                    break

    if concat is True:
        # If we have episodes flatten the episode list.
        if _uses_new_env_runners:
            sample_batches_or_episodes = tree.flatten(sample_batches_or_episodes)
        # Otherwise we concatenate the `SampleBatch` objects
        else:
            sample_batches_or_episodes = concat_samples(sample_batches_or_episodes)

    if _return_metrics:
        return sample_batches_or_episodes, all_stats_dicts
    return sample_batches_or_episodes
187
+
188
+
189
@OldAPIStack
def standardize_fields(samples: SampleBatchType, fields: List[str]) -> SampleBatchType:
    """Standardize fields of the given SampleBatch"""
    # Remember whether the input was a single-agent batch so we can unwrap
    # the result back into the same type at the end.
    was_single_agent = isinstance(samples, SampleBatch)
    if was_single_agent:
        samples = samples.as_multi_agent()

    # Standardize each requested field in every policy's batch in-place.
    for policy_batch in samples.policy_batches.values():
        for field_name in fields:
            if field_name in policy_batch:
                policy_batch[field_name] = standardized(policy_batch[field_name])

    if was_single_agent:
        return samples.policy_batches[DEFAULT_POLICY_ID]
    return samples