Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/autoscaler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/event_logger.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/metrics_reporter.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/monitor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/scheduler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/schema.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/sdk.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/common.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_storage.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/ray_installer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/reconciler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/storage.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/cloud_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py +571 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/cloud_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/cloud_provider.py +73 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/cloud_instance_updater.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/ray_stopper.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/threaded_ray_installer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/cloud_instance_updater.py +93 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/ray_stopper.py +154 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py +95 -0
- .venv/lib/python3.11/site-packages/ray/util/__init__.py +74 -0
- .venv/lib/python3.11/site-packages/ray/util/actor_group.py +230 -0
- .venv/lib/python3.11/site-packages/ray/util/actor_pool.py +463 -0
- .venv/lib/python3.11/site-packages/ray/util/check_open_ports.py +179 -0
- .venv/lib/python3.11/site-packages/ray/util/check_serialize.py +265 -0
- .venv/lib/python3.11/site-packages/ray/util/client_connect.py +76 -0
- .venv/lib/python3.11/site-packages/ray/util/dask/scheduler_utils.py +371 -0
- .venv/lib/python3.11/site-packages/ray/util/debug.py +274 -0
- .venv/lib/python3.11/site-packages/ray/util/debugpy.py +136 -0
- .venv/lib/python3.11/site-packages/ray/util/iter_metrics.py +69 -0
- .venv/lib/python3.11/site-packages/ray/util/lightgbm/__init__.py +4 -0
- .venv/lib/python3.11/site-packages/ray/util/lightgbm/__pycache__/__init__.cpython-311.pyc +0 -0
.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (194 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (190 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/autoscaler.cpython-311.pyc
ADDED
|
Binary file (9.29 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/event_logger.cpython-311.pyc
ADDED
|
Binary file (7.55 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/metrics_reporter.cpython-311.pyc
ADDED
|
Binary file (5.95 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/monitor.cpython-311.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/scheduler.cpython-311.pyc
ADDED
|
Binary file (64.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/schema.cpython-311.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/sdk.cpython-311.pyc
ADDED
|
Binary file (5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (34.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (207 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/common.cpython-311.pyc
ADDED
|
Binary file (15.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (29.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_manager.cpython-311.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_storage.cpython-311.pyc
ADDED
|
Binary file (6.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (22.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/ray_installer.cpython-311.pyc
ADDED
|
Binary file (5.26 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/reconciler.cpython-311.pyc
ADDED
|
Binary file (61.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/storage.cpython-311.pyc
ADDED
|
Binary file (10.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (223 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (231 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/cloud_provider.cpython-311.pyc
ADDED
|
Binary file (23 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
| 7 |
+
|
| 8 |
+
import requests
|
| 9 |
+
|
| 10 |
+
# TODO(rickyx): We should eventually remove these imports
|
| 11 |
+
# when we deprecate the v1 kuberay node provider.
|
| 12 |
+
from ray.autoscaler._private.kuberay.node_provider import (
|
| 13 |
+
KUBERAY_KIND_HEAD,
|
| 14 |
+
KUBERAY_KIND_WORKER,
|
| 15 |
+
KUBERAY_LABEL_KEY_KIND,
|
| 16 |
+
KUBERAY_LABEL_KEY_TYPE,
|
| 17 |
+
RAY_HEAD_POD_NAME,
|
| 18 |
+
IKubernetesHttpApiClient,
|
| 19 |
+
KubernetesHttpApiClient,
|
| 20 |
+
_worker_group_index,
|
| 21 |
+
_worker_group_max_replicas,
|
| 22 |
+
_worker_group_replicas,
|
| 23 |
+
worker_delete_patch,
|
| 24 |
+
worker_replica_patch,
|
| 25 |
+
)
|
| 26 |
+
from ray.autoscaler.v2.instance_manager.node_provider import (
|
| 27 |
+
CloudInstance,
|
| 28 |
+
CloudInstanceId,
|
| 29 |
+
CloudInstanceProviderError,
|
| 30 |
+
ICloudInstanceProvider,
|
| 31 |
+
LaunchNodeError,
|
| 32 |
+
NodeKind,
|
| 33 |
+
TerminateNodeError,
|
| 34 |
+
)
|
| 35 |
+
from ray.autoscaler.v2.schema import NodeType
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class KubeRayProvider(ICloudInstanceProvider):
|
| 41 |
+
"""
|
| 42 |
+
This class is a thin wrapper around the Kubernetes API client. It modifies
|
| 43 |
+
the RayCluster resource spec on the Kubernetes API server to scale the cluster:
|
| 44 |
+
|
| 45 |
+
It launches new instances/nodes by submitting patches to the Kubernetes API
|
| 46 |
+
to update the RayCluster CRD.
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
def __init__(
|
| 50 |
+
self,
|
| 51 |
+
cluster_name: str,
|
| 52 |
+
provider_config: Dict[str, Any],
|
| 53 |
+
k8s_api_client: Optional[IKubernetesHttpApiClient] = None,
|
| 54 |
+
):
|
| 55 |
+
"""
|
| 56 |
+
Args:
|
| 57 |
+
cluster_name: The name of the RayCluster resource.
|
| 58 |
+
provider_config: The namespace of the RayCluster.
|
| 59 |
+
k8s_api_client: The client to the Kubernetes API server.
|
| 60 |
+
This could be used to mock the Kubernetes API server for testing.
|
| 61 |
+
"""
|
| 62 |
+
self._cluster_name = cluster_name
|
| 63 |
+
self._namespace = provider_config["namespace"]
|
| 64 |
+
|
| 65 |
+
self._k8s_api_client = k8s_api_client or KubernetesHttpApiClient(
|
| 66 |
+
namespace=self._namespace
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
# Below are states that are cached locally.
|
| 70 |
+
self._requests = set()
|
| 71 |
+
self._launch_errors_queue = []
|
| 72 |
+
self._terminate_errors_queue = []
|
| 73 |
+
|
| 74 |
+
# Below are states that are fetched from the Kubernetes API server.
|
| 75 |
+
self._ray_cluster = None
|
| 76 |
+
self._cached_instances: Dict[CloudInstanceId, CloudInstance]
|
| 77 |
+
|
| 78 |
+
@dataclass
|
| 79 |
+
class ScaleRequest:
|
| 80 |
+
"""Represents a scale request that contains the current states and go-to states
|
| 81 |
+
for the ray cluster.
|
| 82 |
+
|
| 83 |
+
This class will be converted to patches to be submitted to the Kubernetes API
|
| 84 |
+
server:
|
| 85 |
+
- For launching new instances, it will adjust the `replicas` field in the
|
| 86 |
+
workerGroupSpecs.
|
| 87 |
+
- For terminating instances, it will adjust the `workersToDelete` field in the
|
| 88 |
+
workerGroupSpecs.
|
| 89 |
+
|
| 90 |
+
"""
|
| 91 |
+
|
| 92 |
+
# The desired number of workers for each node type.
|
| 93 |
+
desired_num_workers: Dict[NodeType, int] = field(default_factory=dict)
|
| 94 |
+
# The workers to delete for each node type.
|
| 95 |
+
workers_to_delete: Dict[NodeType, List[CloudInstanceId]] = field(
|
| 96 |
+
default_factory=dict
|
| 97 |
+
)
|
| 98 |
+
# The worker groups with empty workersToDelete field.
|
| 99 |
+
# This is needed since we will also need to clear the workersToDelete field
|
| 100 |
+
# for the worker groups that have finished deletes.
|
| 101 |
+
worker_groups_without_pending_deletes: Set[NodeType] = field(
|
| 102 |
+
default_factory=set
|
| 103 |
+
)
|
| 104 |
+
# The worker groups that still have workers to be deleted.
|
| 105 |
+
worker_groups_with_pending_deletes: Set[NodeType] = field(default_factory=set)
|
| 106 |
+
|
| 107 |
+
################################
|
| 108 |
+
# Interface for ICloudInstanceProvider
|
| 109 |
+
################################
|
| 110 |
+
|
| 111 |
+
def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
|
| 112 |
+
self._sync_with_api_server()
|
| 113 |
+
return copy.deepcopy(
|
| 114 |
+
{id: instance for id, instance in self._cached_instances.items()}
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
|
| 118 |
+
if request_id in self._requests:
|
| 119 |
+
# This request is already processed.
|
| 120 |
+
logger.warning(f"Request {request_id} is already processed for: {ids}")
|
| 121 |
+
return
|
| 122 |
+
self._requests.add(request_id)
|
| 123 |
+
logger.info("Terminating worker pods: {}".format(ids))
|
| 124 |
+
|
| 125 |
+
scale_request = self._initialize_scale_request(
|
| 126 |
+
to_launch={}, to_delete_instances=ids
|
| 127 |
+
)
|
| 128 |
+
if scale_request.worker_groups_with_pending_deletes:
|
| 129 |
+
errors_msg = (
|
| 130 |
+
"There are workers to be deleted from: "
|
| 131 |
+
f"{scale_request.worker_groups_with_pending_deletes}. "
|
| 132 |
+
"Waiting for them to be deleted before adding new workers "
|
| 133 |
+
" to be deleted"
|
| 134 |
+
)
|
| 135 |
+
logger.warning(errors_msg)
|
| 136 |
+
self._add_terminate_errors(
|
| 137 |
+
ids,
|
| 138 |
+
request_id,
|
| 139 |
+
details=errors_msg,
|
| 140 |
+
)
|
| 141 |
+
return
|
| 142 |
+
|
| 143 |
+
try:
|
| 144 |
+
self._submit_scale_request(scale_request)
|
| 145 |
+
except Exception as e:
|
| 146 |
+
logger.exception(f"Error terminating nodes: {scale_request}")
|
| 147 |
+
self._add_terminate_errors(ids, request_id, details=str(e), e=e)
|
| 148 |
+
|
| 149 |
+
def launch(self, shape: Dict[NodeType, int], request_id: str) -> None:
|
| 150 |
+
if request_id in self._requests:
|
| 151 |
+
# This request is already processed.
|
| 152 |
+
return
|
| 153 |
+
self._requests.add(request_id)
|
| 154 |
+
|
| 155 |
+
scale_request = self._initialize_scale_request(
|
| 156 |
+
to_launch=shape, to_delete_instances=[]
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
if scale_request.worker_groups_with_pending_deletes:
|
| 160 |
+
error_msg = (
|
| 161 |
+
"There are workers to be deleted from: "
|
| 162 |
+
f"{scale_request.worker_groups_with_pending_deletes}. "
|
| 163 |
+
"Waiting for them to be deleted before creating new workers."
|
| 164 |
+
)
|
| 165 |
+
logger.warning(error_msg)
|
| 166 |
+
self._add_launch_errors(
|
| 167 |
+
shape,
|
| 168 |
+
request_id,
|
| 169 |
+
details=error_msg,
|
| 170 |
+
)
|
| 171 |
+
return
|
| 172 |
+
|
| 173 |
+
try:
|
| 174 |
+
self._submit_scale_request(scale_request)
|
| 175 |
+
except Exception as e:
|
| 176 |
+
logger.exception(f"Error launching nodes: {scale_request}")
|
| 177 |
+
self._add_launch_errors(shape, request_id, details=str(e), e=e)
|
| 178 |
+
|
| 179 |
+
def poll_errors(self) -> List[CloudInstanceProviderError]:
|
| 180 |
+
errors = []
|
| 181 |
+
errors += self._launch_errors_queue
|
| 182 |
+
errors += self._terminate_errors_queue
|
| 183 |
+
self._launch_errors_queue = []
|
| 184 |
+
self._terminate_errors_queue = []
|
| 185 |
+
return errors
|
| 186 |
+
|
| 187 |
+
############################
|
| 188 |
+
# Private
|
| 189 |
+
############################
|
| 190 |
+
|
| 191 |
+
def _initialize_scale_request(
|
| 192 |
+
self, to_launch: Dict[NodeType, int], to_delete_instances: List[CloudInstanceId]
|
| 193 |
+
) -> "KubeRayProvider.ScaleRequest":
|
| 194 |
+
"""
|
| 195 |
+
Initialize the scale request based on the current state of the cluster and
|
| 196 |
+
the desired state (to launch, to delete).
|
| 197 |
+
|
| 198 |
+
Args:
|
| 199 |
+
to_launch: The desired number of workers to launch for each node type.
|
| 200 |
+
to_delete_instances: The instances to delete.
|
| 201 |
+
|
| 202 |
+
Returns:
|
| 203 |
+
The scale request.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
# Update the cached states.
|
| 207 |
+
self._sync_with_api_server()
|
| 208 |
+
ray_cluster = self.ray_cluster
|
| 209 |
+
cur_instances = self.instances
|
| 210 |
+
|
| 211 |
+
# Get the worker groups that have pending deletes and the worker groups that
|
| 212 |
+
# have finished deletes, and the set of workers included in the workersToDelete
|
| 213 |
+
# field of any worker group.
|
| 214 |
+
(
|
| 215 |
+
worker_groups_with_pending_deletes,
|
| 216 |
+
worker_groups_without_pending_deletes,
|
| 217 |
+
worker_to_delete_set,
|
| 218 |
+
) = self._get_workers_delete_info(ray_cluster, set(cur_instances.keys()))
|
| 219 |
+
|
| 220 |
+
# Calculate the desired number of workers by type.
|
| 221 |
+
num_workers_dict = defaultdict(int)
|
| 222 |
+
worker_groups = ray_cluster["spec"].get("workerGroupSpecs", [])
|
| 223 |
+
for worker_group in worker_groups:
|
| 224 |
+
node_type = worker_group["groupName"]
|
| 225 |
+
# Handle the case where users manually increase `minReplicas`
|
| 226 |
+
# to scale up the number of worker Pods. In this scenario,
|
| 227 |
+
# `replicas` will be smaller than `minReplicas`.
|
| 228 |
+
num_workers_dict[node_type] = max(
|
| 229 |
+
worker_group["replicas"], worker_group["minReplicas"]
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
# Add to launch nodes.
|
| 233 |
+
for node_type, count in to_launch.items():
|
| 234 |
+
num_workers_dict[node_type] += count
|
| 235 |
+
|
| 236 |
+
to_delete_instances_by_type = defaultdict(list)
|
| 237 |
+
# Update the number of workers with to_delete_instances
|
| 238 |
+
# and group them by type.
|
| 239 |
+
for to_delete_id in to_delete_instances:
|
| 240 |
+
to_delete_instance = cur_instances.get(to_delete_id, None)
|
| 241 |
+
if to_delete_instance is None:
|
| 242 |
+
# This instance has already been deleted.
|
| 243 |
+
continue
|
| 244 |
+
|
| 245 |
+
if to_delete_instance.node_kind == NodeKind.HEAD:
|
| 246 |
+
# Not possible to delete head node.
|
| 247 |
+
continue
|
| 248 |
+
|
| 249 |
+
if to_delete_instance.cloud_instance_id in worker_to_delete_set:
|
| 250 |
+
# If the instance is already in the workersToDelete field of
|
| 251 |
+
# any worker group, skip it.
|
| 252 |
+
continue
|
| 253 |
+
|
| 254 |
+
num_workers_dict[to_delete_instance.node_type] -= 1
|
| 255 |
+
assert num_workers_dict[to_delete_instance.node_type] >= 0
|
| 256 |
+
to_delete_instances_by_type[to_delete_instance.node_type].append(
|
| 257 |
+
to_delete_instance
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
scale_request = KubeRayProvider.ScaleRequest(
|
| 261 |
+
desired_num_workers=num_workers_dict,
|
| 262 |
+
workers_to_delete=to_delete_instances_by_type,
|
| 263 |
+
worker_groups_without_pending_deletes=worker_groups_without_pending_deletes,
|
| 264 |
+
worker_groups_with_pending_deletes=worker_groups_with_pending_deletes,
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
return scale_request
|
| 268 |
+
|
| 269 |
+
def _submit_scale_request(
|
| 270 |
+
self, scale_request: "KubeRayProvider.ScaleRequest"
|
| 271 |
+
) -> None:
|
| 272 |
+
"""Submits a scale request to the Kubernetes API server.
|
| 273 |
+
|
| 274 |
+
This method will convert the scale request to patches and submit the patches
|
| 275 |
+
to the Kubernetes API server.
|
| 276 |
+
|
| 277 |
+
Args:
|
| 278 |
+
scale_request: The scale request.
|
| 279 |
+
|
| 280 |
+
Raises:
|
| 281 |
+
Exception: An exception is raised if the Kubernetes API server returns an
|
| 282 |
+
error.
|
| 283 |
+
"""
|
| 284 |
+
# Get the current ray cluster spec.
|
| 285 |
+
patch_payload = []
|
| 286 |
+
|
| 287 |
+
raycluster = self.ray_cluster
|
| 288 |
+
|
| 289 |
+
# Collect patches for replica counts.
|
| 290 |
+
for node_type, target_replicas in scale_request.desired_num_workers.items():
|
| 291 |
+
group_index = _worker_group_index(raycluster, node_type)
|
| 292 |
+
group_max_replicas = _worker_group_max_replicas(raycluster, group_index)
|
| 293 |
+
# Cap the replica count to maxReplicas.
|
| 294 |
+
if group_max_replicas is not None and group_max_replicas < target_replicas:
|
| 295 |
+
logger.warning(
|
| 296 |
+
"Autoscaler attempted to create "
|
| 297 |
+
+ "more than maxReplicas pods of type {}.".format(node_type)
|
| 298 |
+
)
|
| 299 |
+
target_replicas = group_max_replicas
|
| 300 |
+
# Check if we need to change the target count.
|
| 301 |
+
if target_replicas == _worker_group_replicas(raycluster, group_index):
|
| 302 |
+
# No patch required.
|
| 303 |
+
continue
|
| 304 |
+
# Need to patch replica count. Format the patch and add it to the payload.
|
| 305 |
+
patch = worker_replica_patch(group_index, target_replicas)
|
| 306 |
+
patch_payload.append(patch)
|
| 307 |
+
|
| 308 |
+
# Maps node_type to nodes to delete for that group.
|
| 309 |
+
for (
|
| 310 |
+
node_type,
|
| 311 |
+
workers_to_delete_of_type,
|
| 312 |
+
) in scale_request.workers_to_delete.items():
|
| 313 |
+
group_index = _worker_group_index(raycluster, node_type)
|
| 314 |
+
worker_ids_to_delete = [
|
| 315 |
+
worker.cloud_instance_id for worker in workers_to_delete_of_type
|
| 316 |
+
]
|
| 317 |
+
patch = worker_delete_patch(group_index, worker_ids_to_delete)
|
| 318 |
+
patch_payload.append(patch)
|
| 319 |
+
|
| 320 |
+
# Clear the workersToDelete field for the worker groups that have been deleted.
|
| 321 |
+
for node_type in scale_request.worker_groups_without_pending_deletes:
|
| 322 |
+
if node_type in scale_request.workers_to_delete:
|
| 323 |
+
# This node type is still being deleted.
|
| 324 |
+
continue
|
| 325 |
+
group_index = _worker_group_index(raycluster, node_type)
|
| 326 |
+
patch = worker_delete_patch(group_index, [])
|
| 327 |
+
patch_payload.append(patch)
|
| 328 |
+
|
| 329 |
+
if len(patch_payload) == 0:
|
| 330 |
+
# No patch required.
|
| 331 |
+
return
|
| 332 |
+
|
| 333 |
+
logger.info(f"Submitting a scale request: {scale_request}")
|
| 334 |
+
self._patch(f"rayclusters/{self._cluster_name}", patch_payload)
|
| 335 |
+
|
| 336 |
+
def _add_launch_errors(
|
| 337 |
+
self,
|
| 338 |
+
shape: Dict[NodeType, int],
|
| 339 |
+
request_id: str,
|
| 340 |
+
details: str,
|
| 341 |
+
e: Optional[Exception] = None,
|
| 342 |
+
) -> None:
|
| 343 |
+
"""
|
| 344 |
+
Adds launch errors to the error queue.
|
| 345 |
+
|
| 346 |
+
Args:
|
| 347 |
+
shape: The shape of the nodes that failed to launch.
|
| 348 |
+
request_id: The request id of the launch request.
|
| 349 |
+
details: The details of the error.
|
| 350 |
+
e: The exception that caused the error.
|
| 351 |
+
"""
|
| 352 |
+
for node_type, count in shape.items():
|
| 353 |
+
self._launch_errors_queue.append(
|
| 354 |
+
LaunchNodeError(
|
| 355 |
+
node_type=node_type,
|
| 356 |
+
timestamp_ns=time.time_ns(),
|
| 357 |
+
count=count,
|
| 358 |
+
request_id=request_id,
|
| 359 |
+
details=details,
|
| 360 |
+
cause=e,
|
| 361 |
+
)
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
+
def _add_terminate_errors(
|
| 365 |
+
self,
|
| 366 |
+
ids: List[CloudInstanceId],
|
| 367 |
+
request_id: str,
|
| 368 |
+
details: str,
|
| 369 |
+
e: Optional[Exception] = None,
|
| 370 |
+
) -> None:
|
| 371 |
+
"""
|
| 372 |
+
Adds terminate errors to the error queue.
|
| 373 |
+
|
| 374 |
+
Args:
|
| 375 |
+
ids: The ids of the nodes that failed to terminate.
|
| 376 |
+
request_id: The request id of the terminate request.
|
| 377 |
+
details: The details of the error.
|
| 378 |
+
e: The exception that caused the error.
|
| 379 |
+
"""
|
| 380 |
+
for id in ids:
|
| 381 |
+
self._terminate_errors_queue.append(
|
| 382 |
+
TerminateNodeError(
|
| 383 |
+
cloud_instance_id=id,
|
| 384 |
+
timestamp_ns=time.time_ns(),
|
| 385 |
+
request_id=request_id,
|
| 386 |
+
details=details,
|
| 387 |
+
cause=e,
|
| 388 |
+
)
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
def _sync_with_api_server(self) -> None:
|
| 392 |
+
"""Fetches the RayCluster resource from the Kubernetes API server."""
|
| 393 |
+
self._ray_cluster = self._get(f"rayclusters/{self._cluster_name}")
|
| 394 |
+
self._cached_instances = self._fetch_instances()
|
| 395 |
+
|
| 396 |
+
@property
|
| 397 |
+
def ray_cluster(self) -> Dict[str, Any]:
|
| 398 |
+
return copy.deepcopy(self._ray_cluster)
|
| 399 |
+
|
| 400 |
+
@property
|
| 401 |
+
def instances(self) -> Dict[CloudInstanceId, CloudInstance]:
|
| 402 |
+
return copy.deepcopy(self._cached_instances)
|
| 403 |
+
|
| 404 |
+
@staticmethod
|
| 405 |
+
def _get_workers_delete_info(
|
| 406 |
+
ray_cluster_spec: Dict[str, Any], node_set: Set[CloudInstanceId]
|
| 407 |
+
) -> Tuple[Set[NodeType], Set[NodeType], Set[CloudInstanceId]]:
|
| 408 |
+
"""
|
| 409 |
+
Gets the worker groups that have pending deletes and the worker groups that
|
| 410 |
+
have finished deletes.
|
| 411 |
+
|
| 412 |
+
Returns:
|
| 413 |
+
worker_groups_with_pending_deletes: The worker groups that have pending
|
| 414 |
+
deletes.
|
| 415 |
+
worker_groups_with_finished_deletes: The worker groups that have finished
|
| 416 |
+
deletes.
|
| 417 |
+
worker_to_delete_set: A set of Pods that are included in the workersToDelete
|
| 418 |
+
field of any worker group.
|
| 419 |
+
"""
|
| 420 |
+
|
| 421 |
+
worker_groups_with_pending_deletes = set()
|
| 422 |
+
worker_groups_with_deletes = set()
|
| 423 |
+
worker_to_delete_set = set()
|
| 424 |
+
|
| 425 |
+
worker_groups = ray_cluster_spec["spec"].get("workerGroupSpecs", [])
|
| 426 |
+
for worker_group in worker_groups:
|
| 427 |
+
workersToDelete = worker_group.get("scaleStrategy", {}).get(
|
| 428 |
+
"workersToDelete", []
|
| 429 |
+
)
|
| 430 |
+
if not workersToDelete:
|
| 431 |
+
# No workers to delete in this group.
|
| 432 |
+
continue
|
| 433 |
+
|
| 434 |
+
node_type = worker_group["groupName"]
|
| 435 |
+
worker_groups_with_deletes.add(node_type)
|
| 436 |
+
|
| 437 |
+
for worker in workersToDelete:
|
| 438 |
+
worker_to_delete_set.add(worker)
|
| 439 |
+
if worker in node_set:
|
| 440 |
+
worker_groups_with_pending_deletes.add(node_type)
|
| 441 |
+
break
|
| 442 |
+
|
| 443 |
+
worker_groups_with_finished_deletes = (
|
| 444 |
+
worker_groups_with_deletes - worker_groups_with_pending_deletes
|
| 445 |
+
)
|
| 446 |
+
return (
|
| 447 |
+
worker_groups_with_pending_deletes,
|
| 448 |
+
worker_groups_with_finished_deletes,
|
| 449 |
+
worker_to_delete_set,
|
| 450 |
+
)
|
| 451 |
+
|
| 452 |
+
def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]:
|
| 453 |
+
"""
|
| 454 |
+
Fetches the pods from the Kubernetes API server and convert them to Ray
|
| 455 |
+
CloudInstance.
|
| 456 |
+
|
| 457 |
+
Returns:
|
| 458 |
+
A dict of CloudInstanceId to CloudInstance.
|
| 459 |
+
"""
|
| 460 |
+
# Get the pods resource version.
|
| 461 |
+
# Specifying a resource version in list requests is important for scalability:
|
| 462 |
+
# https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list
|
| 463 |
+
resource_version = self._get_head_pod_resource_version()
|
| 464 |
+
if resource_version:
|
| 465 |
+
logger.info(
|
| 466 |
+
f"Listing pods for RayCluster {self._cluster_name}"
|
| 467 |
+
f" in namespace {self._namespace}"
|
| 468 |
+
f" at pods resource version >= {resource_version}."
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
# Filter pods by cluster_name.
|
| 472 |
+
label_selector = requests.utils.quote(f"ray.io/cluster={self._cluster_name}")
|
| 473 |
+
|
| 474 |
+
resource_path = f"pods?labelSelector={label_selector}"
|
| 475 |
+
if resource_version:
|
| 476 |
+
resource_path += (
|
| 477 |
+
f"&resourceVersion={resource_version}"
|
| 478 |
+
+ "&resourceVersionMatch=NotOlderThan"
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
pod_list = self._get(resource_path)
|
| 482 |
+
fetched_resource_version = pod_list["metadata"]["resourceVersion"]
|
| 483 |
+
logger.info(
|
| 484 |
+
f"Fetched pod data at resource version" f" {fetched_resource_version}."
|
| 485 |
+
)
|
| 486 |
+
|
| 487 |
+
# Extract node data from the pod list.
|
| 488 |
+
cloud_instances = {}
|
| 489 |
+
for pod in pod_list["items"]:
|
| 490 |
+
# Kubernetes sets metadata.deletionTimestamp immediately after admitting a
|
| 491 |
+
# request to delete an object. Full removal of the object may take some time
|
| 492 |
+
# after the deletion timestamp is set. See link for details:
|
| 493 |
+
# https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion
|
| 494 |
+
if "deletionTimestamp" in pod["metadata"]:
|
| 495 |
+
# Ignore pods marked for termination.
|
| 496 |
+
continue
|
| 497 |
+
pod_name = pod["metadata"]["name"]
|
| 498 |
+
cloud_instance = self._cloud_instance_from_pod(pod)
|
| 499 |
+
if cloud_instance:
|
| 500 |
+
cloud_instances[pod_name] = cloud_instance
|
| 501 |
+
return cloud_instances
|
| 502 |
+
|
| 503 |
+
@staticmethod
|
| 504 |
+
def _cloud_instance_from_pod(pod: Dict[str, Any]) -> Optional[CloudInstance]:
|
| 505 |
+
"""
|
| 506 |
+
Convert a pod to a Ray CloudInstance.
|
| 507 |
+
|
| 508 |
+
Args:
|
| 509 |
+
pod: The pod resource dict.
|
| 510 |
+
"""
|
| 511 |
+
labels = pod["metadata"]["labels"]
|
| 512 |
+
if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD:
|
| 513 |
+
kind = NodeKind.HEAD
|
| 514 |
+
type = labels[KUBERAY_LABEL_KEY_TYPE]
|
| 515 |
+
elif labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_WORKER:
|
| 516 |
+
kind = NodeKind.WORKER
|
| 517 |
+
type = labels[KUBERAY_LABEL_KEY_TYPE]
|
| 518 |
+
else:
|
| 519 |
+
# Other ray nodes types defined by KubeRay.
|
| 520 |
+
# e.g. this could also be `redis-cleanup`
|
| 521 |
+
# We will not track these nodes.
|
| 522 |
+
return None
|
| 523 |
+
|
| 524 |
+
# TODO: we should prob get from the pod's env var (RAY_CLOUD_INSTANCE_ID)
|
| 525 |
+
# directly.
|
| 526 |
+
cloud_instance_id = pod["metadata"]["name"]
|
| 527 |
+
return CloudInstance(
|
| 528 |
+
cloud_instance_id=cloud_instance_id,
|
| 529 |
+
node_type=type,
|
| 530 |
+
node_kind=kind,
|
| 531 |
+
is_running=KubeRayProvider._is_running(pod),
|
| 532 |
+
)
|
| 533 |
+
|
| 534 |
+
@staticmethod
|
| 535 |
+
def _is_running(pod) -> bool:
|
| 536 |
+
"""Convert pod state to Ray NodeStatus
|
| 537 |
+
|
| 538 |
+
A cloud instance is considered running if the pod is in the running state,
|
| 539 |
+
else it could be pending/containers-terminated.
|
| 540 |
+
|
| 541 |
+
When it disappears from the list, it is considered terminated.
|
| 542 |
+
"""
|
| 543 |
+
if (
|
| 544 |
+
"containerStatuses" not in pod["status"]
|
| 545 |
+
or not pod["status"]["containerStatuses"]
|
| 546 |
+
):
|
| 547 |
+
return False
|
| 548 |
+
|
| 549 |
+
state = pod["status"]["containerStatuses"][0]["state"]
|
| 550 |
+
if "running" in state:
|
| 551 |
+
return True
|
| 552 |
+
|
| 553 |
+
return False
|
| 554 |
+
|
| 555 |
+
def _get(self, remote_path: str) -> Dict[str, Any]:
|
| 556 |
+
"""Get a resource from the Kubernetes API server."""
|
| 557 |
+
return self._k8s_api_client.get(remote_path)
|
| 558 |
+
|
| 559 |
+
def _patch(self, remote_path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 560 |
+
"""Patch a resource on the Kubernetes API server."""
|
| 561 |
+
return self._k8s_api_client.patch(remote_path, payload)
|
| 562 |
+
|
| 563 |
+
def _get_head_pod_resource_version(self) -> str:
|
| 564 |
+
"""
|
| 565 |
+
Extract a recent pods resource version by reading the head pod's
|
| 566 |
+
metadata.resourceVersion of the response.
|
| 567 |
+
"""
|
| 568 |
+
if not RAY_HEAD_POD_NAME:
|
| 569 |
+
return None
|
| 570 |
+
pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}")
|
| 571 |
+
return pod_resp["metadata"]["resourceVersion"]
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (233 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/cloud_provider.cpython-311.pyc
ADDED
|
Binary file (3.81 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/cloud_provider.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List
|
| 2 |
+
|
| 3 |
+
from ray._private.utils import binary_to_hex
|
| 4 |
+
from ray._raylet import GcsClient
|
| 5 |
+
from ray.autoscaler._private.util import format_readonly_node_type
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.node_provider import (
|
| 7 |
+
CloudInstance,
|
| 8 |
+
CloudInstanceId,
|
| 9 |
+
CloudInstanceProviderError,
|
| 10 |
+
ICloudInstanceProvider,
|
| 11 |
+
NodeKind,
|
| 12 |
+
)
|
| 13 |
+
from ray.autoscaler.v2.sdk import get_cluster_resource_state
|
| 14 |
+
from ray.autoscaler.v2.utils import is_head_node
|
| 15 |
+
from ray.core.generated.autoscaler_pb2 import NodeStatus
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ReadOnlyProvider(ICloudInstanceProvider):
|
| 19 |
+
"""
|
| 20 |
+
A read only provider that use the ray node states from the GCS as the
|
| 21 |
+
cloud instances.
|
| 22 |
+
|
| 23 |
+
This is used for laptop mode / manual cluster setup modes, in order to
|
| 24 |
+
provide status reporting in the same way for users.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(self, provider_config: dict):
|
| 28 |
+
self._provider_config = provider_config
|
| 29 |
+
self._gcs_address = provider_config["gcs_address"]
|
| 30 |
+
|
| 31 |
+
self._gcs_client = GcsClient(address=self._gcs_address)
|
| 32 |
+
|
| 33 |
+
def get_non_terminated(self) -> Dict[str, CloudInstance]:
|
| 34 |
+
cluster_resource_state = get_cluster_resource_state(self._gcs_client)
|
| 35 |
+
cloud_instances = {}
|
| 36 |
+
for gcs_node_state in cluster_resource_state.node_states:
|
| 37 |
+
if gcs_node_state.status == NodeStatus.DEAD:
|
| 38 |
+
# Skip dead nodes.
|
| 39 |
+
continue
|
| 40 |
+
|
| 41 |
+
# Use node's node id if instance id is not available
|
| 42 |
+
cloud_instance_id = (
|
| 43 |
+
gcs_node_state.instance_id
|
| 44 |
+
if gcs_node_state.instance_id
|
| 45 |
+
else binary_to_hex(gcs_node_state.node_id)
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
# TODO: we should add a field to the proto to indicate if the node is head
|
| 49 |
+
# or not.
|
| 50 |
+
is_head = is_head_node(gcs_node_state)
|
| 51 |
+
|
| 52 |
+
cloud_instances[cloud_instance_id] = CloudInstance(
|
| 53 |
+
cloud_instance_id=cloud_instance_id,
|
| 54 |
+
node_kind=NodeKind.HEAD if is_head else NodeKind.WORKER,
|
| 55 |
+
node_type=format_readonly_node_type(
|
| 56 |
+
binary_to_hex(gcs_node_state.node_id) # Legacy behavior.
|
| 57 |
+
),
|
| 58 |
+
is_running=True,
|
| 59 |
+
request_id="",
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
return cloud_instances
|
| 63 |
+
|
| 64 |
+
def terminate(self, instance_id: CloudInstanceId) -> None:
|
| 65 |
+
raise NotImplementedError("Cannot terminate instances in read-only mode.")
|
| 66 |
+
|
| 67 |
+
def launch(
|
| 68 |
+
self, shape: Dict[CloudInstanceId, int], request_id: CloudInstanceId
|
| 69 |
+
) -> None:
|
| 70 |
+
raise NotImplementedError("Cannot launch instances in read-only mode.")
|
| 71 |
+
|
| 72 |
+
def poll_errors(self) -> List[CloudInstanceProviderError]:
|
| 73 |
+
return []
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (219 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/cloud_instance_updater.cpython-311.pyc
ADDED
|
Binary file (5.15 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/ray_stopper.cpython-311.pyc
ADDED
|
Binary file (7.54 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/threaded_ray_installer.cpython-311.pyc
ADDED
|
Binary file (5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/cloud_instance_updater.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import uuid
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.instance_manager import (
|
| 7 |
+
InstanceUpdatedSubscriber,
|
| 8 |
+
)
|
| 9 |
+
from ray.autoscaler.v2.instance_manager.node_provider import ICloudInstanceProvider
|
| 10 |
+
from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class CloudInstanceUpdater(InstanceUpdatedSubscriber):
|
| 16 |
+
"""CloudInstanceUpdater is responsible for launching
|
| 17 |
+
new instances and terminating cloud instances
|
| 18 |
+
|
| 19 |
+
It requests the cloud instance provider to launch new instances when
|
| 20 |
+
there are new instance requests (with REQUESTED status change).
|
| 21 |
+
|
| 22 |
+
It requests the cloud instance provider to terminate instances when
|
| 23 |
+
there are new instance terminations (with TERMINATING status change).
|
| 24 |
+
|
| 25 |
+
The cloud instance APIs are async and non-blocking.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
cloud_provider: ICloudInstanceProvider,
|
| 31 |
+
) -> None:
|
| 32 |
+
self._cloud_provider = cloud_provider
|
| 33 |
+
|
| 34 |
+
def notify(self, events: List[InstanceUpdateEvent]) -> None:
|
| 35 |
+
new_requests = [
|
| 36 |
+
event for event in events if event.new_instance_status == Instance.REQUESTED
|
| 37 |
+
]
|
| 38 |
+
new_terminations = [
|
| 39 |
+
event
|
| 40 |
+
for event in events
|
| 41 |
+
if event.new_instance_status == Instance.TERMINATING
|
| 42 |
+
]
|
| 43 |
+
self._launch_new_instances(new_requests)
|
| 44 |
+
self._terminate_instances(new_terminations)
|
| 45 |
+
|
| 46 |
+
def _terminate_instances(self, new_terminations: List[InstanceUpdateEvent]):
|
| 47 |
+
"""
|
| 48 |
+
Terminate cloud instances through cloud provider.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
new_terminations: List of new instance terminations.
|
| 52 |
+
"""
|
| 53 |
+
if not new_terminations:
|
| 54 |
+
logger.debug("No instances to terminate.")
|
| 55 |
+
return
|
| 56 |
+
|
| 57 |
+
# Terminate the instances.
|
| 58 |
+
cloud_instance_ids = [event.cloud_instance_id for event in new_terminations]
|
| 59 |
+
|
| 60 |
+
# This is an async call.
|
| 61 |
+
self._cloud_provider.terminate(
|
| 62 |
+
ids=cloud_instance_ids, request_id=str(uuid.uuid4())
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
def _launch_new_instances(self, new_requests: List[InstanceUpdateEvent]):
|
| 66 |
+
"""
|
| 67 |
+
Launches new instances by requesting the cloud provider.
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
new_requests: List of new instance requests.
|
| 71 |
+
|
| 72 |
+
"""
|
| 73 |
+
if not new_requests:
|
| 74 |
+
logger.debug("No instances to launch.")
|
| 75 |
+
return
|
| 76 |
+
|
| 77 |
+
# Group new requests by launch request id.
|
| 78 |
+
requests_by_launch_request_id = defaultdict(list)
|
| 79 |
+
|
| 80 |
+
for event in new_requests:
|
| 81 |
+
assert (
|
| 82 |
+
event.launch_request_id
|
| 83 |
+
), "Launch request id should have been set by the reconciler"
|
| 84 |
+
requests_by_launch_request_id[event.launch_request_id].append(event)
|
| 85 |
+
|
| 86 |
+
for launch_request_id, events in requests_by_launch_request_id.items():
|
| 87 |
+
request_shape = defaultdict(int)
|
| 88 |
+
for event in events:
|
| 89 |
+
request_shape[event.instance_type] += 1
|
| 90 |
+
# Make requests to the cloud provider.
|
| 91 |
+
self._cloud_provider.launch(
|
| 92 |
+
shape=request_shape, request_id=launch_request_id
|
| 93 |
+
)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/ray_stopper.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from queue import Queue
|
| 5 |
+
from typing import List
|
| 6 |
+
|
| 7 |
+
from ray._private.utils import hex_to_binary
|
| 8 |
+
from ray._raylet import GcsClient
|
| 9 |
+
from ray.autoscaler.v2.instance_manager.instance_manager import (
|
| 10 |
+
InstanceUpdatedSubscriber,
|
| 11 |
+
)
|
| 12 |
+
from ray.core.generated.autoscaler_pb2 import DrainNodeReason
|
| 13 |
+
from ray.core.generated.instance_manager_pb2 import (
|
| 14 |
+
Instance,
|
| 15 |
+
InstanceUpdateEvent,
|
| 16 |
+
TerminationRequest,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass(frozen=True)
|
| 23 |
+
class RayStopError:
|
| 24 |
+
# Instance manager's instance id.
|
| 25 |
+
im_instance_id: str
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class RayStopper(InstanceUpdatedSubscriber):
|
| 29 |
+
"""RayStopper is responsible for stopping ray on instances.
|
| 30 |
+
|
| 31 |
+
It will drain the ray node if it's for idle termination.
|
| 32 |
+
For other terminations, it will stop the ray node. (e.g. scale down, etc.)
|
| 33 |
+
|
| 34 |
+
If any failures happen when stopping/draining the node, we will not retry
|
| 35 |
+
and rely on the reconciler to handle the failure.
|
| 36 |
+
|
| 37 |
+
TODO: we could also surface the errors back to the reconciler for
|
| 38 |
+
quicker failure detection.
|
| 39 |
+
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def __init__(self, gcs_client: GcsClient, error_queue: Queue) -> None:
|
| 43 |
+
self._gcs_client = gcs_client
|
| 44 |
+
self._error_queue = error_queue
|
| 45 |
+
self._executor = ThreadPoolExecutor(max_workers=1)
|
| 46 |
+
|
| 47 |
+
def notify(self, events: List[InstanceUpdateEvent]) -> None:
|
| 48 |
+
for event in events:
|
| 49 |
+
if event.new_instance_status == Instance.RAY_STOP_REQUESTED:
|
| 50 |
+
fut = self._executor.submit(self._stop_or_drain_ray, event)
|
| 51 |
+
|
| 52 |
+
def _log_on_error(fut):
|
| 53 |
+
try:
|
| 54 |
+
fut.result()
|
| 55 |
+
except Exception:
|
| 56 |
+
logger.exception("Error stopping/drain ray.")
|
| 57 |
+
|
| 58 |
+
fut.add_done_callback(_log_on_error)
|
| 59 |
+
|
| 60 |
+
def _stop_or_drain_ray(self, event: InstanceUpdateEvent) -> None:
|
| 61 |
+
"""
|
| 62 |
+
Stops or drains the ray node based on the termination request.
|
| 63 |
+
"""
|
| 64 |
+
assert event.HasField("termination_request"), "Termination request is required."
|
| 65 |
+
termination_request = event.termination_request
|
| 66 |
+
ray_node_id = termination_request.ray_node_id
|
| 67 |
+
instance_id = event.instance_id
|
| 68 |
+
|
| 69 |
+
if termination_request.cause == TerminationRequest.Cause.IDLE:
|
| 70 |
+
reason = DrainNodeReason.DRAIN_NODE_REASON_IDLE_TERMINATION
|
| 71 |
+
reason_str = "Termination of node that's idle for {} seconds.".format(
|
| 72 |
+
termination_request.idle_duration_ms / 1000
|
| 73 |
+
)
|
| 74 |
+
self._drain_ray_node(
|
| 75 |
+
self._gcs_client,
|
| 76 |
+
self._error_queue,
|
| 77 |
+
ray_node_id,
|
| 78 |
+
instance_id,
|
| 79 |
+
reason,
|
| 80 |
+
reason_str,
|
| 81 |
+
)
|
| 82 |
+
return
|
| 83 |
+
|
| 84 |
+
# If it's not an idle termination, we stop the ray node.
|
| 85 |
+
self._stop_ray_node(
|
| 86 |
+
self._gcs_client, self._error_queue, ray_node_id, instance_id
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
@staticmethod
|
| 90 |
+
def _drain_ray_node(
|
| 91 |
+
gcs_client: GcsClient,
|
| 92 |
+
error_queue: Queue,
|
| 93 |
+
ray_node_id: str,
|
| 94 |
+
instance_id: str,
|
| 95 |
+
reason: DrainNodeReason,
|
| 96 |
+
reason_str: str,
|
| 97 |
+
):
|
| 98 |
+
"""
|
| 99 |
+
Drains the ray node.
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
gcs_client: The gcs client to use.
|
| 103 |
+
ray_node_id: The ray node id to drain.
|
| 104 |
+
reason: The reason to drain the node.
|
| 105 |
+
reason_str: The reason message to drain the node.
|
| 106 |
+
"""
|
| 107 |
+
try:
|
| 108 |
+
accepted, reject_msg_str = gcs_client.drain_node(
|
| 109 |
+
node_id=ray_node_id,
|
| 110 |
+
reason=reason,
|
| 111 |
+
reason_message=reason_str,
|
| 112 |
+
# TODO: we could probably add a deadline here that's derived
|
| 113 |
+
# from the stuck instance reconciliation configs.
|
| 114 |
+
deadline_timestamp_ms=0,
|
| 115 |
+
)
|
| 116 |
+
logger.info(
|
| 117 |
+
f"Drained ray on {ray_node_id}(success={accepted}, "
|
| 118 |
+
f"msg={reject_msg_str})"
|
| 119 |
+
)
|
| 120 |
+
if not accepted:
|
| 121 |
+
error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
|
| 122 |
+
except Exception:
|
| 123 |
+
logger.exception(f"Error draining ray on {ray_node_id}")
|
| 124 |
+
error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
|
| 125 |
+
|
| 126 |
+
@staticmethod
|
| 127 |
+
def _stop_ray_node(
|
| 128 |
+
gcs_client: GcsClient,
|
| 129 |
+
error_queue: Queue,
|
| 130 |
+
ray_node_id: str,
|
| 131 |
+
instance_id: str,
|
| 132 |
+
):
|
| 133 |
+
"""
|
| 134 |
+
Stops the ray node.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
gcs_client: The gcs client to use.
|
| 138 |
+
ray_node_id: The ray node id to stop.
|
| 139 |
+
"""
|
| 140 |
+
try:
|
| 141 |
+
drained = gcs_client.drain_nodes(node_ids=[hex_to_binary(ray_node_id)])
|
| 142 |
+
success = len(drained) > 0
|
| 143 |
+
logger.info(
|
| 144 |
+
f"Stopping ray on {ray_node_id}(instance={instance_id}): "
|
| 145 |
+
f"success={success})"
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
if not success:
|
| 149 |
+
error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
|
| 150 |
+
except Exception:
|
| 151 |
+
logger.exception(
|
| 152 |
+
f"Error stopping ray on {ray_node_id}(instance={instance_id})"
|
| 153 |
+
)
|
| 154 |
+
error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.instance_manager import (
|
| 7 |
+
InstanceUpdatedSubscriber,
|
| 8 |
+
)
|
| 9 |
+
from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
|
| 10 |
+
from ray.autoscaler.v2.instance_manager.ray_installer import RayInstaller
|
| 11 |
+
from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ThreadedRayInstaller(InstanceUpdatedSubscriber):
|
| 17 |
+
"""ThreadedRayInstaller is responsible for install ray on new nodes."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
head_node_ip: str,
|
| 22 |
+
instance_storage: InstanceStorage,
|
| 23 |
+
ray_installer: RayInstaller,
|
| 24 |
+
max_install_attempts: int = 3,
|
| 25 |
+
install_retry_interval: int = 10,
|
| 26 |
+
max_concurrent_installs: int = 50,
|
| 27 |
+
) -> None:
|
| 28 |
+
self._head_node_ip = head_node_ip
|
| 29 |
+
self._instance_storage = instance_storage
|
| 30 |
+
self._ray_installer = ray_installer
|
| 31 |
+
self._max_concurrent_installs = max_concurrent_installs
|
| 32 |
+
self._max_install_attempts = max_install_attempts
|
| 33 |
+
self._install_retry_interval = install_retry_interval
|
| 34 |
+
self._ray_installation_executor = ThreadPoolExecutor(
|
| 35 |
+
max_workers=self._max_concurrent_installs
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
def notify(self, events: List[InstanceUpdateEvent]) -> None:
|
| 39 |
+
for event in events:
|
| 40 |
+
if event.new_instance_status == Instance.ALLOCATED:
|
| 41 |
+
self._install_ray_on_new_nodes(event.instance_id)
|
| 42 |
+
|
| 43 |
+
def _install_ray_on_new_nodes(self, instance_id: str) -> None:
|
| 44 |
+
allocated_instance, _ = self._instance_storage.get_instances(
|
| 45 |
+
instance_ids={instance_id},
|
| 46 |
+
status_filter={Instance.ALLOCATED},
|
| 47 |
+
)
|
| 48 |
+
for instance in allocated_instance.values():
|
| 49 |
+
self._ray_installation_executor.submit(
|
| 50 |
+
self._install_ray_on_single_node, instance
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
def _install_ray_on_single_node(self, instance: Instance) -> None:
|
| 54 |
+
assert instance.status == Instance.ALLOCATED
|
| 55 |
+
success, version = self._instance_storage.upsert_instance(
|
| 56 |
+
instance, expected_instance_version=instance.version
|
| 57 |
+
)
|
| 58 |
+
if not success:
|
| 59 |
+
logger.warning(
|
| 60 |
+
f"Failed to update instance {instance.instance_id} to RAY_INSTALLING"
|
| 61 |
+
)
|
| 62 |
+
# Do not need to handle failures, it will be covered by
|
| 63 |
+
# garbage collection.
|
| 64 |
+
return
|
| 65 |
+
|
| 66 |
+
# install with exponential backoff
|
| 67 |
+
installed = False
|
| 68 |
+
backoff_factor = 1
|
| 69 |
+
for _ in range(self._max_install_attempts):
|
| 70 |
+
installed = self._ray_installer.install_ray(instance, self._head_node_ip)
|
| 71 |
+
if installed:
|
| 72 |
+
break
|
| 73 |
+
logger.warning("Failed to install ray, retrying...")
|
| 74 |
+
time.sleep(self._install_retry_interval * backoff_factor)
|
| 75 |
+
backoff_factor *= 2
|
| 76 |
+
|
| 77 |
+
if not installed:
|
| 78 |
+
instance.status = Instance.RAY_INSTALL_FAILED
|
| 79 |
+
success, version = self._instance_storage.upsert_instance(
|
| 80 |
+
instance,
|
| 81 |
+
expected_instance_version=version,
|
| 82 |
+
)
|
| 83 |
+
else:
|
| 84 |
+
instance.status = Instance.RAY_RUNNING
|
| 85 |
+
success, version = self._instance_storage.upsert_instance(
|
| 86 |
+
instance,
|
| 87 |
+
expected_instance_version=version,
|
| 88 |
+
)
|
| 89 |
+
if not success:
|
| 90 |
+
logger.warning(
|
| 91 |
+
f"Failed to update instance {instance.instance_id} to {instance.status}"
|
| 92 |
+
)
|
| 93 |
+
# Do not need to handle failures, it will be covered by
|
| 94 |
+
# garbage collection.
|
| 95 |
+
return
|
.venv/lib/python3.11/site-packages/ray/util/__init__.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from ray._private.client_mode_hook import client_mode_hook
|
| 5 |
+
from ray._private.auto_init_hook import wrap_auto_init
|
| 6 |
+
from ray._private.services import get_node_ip_address
|
| 7 |
+
from ray.util import iter
|
| 8 |
+
from ray.util import rpdb as pdb
|
| 9 |
+
from ray.util import debugpy as ray_debugpy
|
| 10 |
+
from ray.util.actor_pool import ActorPool
|
| 11 |
+
from ray.util import accelerators
|
| 12 |
+
from ray.util.annotations import PublicAPI
|
| 13 |
+
from ray.util.check_serialize import inspect_serializability
|
| 14 |
+
from ray.util.client_connect import connect, disconnect
|
| 15 |
+
from ray.util.debug import disable_log_once_globally, enable_periodic_logging, log_once
|
| 16 |
+
from ray.util.placement_group import (
|
| 17 |
+
get_current_placement_group,
|
| 18 |
+
get_placement_group,
|
| 19 |
+
placement_group,
|
| 20 |
+
placement_group_table,
|
| 21 |
+
remove_placement_group,
|
| 22 |
+
)
|
| 23 |
+
from ray.util.serialization import deregister_serializer, register_serializer
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@PublicAPI(stability="beta")
|
| 27 |
+
@wrap_auto_init
|
| 28 |
+
@client_mode_hook
|
| 29 |
+
def list_named_actors(all_namespaces: bool = False) -> List[str]:
|
| 30 |
+
"""List all named actors in the system.
|
| 31 |
+
|
| 32 |
+
Actors must have been created with Actor.options(name="name").remote().
|
| 33 |
+
This works for both detached & non-detached actors.
|
| 34 |
+
|
| 35 |
+
By default, only actors in the current namespace will be returned
|
| 36 |
+
and the returned entries will simply be their name.
|
| 37 |
+
|
| 38 |
+
If `all_namespaces` is set to True, all actors in the cluster will be
|
| 39 |
+
returned regardless of namespace, and the returned entries will be of the
|
| 40 |
+
form {"namespace": namespace, "name": name}.
|
| 41 |
+
"""
|
| 42 |
+
worker = ray._private.worker.global_worker
|
| 43 |
+
worker.check_connected()
|
| 44 |
+
|
| 45 |
+
actors = worker.core_worker.list_named_actors(all_namespaces)
|
| 46 |
+
if all_namespaces:
|
| 47 |
+
return [{"name": name, "namespace": namespace} for namespace, name in actors]
|
| 48 |
+
else:
|
| 49 |
+
return [name for _, name in actors]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
__all__ = [
|
| 53 |
+
"accelerators",
|
| 54 |
+
"ActorPool",
|
| 55 |
+
"disable_log_once_globally",
|
| 56 |
+
"enable_periodic_logging",
|
| 57 |
+
"iter",
|
| 58 |
+
"log_once",
|
| 59 |
+
"pdb",
|
| 60 |
+
"placement_group",
|
| 61 |
+
"placement_group_table",
|
| 62 |
+
"get_placement_group",
|
| 63 |
+
"get_current_placement_group",
|
| 64 |
+
"get_node_ip_address",
|
| 65 |
+
"remove_placement_group",
|
| 66 |
+
"ray_debugpy",
|
| 67 |
+
"inspect_serializability",
|
| 68 |
+
"collective",
|
| 69 |
+
"connect",
|
| 70 |
+
"disconnect",
|
| 71 |
+
"register_serializer",
|
| 72 |
+
"deregister_serializer",
|
| 73 |
+
"list_named_actors",
|
| 74 |
+
]
|
.venv/lib/python3.11/site-packages/ray/util/actor_group.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import weakref
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, TypeVar, Optional, Dict, Type, Tuple
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray.actor import ActorHandle
|
| 8 |
+
from ray.util.annotations import Deprecated
|
| 9 |
+
from ray._private.utils import get_ray_doc_version
|
| 10 |
+
|
| 11 |
+
T = TypeVar("T")
|
| 12 |
+
ActorMetadata = TypeVar("ActorMetadata")
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
|
| 18 |
+
class ActorWrapper:
|
| 19 |
+
"""Class containing an actor and its metadata."""
|
| 20 |
+
|
| 21 |
+
actor: ActorHandle
|
| 22 |
+
metadata: ActorMetadata
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class ActorConfig:
|
| 27 |
+
num_cpus: float
|
| 28 |
+
num_gpus: float
|
| 29 |
+
resources: Optional[Dict[str, float]]
|
| 30 |
+
init_args: Tuple
|
| 31 |
+
init_kwargs: Dict
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ActorGroupMethod:
    """Callable proxy that invokes one method on every actor in a group.

    Produced by ``ActorGroup.__getattr__``. Holds only a *weak* reference to
    the owning group so that a lingering method proxy does not keep the
    group (and its actors) alive.
    """

    def __init__(self, actor_group: "ActorGroup", method_name: str):
        # Weak reference: the proxy must not extend the group's lifetime.
        self.actor_group = weakref.ref(actor_group)
        self._method_name = method_name

    def __call__(self, *args, **kwargs):
        """Direct calls are disallowed; mirrors Ray actor-method semantics."""
        raise TypeError(
            "ActorGroup methods cannot be called directly. "
            "Instead "
            f"of running 'object.{self._method_name}()', try "
            f"'object.{self._method_name}.remote()'."
        )

    def remote(self, *args, **kwargs):
        """Invoke the method on every actor in the group.

        Returns:
            One ObjectRef per actor, in group order.

        Raises:
            RuntimeError: if the owning ActorGroup has been garbage
                collected. (Previously this path crashed with
                ``AttributeError`` because the dead weakref returned None.)
        """
        actor_group = self.actor_group()
        if actor_group is None:
            raise RuntimeError(
                "The ActorGroup backing this method no longer exists."
            )
        return [
            getattr(a.actor, self._method_name).remote(*args, **kwargs)
            for a in actor_group.actors
        ]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@Deprecated(
    message="For stateless/task processing, use ray.util.multiprocessing, see details "
    f"in https://docs.ray.io/en/{get_ray_doc_version()}/ray-more-libs/multiprocessing.html. "  # noqa: E501
    "For stateful/actor processing such as batch prediction, use "
    "Datasets.map_batches(compute=ActorPoolStrategy, ...), see details in "
    f"https://docs.ray.io/en/{get_ray_doc_version()}/data/api/dataset.html#ray.data.Dataset.map_batches.",  # noqa: E501
    warning=True,
)
class ActorGroup:
    """Group of Ray Actors that can execute arbitrary functions.

    ``ActorGroup`` launches Ray actors according to the given
    specification. It can then execute arbitrary Python functions in each of
    these actors.

    If not enough resources are available to launch the actors, the Ray
    cluster will automatically scale up if autoscaling is enabled.

    Args:
        actor_cls: The class to use as the remote actors.
        num_actors: The number of the provided Ray actors to
            launch. Defaults to 1.
        num_cpus_per_actor: The number of CPUs to reserve for each
            actor. Fractional values are allowed. Defaults to 1.
        num_gpus_per_actor: The number of GPUs to reserve for each
            actor. Fractional values are allowed. Defaults to 0.
        resources_per_actor (Optional[Dict[str, float]]):
            Dictionary specifying the resources that will be
            requested for each actor in addition to ``num_cpus_per_actor``
            and ``num_gpus_per_actor``.
        init_args, init_kwargs: If ``actor_cls`` is provided,
            these args will be used for the actor initialization.

    """

    def __init__(
        self,
        actor_cls: Type,
        num_actors: int = 1,
        num_cpus_per_actor: float = 1,
        num_gpus_per_actor: float = 0,
        resources_per_actor: Optional[Dict[str, float]] = None,
        init_args: Optional[Tuple] = None,
        init_kwargs: Optional[Dict] = None,
    ):
        # Imported at call time rather than module scope — presumably to
        # avoid an import cycle; confirm against ray._private.usage.
        from ray._private.usage.usage_lib import record_library_usage

        record_library_usage("util.ActorGroup")

        # Validate sizing arguments before creating any remote state.
        if num_actors <= 0:
            raise ValueError(
                "The provided `num_actors` must be greater "
                f"than 0. Received num_actors={num_actors} "
                f"instead."
            )
        if num_cpus_per_actor < 0 or num_gpus_per_actor < 0:
            raise ValueError(
                "The number of CPUs and GPUs per actor must "
                "not be negative. Received "
                f"num_cpus_per_actor={num_cpus_per_actor} and "
                f"num_gpus_per_actor={num_gpus_per_actor}."
            )

        # List of ActorWrapper objects; emptied by shutdown().
        self.actors = []

        # Target size of the group, consumed by start().
        self.num_actors = num_actors

        # Per-actor resource and constructor settings shared by all actors.
        self.actor_config = ActorConfig(
            num_cpus=num_cpus_per_actor,
            num_gpus=num_gpus_per_actor,
            resources=resources_per_actor,
            init_args=init_args or (),
            init_kwargs=init_kwargs or {},
        )

        # Remote-decorated version of the user's class, carrying the
        # requested per-actor resources.
        self._remote_cls = ray.remote(
            num_cpus=self.actor_config.num_cpus,
            num_gpus=self.actor_config.num_gpus,
            resources=self.actor_config.resources,
        )(actor_cls)

        # Actors are launched eagerly on construction.
        self.start()

    def __getattr__(self, item):
        # Only reached for names not found on the instance/class, i.e.
        # method names to be fanned out to the actors as remote calls.
        if len(self.actors) == 0:
            raise RuntimeError(
                "This ActorGroup has been shutdown. Please start it again."
            )
        # Same implementation as actor.py
        return ActorGroupMethod(self, item)

    def __len__(self):
        # Number of live actors currently in the group.
        return len(self.actors)

    def __getitem__(self, item):
        # Index into the group; yields the ActorWrapper at that position.
        return self.actors[item]

    def start(self):
        """Starts all the actors in this actor group."""
        # Refuse to start twice; callers must shutdown() first.
        if self.actors and len(self.actors) > 0:
            raise RuntimeError(
                "The actors have already been started. "
                "Please call `shutdown` first if you want to "
                "restart them."
            )

        logger.debug(f"Starting {self.num_actors} actors.")
        self.add_actors(self.num_actors)
        logger.debug(f"{len(self.actors)} actors have successfully started.")

    def shutdown(self, patience_s: float = 5):
        """Shutdown all the actors in this actor group.

        Args:
            patience_s: Attempt a graceful shutdown
                of the actors for this many seconds. Fallback to force kill
                if graceful shutdown is not complete after this time. If
                this is less than or equal to 0, immediately force kill all
                actors.
        """
        logger.debug(f"Shutting down {len(self.actors)} actors.")
        if patience_s <= 0:
            # No grace period requested: force kill straight away.
            for actor in self.actors:
                ray.kill(actor.actor)
        else:
            # Ask each actor to terminate itself, then give the whole batch
            # up to patience_s seconds to exit.
            done_refs = [w.actor.__ray_terminate__.remote() for w in self.actors]
            # Wait for actors to die gracefully.
            done, not_done = ray.wait(done_refs, timeout=patience_s)
            if not_done:
                logger.debug("Graceful termination failed. Falling back to force kill.")
                # If all actors are not able to die gracefully, then kill them.
                for actor in self.actors:
                    ray.kill(actor.actor)

        logger.debug("Shutdown successful.")
        self.actors = []

    def remove_actors(self, actor_indexes: List[int]):
        """Removes the actors with the specified indexes.

        Note: this only drops the handles from the group's bookkeeping; it
        does not kill or terminate the underlying actors.

        Args:
            actor_indexes (List[int]): The indexes of the actors to remove.
        """
        new_actors = []
        for i in range(len(self.actors)):
            if i not in actor_indexes:
                new_actors.append(self.actors[i])
        self.actors = new_actors

    def add_actors(self, num_actors: int):
        """Adds ``num_actors`` to this ActorGroup.

        Args:
            num_actors: The number of actors to add.
        """
        new_actors = []
        new_actor_metadata = []
        for _ in range(num_actors):
            actor = self._remote_cls.remote(
                *self.actor_config.init_args, **self.actor_config.init_kwargs
            )
            new_actors.append(actor)
            # Actors may optionally expose a `get_actor_metadata` method;
            # if present, fetch its value asynchronously.
            if hasattr(actor, "get_actor_metadata"):
                new_actor_metadata.append(actor.get_actor_metadata.remote())

        # Get metadata from all actors.
        metadata = ray.get(new_actor_metadata)

        # No actor reported metadata: pad with None so the indexed pairing
        # below still lines up.
        if len(metadata) == 0:
            metadata = [None] * len(new_actors)

        for i in range(len(new_actors)):
            self.actors.append(ActorWrapper(actor=new_actors[i], metadata=metadata[i]))

    @property
    def actor_metadata(self):
        # Metadata reported by each actor, in group order.
        return [a.metadata for a in self.actors]
|
.venv/lib/python3.11/site-packages/ray/util/actor_pool.py
ADDED
|
@@ -0,0 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import TYPE_CHECKING, Any, Callable, List, TypeVar
|
| 2 |
+
|
| 3 |
+
import ray
|
| 4 |
+
from ray.util.annotations import DeveloperAPI
|
| 5 |
+
|
| 6 |
+
if TYPE_CHECKING:
|
| 7 |
+
import ray.actor
|
| 8 |
+
|
| 9 |
+
V = TypeVar("V")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@DeveloperAPI
|
| 13 |
+
class ActorPool:
    """Utility class to operate on a fixed pool of actors.

    Arguments:
        actors: List of Ray actor handles to use in this pool.

    Examples:
        .. testcode::

            import ray
            from ray.util.actor_pool import ActorPool

            @ray.remote
            class Actor:
                def double(self, v):
                    return 2 * v

            a1, a2 = Actor.remote(), Actor.remote()
            pool = ActorPool([a1, a2])
            print(list(pool.map(lambda a, v: a.double.remote(v),
                                [1, 2, 3, 4])))

        .. testoutput::

            [2, 4, 6, 8]
    """

    def __init__(self, actors: list):
        from ray._private.usage.usage_lib import record_library_usage

        record_library_usage("util.ActorPool")

        # Actors not currently running a task.
        self._idle_actors = list(actors)

        # Maps an in-flight future (tuple key if it is a list of refs)
        # -> (submission index, actor running the task).
        self._future_to_actor = {}

        # Maps submission index -> future, for ordered retrieval.
        self._index_to_future = {}

        # Index assigned to the next submitted task.
        self._next_task_index = 0

        # Index of the next task whose result get_next() should return.
        self._next_return_index = 0

        # (fn, value) pairs waiting for an actor to become free.
        self._pending_submits = []

    def map(self, fn: "Callable[[ray.actor.ActorHandle, V], Any]", values: "List[V]"):
        """Apply the given function in parallel over the actors and values.

        Returns an *ordered* iterator over the results; you must iterate it
        to drive the computation.

        Arguments:
            fn: Function taking ``(actor, value)`` and returning an
                ObjectRef. The actor is considered busy until the ref
                completes.
            values: List of values that ``fn(actor, value)`` should be
                applied to.

        Returns:
            Iterator over results from applying fn to the actors and values.
        """
        # Discard any results left over from previous submissions so the
        # iterator below yields results only for `values`. Tasks that are
        # not yet done are ignored (actor reclaimed, result dropped).
        while self.has_next():
            try:
                self.get_next(timeout=0, ignore_if_timedout=True)
            except TimeoutError:
                pass

        for v in values:
            self.submit(fn, v)

        def get_generator():
            while self.has_next():
                yield self.get_next()

        return get_generator()

    def map_unordered(
        self, fn: "Callable[[ray.actor.ActorHandle, V], Any]", values: "List[V]"
    ):
        """Similar to map(), but returns results in completion order.

        Can be more efficient than map() when some results take longer to
        compute than others.

        Arguments:
            fn: Function taking ``(actor, value)`` and returning an
                ObjectRef. The actor is considered busy until the ref
                completes.
            values: List of values that ``fn(actor, value)`` should be
                applied to.

        Returns:
            Unordered iterator over results.
        """
        # Drain leftovers from previous submissions. NOTE: with timeout=0
        # this spins until each leftover task finishes (pre-existing
        # behavior; there is no "ignore" path for unordered retrieval).
        while self.has_next():
            try:
                self.get_next_unordered(timeout=0)
            except TimeoutError:
                pass

        for v in values:
            self.submit(fn, v)

        def get_generator():
            while self.has_next():
                yield self.get_next_unordered()

        return get_generator()

    def submit(self, fn, value):
        """Schedule a single task to run in the pool.

        Same argument semantics as map(), but for a single value. Retrieve
        the result with get_next() / get_next_unordered(). If no actor is
        currently idle, the task is queued until one is returned.

        Arguments:
            fn: Function taking ``(actor, value)`` and returning an
                ObjectRef.
            value: Value to compute a result for.
        """
        if self._idle_actors:
            actor = self._idle_actors.pop()
            future = fn(actor, value)
            # A list of refs is keyed by an equivalent tuple so it is
            # hashable as a dict key.
            future_key = tuple(future) if isinstance(future, list) else future
            self._future_to_actor[future_key] = (self._next_task_index, actor)
            self._index_to_future[self._next_task_index] = future
            self._next_task_index += 1
        else:
            self._pending_submits.append((fn, value))

    def has_next(self):
        """Returns whether there are any pending results to return."""
        return bool(self._future_to_actor)

    def get_next(self, timeout=None, ignore_if_timedout=False):
        """Returns the next pending result in order.

        Blocks for up to ``timeout`` seconds until the result is available.

        Returns:
            The next result.

        Raises:
            StopIteration: if there are no pending results.
            ValueError: if called after get_next_unordered() already
                consumed a later-indexed result.
            TimeoutError: if the timeout is reached. When
                ``ignore_if_timedout`` is True the timed-out task is first
                discarded and its actor returned to the pool.
        """
        if not self.has_next():
            raise StopIteration("No more results to get")
        if self._next_return_index >= self._next_task_index:
            raise ValueError(
                "It is not allowed to call get_next() after get_next_unordered()."
            )
        future = self._index_to_future[self._next_return_index]
        timeout_msg = "Timed out waiting for result"
        raise_timeout_after_ignore = False
        if timeout is not None:
            res, _ = ray.wait([future], timeout=timeout)
            if not res:
                if not ignore_if_timedout:
                    raise TimeoutError(timeout_msg)
                else:
                    # Fall through to discard the task before raising.
                    raise_timeout_after_ignore = True
        del self._index_to_future[self._next_return_index]
        self._next_return_index += 1

        future_key = tuple(future) if isinstance(future, list) else future
        i, a = self._future_to_actor.pop(future_key)

        self._return_actor(a)
        if raise_timeout_after_ignore:
            raise TimeoutError(
                timeout_msg + ". The task {} has been ignored.".format(future)
            )
        return ray.get(future)

    def get_next_unordered(self, timeout=None, ignore_if_timedout=False):
        """Returns any of the next pending results.

        Blocks for up to ``timeout`` seconds until some result is available.
        Unlike get_next(), results are not necessarily returned in
        submission order, which can improve performance.

        Arguments:
            timeout: Seconds to wait for a result, or None to block.
            ignore_if_timedout: Accepted for interface compatibility with
                get_next(); on timeout no specific task has completed, so
                there is nothing to ignore and a TimeoutError is raised
                regardless.

        Returns:
            The next result.

        Raises:
            StopIteration: if there are no pending results.
            TimeoutError: if the timeout is reached.
        """
        if not self.has_next():
            raise StopIteration("No more results to get")
        # TODO(ekl) bulk wait for performance
        res, _ = ray.wait(list(self._future_to_actor), num_returns=1, timeout=timeout)
        if not res:
            # BUGFIX: the previous implementation fell through here with
            # `future` unbound when ignore_if_timedout=True, crashing with
            # UnboundLocalError instead of raising TimeoutError.
            raise TimeoutError("Timed out waiting for result")
        [future] = res
        i, a = self._future_to_actor.pop(future)
        self._return_actor(a)
        del self._index_to_future[i]
        # Keep get_next()'s cursor consistent with out-of-order consumption.
        self._next_return_index = max(self._next_return_index, i + 1)
        return ray.get(future)

    def _return_actor(self, actor):
        # Mark the actor idle, then immediately hand it the oldest queued
        # submission, if any.
        self._idle_actors.append(actor)
        if self._pending_submits:
            self.submit(*self._pending_submits.pop(0))

    def has_free(self):
        """Returns whether there are any idle actors available.

        Returns:
            True if there is at least one idle actor and no queued submits.
        """
        return len(self._idle_actors) > 0 and len(self._pending_submits) == 0

    def pop_idle(self):
        """Removes an idle actor from the pool.

        Returns:
            An idle actor if one is available, else None.
        """
        if self.has_free():
            return self._idle_actors.pop()
        return None

    def push(self, actor):
        """Pushes a new actor into the current list of idle actors.

        Raises:
            ValueError: if the actor already belongs to this pool (idle or
                busy).
        """
        busy_actors = []
        if self._future_to_actor.values():
            _, busy_actors = zip(*self._future_to_actor.values())
        if actor in self._idle_actors or actor in busy_actors:
            raise ValueError("Actor already belongs to current ActorPool")
        else:
            self._return_actor(actor)
|
.venv/lib/python3.11/site-packages/ray/util/check_open_ports.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""A CLI utility for check open ports in the Ray cluster.
|
| 2 |
+
|
| 3 |
+
See https://www.anyscale.com/blog/update-on-ray-cve-2023-48022-new-verification-tooling-available # noqa: E501
|
| 4 |
+
for more details.
|
| 5 |
+
"""
|
| 6 |
+
import json
import subprocess
import urllib
import urllib.request
from typing import List, Tuple

import click
import psutil

import ray
from ray.autoscaler._private.cli_logger import add_click_logging_options, cli_logger
from ray.autoscaler._private.constants import RAY_PROCESSES
from ray.util.annotations import PublicAPI
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _get_ray_ports() -> List[int]:
    """Return a sorted list of the ports Ray processes on this node listen on.

    Scans all local processes, keeps those whose name or command line
    matches a known Ray process pattern (``RAY_PROCESSES``), and collects
    the local port of every LISTEN-state connection they hold.
    """
    unique_ports = set()

    # Snapshot (proc, name, cmdline) up front; processes may disappear
    # while we iterate, which psutil surfaces as psutil.Error.
    process_infos = []
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            process_infos.append((proc, proc.name(), proc.cmdline()))
        except psutil.Error:
            pass

    for keyword, filter_by_cmd in RAY_PROCESSES:
        for candidate in process_infos:
            proc, proc_cmd, proc_args = candidate
            # Match against the bare process name or the full command line,
            # depending on how the pattern is declared.
            corpus = proc_cmd if filter_by_cmd else subprocess.list2cmdline(proc_args)
            if keyword in corpus:
                try:
                    for connection in proc.connections():
                        if connection.status == psutil.CONN_LISTEN:
                            unique_ports.add(connection.laddr.port)
                except psutil.AccessDenied:
                    cli_logger.info(
                        "Access denied to process connections for process,"
                        " worker process probably restarted",
                        proc,
                    )
                except psutil.NoSuchProcess:
                    # The process exited between enumeration and inspection.
                    # Previously this exception escaped and aborted the
                    # whole port scan; just skip the dead process.
                    pass

    return sorted(unique_ports)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _check_for_open_ports_from_internet(
    service_url: str, ports: List[int]
) -> Tuple[List[int], List[int]]:
    """Ask the open-port-checker service which of ``ports`` are reachable.

    POSTs the candidate ports as JSON to ``service_url``; the service
    attempts to connect back to this node's public address on each port.

    Returns:
        Tuple ``(publicly_open_ports, checked_ports)`` as reported by the
        service.

    Raises:
        RuntimeError: if the service responds with a non-200 status.
    """
    # BUGFIX: the module only does `import urllib`, which does not import
    # the `urllib.request` submodule; import it explicitly so the
    # attribute access below cannot fail.
    import urllib.request

    request = urllib.request.Request(
        method="POST",
        url=service_url,
        headers={
            "Content-Type": "application/json",
            # Marker header so the service can identify this tool's traffic.
            "X-Ray-Open-Port-Check": "1",
        },
        data=json.dumps({"ports": ports}).encode("utf-8"),
    )

    response = urllib.request.urlopen(request)
    if response.status != 200:
        raise RuntimeError(
            f"Failed to check with Ray Open Port Service: {response.status}"
        )
    response_body = json.load(response)

    publicly_open_ports = response_body.get("open_ports", [])
    checked_ports = response_body.get("checked_ports", [])

    return publicly_open_ports, checked_ports
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _check_if_exposed_to_internet(
    service_url: str,
) -> Tuple[List[int], List[int]]:
    """Check whether this node's Ray listening ports are internet-reachable.

    Convenience wrapper: gathers the local Ray ports and submits them to
    the external open-port-checker service.

    Returns:
        Tuple ``(publicly_open_ports, checked_ports)``.
    """
    return _check_for_open_ports_from_internet(service_url, _get_ray_ports())
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _check_ray_cluster(
    service_url: str,
) -> List[Tuple[str, Tuple[List[int], List[int]]]]:
    """Run the open-port check on every alive node of the local Ray cluster.

    Schedules one zero-CPU checking task per node, pinned to that node with
    a hard (non-soft) node-affinity strategy, and gathers the results.

    Returns:
        List of ``(node_id, (open_ports, checked_ports))`` tuples. Nodes
        whose check task failed are logged and omitted from the result.
    """
    ray.init(ignore_reinit_error=True)

    @ray.remote(num_cpus=0)
    def check(node_id, service_url):
        # Runs on the target node: inspect its local Ray ports and query
        # the external checker service.
        return node_id, _check_if_exposed_to_internet(service_url)

    ray_node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"]]
    cli_logger.info(
        f"Cluster has {len(ray_node_ids)} node(s)."
        " Scheduling tasks on each to check for exposed ports",
    )

    # One task per node; soft=False so the task fails rather than being
    # rescheduled onto a different node.
    per_node_tasks = {
        node_id: (
            check.options(
                scheduling_strategy=NodeAffinitySchedulingStrategy(
                    node_id=node_id, soft=False
                )
            ).remote(node_id, service_url)
        )
        for node_id in ray_node_ids
    }

    results = []
    for node_id, per_node_task in per_node_tasks.items():
        try:
            results.append(ray.get(per_node_task))
        except Exception as e:
            # Best effort: a dead or unreachable node must not abort the
            # whole cluster check.
            cli_logger.info(f"Failed to check on node {node_id}: {e}")

    return results
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@click.command()
@click.option(
    "--yes", "-y", is_flag=True, default=False, help="Don't ask for confirmation."
)
@click.option(
    "--service-url",
    required=False,
    type=str,
    default="https://ray-open-port-checker.uc.r.appspot.com/open-port-check",
    help="The url of service that checks whether submitted ports are open.",
)
@add_click_logging_options
@PublicAPI
def check_open_ports(yes, service_url):
    """Check open ports in the local Ray cluster.

    Asks for confirmation (unless --yes), runs the per-node open-port
    check across the cluster, reports each node's result, and prints a
    prominent security warning if any node was reachable from the
    internet on a Ray port.
    """
    if not cli_logger.confirm(
        yes=yes,
        msg=(
            "Do you want to check the local Ray cluster"
            " for any nodes with ports accessible to the internet?"
        ),
        _default=True,
    ):
        cli_logger.info("Exiting without checking as instructed")
        return

    cluster_open_ports = _check_ray_cluster(service_url)

    # Collect the nodes with at least one publicly reachable port while
    # logging a per-node status line for every node.
    public_nodes = []
    for node_id, (open_ports, checked_ports) in cluster_open_ports:
        if open_ports:
            cli_logger.info(
                f"[🛑] open ports detected open_ports={open_ports!r} node={node_id!r}"
            )
            public_nodes.append((node_id, open_ports, checked_ports))
        else:
            cli_logger.info(
                "[🟢] No open ports detected "
                f"checked_ports={checked_ports!r} node={node_id!r}"
            )

    cli_logger.info("Check complete, results:")

    if public_nodes:
        # BUG FIX: corrected the grammatical error "An server" in the
        # user-facing warning message.
        cli_logger.info(
            """
[🛑] A server on the internet was able to open a connection to one of this Ray
cluster's public IP on one of Ray's internal ports. If this is not a false
positive, this is an extremely unsafe configuration for Ray to be running in.
Ray is not meant to be exposed to untrusted clients and will allow them to run
arbitrary code on your machine.

You should take immediate action to validate this result and if confirmed shut
down your Ray cluster immediately and take appropriate action to remediate its
exposure. Anything either running on this Ray cluster or that this cluster has
had access to could be at risk.

For guidance on how to operate Ray safely, please review [Ray's security
documentation](https://docs.ray.io/en/latest/ray-security/index.html).
""".strip()
        )
    else:
        cli_logger.info("[🟢] No open ports detected from any Ray nodes")
|
.venv/lib/python3.11/site-packages/ray/util/check_serialize.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""A utility for debugging serialization issues."""
|
| 2 |
+
import inspect
|
| 3 |
+
from contextlib import contextmanager
|
| 4 |
+
from typing import Any, Optional, Set, Tuple
|
| 5 |
+
|
| 6 |
+
# Import ray first to use the bundled colorama
|
| 7 |
+
import ray # noqa: F401
|
| 8 |
+
import colorama
|
| 9 |
+
import ray.cloudpickle as cp
|
| 10 |
+
from ray.util.annotations import DeveloperAPI
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@contextmanager
|
| 14 |
+
def _indent(printer):
|
| 15 |
+
printer.level += 1
|
| 16 |
+
yield
|
| 17 |
+
printer.level -= 1
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class _Printer:
|
| 21 |
+
def __init__(self, print_file):
|
| 22 |
+
self.level = 0
|
| 23 |
+
self.print_file = print_file
|
| 24 |
+
|
| 25 |
+
def indent(self):
|
| 26 |
+
return _indent(self)
|
| 27 |
+
|
| 28 |
+
def print(self, msg):
|
| 29 |
+
indent = " " * self.level
|
| 30 |
+
print(indent + msg, file=self.print_file)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@DeveloperAPI
class FailureTuple:
    """One frame of a failed serialization walk.

    Attributes:
        obj: The object that fails serialization.
        name: The variable name of the object.
        parent: The object that references the `obj`.
    """

    def __init__(self, obj: Any, name: str, parent: Any):
        self.obj = obj
        self.name = name
        self.parent = parent

    def __repr__(self):
        return "FailTuple({} [obj={}, parent={}])".format(
            self.name, self.obj, self.parent
        )
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _inspect_func_serialization(base_obj, depth, parent, failure_set, printer):
    """Adds the first-found non-serializable element to the failure_set.

    Function path of the serializability walk: inspects the closure of
    ``base_obj`` (referenced globals first, then nonlocals) and recursively
    checks each captured object, stopping at the first failure.

    Args:
        base_obj: The function whose closure is inspected.
        depth: Remaining recursion budget; decremented on each descent.
        parent: The object that references ``base_obj`` (for reporting).
        failure_set: Shared accumulator of FailureTuple entries.
        printer: _Printer used for indented progress output.

    Returns:
        True if a non-serializable element was found, else False.
    """
    assert inspect.isfunction(base_obj)
    closure = inspect.getclosurevars(base_obj)
    found = False
    if closure.globals:
        printer.print(
            f"Detected {len(closure.globals)} global variables. "
            "Checking serializability..."
        )

        with printer.indent():
            for name, obj in closure.globals.items():
                serializable, _ = _inspect_serializability(
                    obj,
                    name=name,
                    depth=depth - 1,
                    parent=parent,
                    failure_set=failure_set,
                    printer=printer,
                )
                found = found or not serializable
                if found:
                    # Stop at the first failing global.
                    break

    if closure.nonlocals:
        printer.print(
            f"Detected {len(closure.nonlocals)} nonlocal variables. "
            "Checking serializability..."
        )
        with printer.indent():
            for name, obj in closure.nonlocals.items():
                serializable, _ = _inspect_serializability(
                    obj,
                    name=name,
                    depth=depth - 1,
                    parent=parent,
                    failure_set=failure_set,
                    printer=printer,
                )
                found = found or not serializable
                if found:
                    # Stop at the first failing nonlocal.
                    break
    if not found:
        printer.print(
            f"WARNING: Did not find non-serializable object in {base_obj}. "
            "This may be an oversight."
        )
    return found
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _inspect_generic_serialization(base_obj, depth, parent, failure_set, printer):
    """Adds the first-found non-serializable element to the failure_set.

    Generic (non-function) path of the serializability walk: first recurses
    into the object's function members, then into its remaining members,
    skipping dunder attributes and builtins, stopping at the first failure.

    Args:
        base_obj: The (non-function) object to inspect.
        depth: Remaining recursion budget; decremented on each descent.
        parent: The object that references ``base_obj`` (for reporting).
        failure_set: Shared accumulator of FailureTuple entries.
        printer: _Printer used for indented progress output.

    Returns:
        True if a non-serializable element was found, else False.
    """
    assert not inspect.isfunction(base_obj)
    functions = inspect.getmembers(base_obj, predicate=inspect.isfunction)
    found = False
    with printer.indent():
        for name, obj in functions:
            serializable, _ = _inspect_serializability(
                obj,
                name=name,
                depth=depth - 1,
                parent=parent,
                failure_set=failure_set,
                printer=printer,
            )
            found = found or not serializable
            if found:
                break

    with printer.indent():
        members = inspect.getmembers(base_obj)
        for name, obj in members:
            # Skip dunders and builtins: they are interpreter-provided and
            # not meaningful serialization culprits.
            if name.startswith("__") and name.endswith("__") or inspect.isbuiltin(obj):
                continue
            serializable, _ = _inspect_serializability(
                obj,
                name=name,
                depth=depth - 1,
                parent=parent,
                failure_set=failure_set,
                printer=printer,
            )
            found = found or not serializable
            if found:
                break
    if not found:
        printer.print(
            f"WARNING: Did not find non-serializable object in {base_obj}. "
            "This may be an oversight."
        )
    return found
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
@DeveloperAPI
def inspect_serializability(
    base_obj: Any,
    name: Optional[str] = None,
    depth: int = 3,
    print_file: Optional[Any] = None,
) -> Tuple[bool, Set[FailureTuple]]:
    """Identifies what objects are preventing serialization.

    Args:
        base_obj: Object to be serialized.
        name: Optional name of string.
        depth: Depth of the scope stack to walk through. Defaults to 3.
        print_file: file argument that will be passed to print().

    Returns:
        bool: True if serializable.
        set[FailureTuple]: Set of unserializable objects.

    .. versionadded:: 1.1.0

    """
    printer = _Printer(print_file)
    # Top-level call: parent and failure_set start as None; the recursive
    # helper creates the shared failure set and prints the banner.
    return _inspect_serializability(base_obj, name, depth, None, None, printer)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def _inspect_serializability(
    base_obj, name, depth, parent, failure_set, printer
) -> Tuple[bool, Set[FailureTuple]]:
    """Recursive core of ``inspect_serializability``.

    Attempts to cloudpickle ``base_obj``; on failure, records the object (at
    depth 0) and recurses into its closure or members to locate the actual
    non-serializable culprit. The outermost call (``failure_set is None``)
    also prints the banner and the final summary.

    Returns:
        Tuple of (serializable, failure_set).
    """
    colorama.init()
    top_level = False
    declaration = ""
    found = False
    if failure_set is None:
        # First (outermost) invocation: create the shared failure set and
        # print the header banner.
        top_level = True
        failure_set = set()
        declaration = f"Checking Serializability of {base_obj}"
        printer.print("=" * min(len(declaration), 80))
        printer.print(declaration)
        printer.print("=" * min(len(declaration), 80))

    if name is None:
        name = str(base_obj)
    else:
        printer.print(f"Serializing '{name}' {base_obj}...")
    try:
        # Fast path: if cloudpickle succeeds there is nothing to report.
        cp.dumps(base_obj)
        return True, failure_set
    except Exception as e:
        printer.print(
            f"{colorama.Fore.RED}!!! FAIL{colorama.Fore.RESET} " f"serialization: {e}"
        )
        found = True
        try:
            if depth == 0:
                failure_set.add(FailureTuple(base_obj, name, parent))
        # Some objects may not be hashable, so we skip adding this to the set.
        except Exception:
            pass

    if depth <= 0:
        # Recursion budget exhausted: report failure without descending.
        return False, failure_set

    # TODO: we only differentiate between 'function' and 'object'
    # but we should do a better job of diving into something
    # more specific like a Type, Object, etc.
    if inspect.isfunction(base_obj):
        _inspect_func_serialization(
            base_obj,
            depth=depth,
            parent=base_obj,
            failure_set=failure_set,
            printer=printer,
        )
    else:
        _inspect_generic_serialization(
            base_obj,
            depth=depth,
            parent=base_obj,
            failure_set=failure_set,
            printer=printer,
        )

    if not failure_set:
        # Serialization failed but no member was pinpointed: blame the
        # object itself so the caller still gets a concrete report.
        failure_set.add(FailureTuple(base_obj, name, parent))

    if top_level:
        printer.print("=" * min(len(declaration), 80))
        if not failure_set:
            printer.print(
                "Nothing failed the inspect_serialization test, though "
                "serialization did not succeed."
            )
        else:
            fail_vars = (
                f"\n\n\t{colorama.Style.BRIGHT}"
                + "\n".join(str(k) for k in failure_set)
                + f"{colorama.Style.RESET_ALL}\n\n"
            )
            printer.print(
                f"Variable: {fail_vars}was found to be non-serializable. "
                "There may be multiple other undetected variables that were "
                "non-serializable. "
            )
            printer.print(
                "Consider either removing the "
                "instantiation/imports of these variables or moving the "
                "instantiation into the scope of the function/class. "
            )
        printer.print("=" * min(len(declaration), 80))
        printer.print(
            "Check https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting for more information."  # noqa
        )
        printer.print(
            "If you have any suggestions on how to improve "
            "this error message, please reach out to the "
            "Ray developers on github.com/ray-project/ray/issues/"
        )
        printer.print("=" * min(len(declaration), 80))
    return not found, failure_set
|
.venv/lib/python3.11/site-packages/ray/util/client_connect.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
from ray._private.client_mode_hook import (
|
| 5 |
+
_explicitly_enable_client_mode,
|
| 6 |
+
_set_client_hook_status,
|
| 7 |
+
)
|
| 8 |
+
from ray.job_config import JobConfig
|
| 9 |
+
from ray.util.annotations import Deprecated
|
| 10 |
+
from ray.util.client import ray
|
| 11 |
+
from ray._private.utils import get_ray_doc_version
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@Deprecated(
    message="Use ray.init(ray://<head_node_ip_address>:<ray_client_server_port>) "
    "instead. See detailed usage at {}.".format(
        f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-init"  # noqa: E501
    )
)
def connect(
    conn_str: str,
    secure: bool = False,
    metadata: List[Tuple[str, str]] = None,
    connection_retries: int = 3,
    job_config: JobConfig = None,
    namespace: str = None,
    *,
    ignore_version: bool = False,
    _credentials: Optional["grpc.ChannelCredentials"] = None,  # noqa: F821
    ray_init_kwargs: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Connect to a remote Ray cluster through the Ray client.

    Args:
        conn_str: Address of the Ray client server, e.g. "host:port".
        secure: Whether to use a TLS-secured gRPC channel.
        metadata: gRPC metadata (key, value) pairs to send with requests.
        connection_retries: Number of connection attempts before giving up.
        job_config: Job configuration for the driver.
        namespace: Ray namespace to connect under.
        ignore_version: Skip the client/server version compatibility check.
        _credentials: Explicit gRPC channel credentials.
        ray_init_kwargs: Extra kwargs forwarded from ray.init().

    Returns:
        Connection info dict for the established (or reused) connection.

    Raises:
        RuntimeError: If already connected and ``ignore_reinit_error`` is
            not set in ``ray_init_kwargs``.
    """
    if ray.is_connected():
        # BUG FIX: ray_init_kwargs defaults to None; guard before .get() so
        # a repeat call without kwargs raises the informative RuntimeError
        # below instead of an AttributeError.
        ignore_reinit_error = (ray_init_kwargs or {}).get(
            "ignore_reinit_error", False
        )
        if ignore_reinit_error:
            logger.info(
                "Calling ray.init() again after it has already been called. "
                "Reusing the existing Ray client connection."
            )
            return ray.get_context().client_worker.connection_info()
        raise RuntimeError(
            "Ray Client is already connected. Maybe you called "
            'ray.init("ray://<address>") twice by accident?'
        )

    # Enable the same hooks that RAY_CLIENT_MODE does, as calling
    # ray.init("ray://<address>") is specifically for using client mode.
    _set_client_hook_status(True)
    _explicitly_enable_client_mode()

    # TODO(barakmich): https://github.com/ray-project/ray/issues/13274
    # for supporting things like cert_path, ca_path, etc and creating
    # the correct metadata
    conn = ray.connect(
        conn_str,
        job_config=job_config,
        secure=secure,
        metadata=metadata,
        connection_retries=connection_retries,
        namespace=namespace,
        ignore_version=ignore_version,
        _credentials=_credentials,
        ray_init_kwargs=ray_init_kwargs,
    )
    return conn
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@Deprecated(
    message="Use ray.shutdown() instead. See detailed usage at {}.".format(
        f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-shutdown"  # noqa: E501
    )
)
def disconnect():
    """Disconnects from server; is idempotent."""
    # Delegates to the Ray client API facade; safe to call repeatedly.
    return ray.disconnect()
|
.venv/lib/python3.11/site-packages/ray/util/dask/scheduler_utils.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
The following is adapted from Dask release 2021.03.1:
|
| 3 |
+
https://github.com/dask/dask/blob/2021.03.1/dask/local.py
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from queue import Queue, Empty
|
| 8 |
+
|
| 9 |
+
from dask import config
|
| 10 |
+
from dask.callbacks import local_callbacks, unpack_callbacks
|
| 11 |
+
from dask.core import _execute_task, flatten, get_dependencies, has_tasks, reverse_dict
|
| 12 |
+
from dask.order import order
|
| 13 |
+
|
| 14 |
+
if os.name == "nt":
    # Python 3 windows Queue.get doesn't handle interrupts properly. To
    # workaround this we poll at a sufficiently large interval that it
    # shouldn't affect performance, but small enough that users trying to kill
    # an application shouldn't care.
    def queue_get(q):
        # Poll with a 100ms timeout so KeyboardInterrupt is delivered
        # promptly on Windows instead of being swallowed by a blocking get.
        while True:
            try:
                return q.get(block=True, timeout=0.1)
            except Empty:
                pass

else:

    def queue_get(q):
        # Non-Windows platforms handle interrupts during a blocking get.
        return q.get()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def start_state_from_dask(dsk, cache=None, sortkey=None):
    """Start state from a dask

    Builds the mutable scheduler state (dependencies, dependents, waiting
    sets, ready list, cache) that the async scheduler loop consumes.

    Examples
    --------
    >>> dsk = {
        'x': 1,
        'y': 2,
        'z': (inc, 'x'),
        'w': (add, 'z', 'y')}  # doctest: +SKIP
    >>> from pprint import pprint  # doctest: +SKIP
    >>> pprint(start_state_from_dask(dsk))  # doctest: +SKIP
    {'cache': {'x': 1, 'y': 2},
     'dependencies': {'w': {'z', 'y'}, 'x': set(), 'y': set(), 'z': {'x'}},
     'dependents': {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}},
     'finished': set(),
     'ready': ['z'],
     'released': set(),
     'running': set(),
     'waiting': {'w': {'z'}},
     'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}}
    """
    if sortkey is None:
        sortkey = order(dsk).get
    if cache is None:
        cache = config.get("cache", None)
    if cache is None:
        cache = dict()
    # Literal (task-free) values seed the cache directly.
    data_keys = set()
    for k, v in dsk.items():
        if not has_tasks(dsk, v):
            cache[k] = v
            data_keys.add(k)

    dsk2 = dsk.copy()
    dsk2.update(cache)

    dependencies = {k: get_dependencies(dsk2, k) for k in dsk}
    waiting = {k: v.copy() for k, v in dependencies.items() if k not in data_keys}

    dependents = reverse_dict(dependencies)
    # Cached values are already available, so remove them from every
    # dependent's wait set.
    for a in cache:
        for b in dependents.get(a, ()):
            waiting[b].remove(a)
    waiting_data = {k: v.copy() for k, v in dependents.items() if v}

    # Tasks with no outstanding dependencies are immediately runnable;
    # order them so the best candidate is popped from the end first.
    ready_set = {k for k, v in waiting.items() if not v}
    ready = sorted(ready_set, key=sortkey, reverse=True)
    waiting = {k: v for k, v in waiting.items() if v}

    state = {
        "dependencies": dependencies,
        "dependents": dependents,
        "waiting": waiting,
        "waiting_data": waiting_data,
        "cache": cache,
        "ready": ready,
        "running": set(),
        "finished": set(),
        "released": set(),
    }

    return state
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def execute_task(key, task_info, dumps, loads, get_id, pack_exception):
    """
    Compute task and handle all administration

    Deserializes the task payload, runs it, and serializes either the
    (result, worker id) pair or the packed exception.

    See Also
    --------
    _execute_task : actually execute task
    """
    failed = False
    try:
        task, data = loads(task_info)
        value = _execute_task(task, data)
        worker = get_id()
        payload = dumps((value, worker))
    except BaseException as exc:
        payload = pack_exception(exc, dumps)
        failed = True
    return key, payload, failed
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def release_data(key, state, delete=True):
    """Remove data from temporary storage

    Marks ``key`` as released and, when ``delete`` is set, evicts its
    value from the cache.

    See Also
    --------
    finish_task
    """
    waiting = state["waiting_data"]
    if key in waiting:
        # Nothing may still be waiting on this key when it is released.
        assert not waiting[key]
        waiting.pop(key)

    state["released"].add(key)

    if delete:
        state["cache"].pop(key)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# When True, finish_task prints per-key memory-release diagnostics
# (requires the optional `chest` package).
DEBUG = False
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def finish_task(
    dsk, key, state, results, sortkey, delete=True, release_data=release_data
):
    """
    Update execution state after a task finishes

    Mutates. This should run atomically (with a lock).

    Moves newly unblocked dependents to the ready list, releases
    dependency data that nothing else is waiting on (unless it is a
    requested result), and marks ``key`` finished.
    """
    for dep in sorted(state["dependents"][key], key=sortkey, reverse=True):
        s = state["waiting"][dep]
        s.remove(key)
        if not s:
            # Last outstanding dependency satisfied: dep is now runnable.
            del state["waiting"][dep]
            state["ready"].append(dep)

    for dep in state["dependencies"][key]:
        if dep in state["waiting_data"]:
            s = state["waiting_data"][dep]
            s.remove(key)
            if not s and dep not in results:
                if DEBUG:
                    from chest.core import nbytes

                    # BUG FIX: the original divided the map object (not the
                    # summed total) by 1e6 — sum(map(...) / 1e6) — which
                    # raises a TypeError whenever the DEBUG path runs.
                    # Sum first, then convert bytes to MB.
                    print(
                        "Key: %s\tDep: %s\t NBytes: %.2f\t Release"
                        % (
                            key,
                            dep,
                            sum(map(nbytes, state["cache"].values())) / 1e6,
                        )
                    )
                release_data(dep, state, delete=delete)
        elif delete and dep not in results:
            release_data(dep, state, delete=delete)

    state["finished"].add(key)
    state["running"].remove(key)

    return state
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def nested_get(ind, coll):
    """Get nested index from collection

    Examples
    --------
    >>> nested_get(1, 'abc')
    'b'
    >>> nested_get([1, 0], 'abc')
    ('b', 'a')
    >>> nested_get([[1, 0], [0, 1]], 'abc')
    (('b', 'a'), ('a', 'b'))
    """
    # A scalar index is a direct lookup; a list recurses into a tuple.
    if not isinstance(ind, list):
        return coll[ind]
    return tuple(nested_get(sub, coll) for sub in ind)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def default_get_id():
    """Default get_id"""
    # No worker identity is tracked by the default (synchronous) scheduler.
    return None
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def default_pack_exception(e, dumps):
    """Default pack_exception: re-raise instead of serializing.

    The bare ``raise`` re-raises the currently active exception, so this
    must be called from inside an ``except`` block (as execute_task does).
    """
    raise
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def reraise(exc, tb=None):
    """Raise ``exc``, attaching traceback ``tb`` when it differs.

    Used as the default ``raise_exception`` hook so remote tracebacks are
    preserved when an error is re-raised in the scheduling thread.
    """
    if exc.__traceback__ is not tb:
        raise exc.with_traceback(tb)
    raise exc
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def identity(x):
    """Identity function. Returns x.

    Default ``dumps``/``loads`` for schedulers that share an address space
    and therefore need no serialization.

    >>> identity(3)
    3
    """
    return x
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def get_async(
    apply_async,
    num_workers,
    dsk,
    result,
    cache=None,
    get_id=default_get_id,
    rerun_exceptions_locally=None,
    pack_exception=default_pack_exception,
    raise_exception=reraise,
    callbacks=None,
    dumps=identity,
    loads=identity,
    **kwargs
):
    """Asynchronous get function

    This is a general version of various asynchronous schedulers for dask. It
    takes a an apply_async function as found on Pool objects to form a more
    specific ``get`` method that walks through the dask array with parallel
    workers, avoiding repeat computation and minimizing memory use.

    Parameters
    ----------
    apply_async : function
        Asynchronous apply function as found on Pool or ThreadPool
    num_workers : int
        The number of active tasks we should have at any one time
    dsk : dict
        A dask dictionary specifying a workflow
    result : key or list of keys
        Keys corresponding to desired data
    cache : dict-like, optional
        Temporary storage of results
    get_id : callable, optional
        Function to return the worker id, takes no arguments. Examples are
        `threading.current_thread` and `multiprocessing.current_process`.
    rerun_exceptions_locally : bool, optional
        Whether to rerun failing tasks in local process to enable debugging
        (False by default)
    pack_exception : callable, optional
        Function to take an exception and ``dumps`` method, and return a
        serialized tuple of ``(exception, traceback)`` to send back to the
        scheduler. Default is to just raise the exception.
    raise_exception : callable, optional
        Function that takes an exception and a traceback, and raises an error.
    dumps: callable, optional
        Function to serialize task data and results to communicate between
        worker and parent. Defaults to identity.
    loads: callable, optional
        Inverse function of `dumps`. Defaults to identity.
    callbacks : tuple or list of tuples, optional
        Callbacks are passed in as tuples of length 5. Multiple sets of
        callbacks may be passed in as a list of tuples. For more information,
        see the dask.diagnostics documentation.

    See Also
    --------
    threaded.get
    """
    # Completed tasks are reported back through this queue as
    # (key, result_info, failed) triples.
    queue = Queue()

    if isinstance(result, list):
        result_flat = set(flatten(result))
    else:
        result_flat = {result}
    results = set(result_flat)

    dsk = dict(dsk)
    with local_callbacks(callbacks) as callbacks:
        _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks)
        started_cbs = []
        succeeded = False
        # if start_state_from_dask fails, we will have something
        # to pass to the final block.
        state = {}
        try:
            for cb in callbacks:
                if cb[0]:
                    cb[0](dsk)
                started_cbs.append(cb)

            keyorder = order(dsk)

            state = start_state_from_dask(dsk, cache=cache, sortkey=keyorder.get)

            for _, start_state, _, _, _ in callbacks:
                if start_state:
                    start_state(dsk, state)

            if rerun_exceptions_locally is None:
                rerun_exceptions_locally = config.get("rerun_exceptions_locally", False)

            if state["waiting"] and not state["ready"]:
                raise ValueError("Found no accessible jobs in dask")

            def fire_task():
                """Fire off a task to the thread pool"""
                # Choose a good task to compute
                key = state["ready"].pop()
                state["running"].add(key)
                for f in pretask_cbs:
                    f(key, dsk, state)

                # Prep data to send
                data = {dep: state["cache"][dep] for dep in get_dependencies(dsk, key)}
                # Submit
                apply_async(
                    execute_task,
                    args=(
                        key,
                        dumps((dsk[key], data)),
                        dumps,
                        loads,
                        get_id,
                        pack_exception,
                    ),
                    callback=queue.put,
                )

            # Seed initial tasks into the thread pool
            while state["ready"] and len(state["running"]) < num_workers:
                fire_task()

            # Main loop, wait on tasks to finish, insert new ones
            while state["waiting"] or state["ready"] or state["running"]:
                key, res_info, failed = queue_get(queue)
                if failed:
                    exc, tb = loads(res_info)
                    if rerun_exceptions_locally:
                        # Re-execute in this process so a debugger can
                        # catch the failure where it happens.
                        data = {
                            dep: state["cache"][dep]
                            for dep in get_dependencies(dsk, key)
                        }
                        task = dsk[key]
                        _execute_task(task, data)  # Re-execute locally
                    else:
                        raise_exception(exc, tb)
                res, worker_id = loads(res_info)
                state["cache"][key] = res
                finish_task(dsk, key, state, results, keyorder.get)
                for f in posttask_cbs:
                    f(key, res, dsk, state, worker_id)

                # Backfill the pool up to num_workers in-flight tasks.
                while state["ready"] and len(state["running"]) < num_workers:
                    fire_task()

            succeeded = True

        finally:
            # Always notify finish callbacks, even on failure.
            for _, _, _, _, finish in started_cbs:
                if finish:
                    finish(dsk, state, not succeeded)

    return nested_get(result, state["cache"])
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def apply_sync(func, args=(), kwds=None, callback=None):
    """Synchronously invoke ``func`` and feed its result to ``callback``.

    A naive, single-threaded stand-in for ``Pool.apply_async``: the call
    happens immediately on the current thread and the (optional) callback
    is invoked with the result before returning.

    Args:
        func: Callable to invoke.
        args: Positional arguments for ``func``.
        kwds: Keyword arguments for ``func`` (``None`` means none).
        callback: Optional callable that receives ``func``'s return value.
    """
    result = func(*args, **(kwds or {}))
    if callback is not None:
        callback(result)
|
.venv/lib/python3.11/site-packages/ray/util/debug.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict, namedtuple
|
| 2 |
+
import gc
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
import tracemalloc
|
| 7 |
+
from typing import Callable, List, Optional
|
| 8 |
+
from ray.util.annotations import DeveloperAPI
|
| 9 |
+
|
| 10 |
+
# Keys for which log_once() has already returned True in this process.
_logged = set()
# When True, log_once() unconditionally returns False
# (see disable_log_once_globally()).
_disabled = False
# When True, the _logged set expires 60s after the last bookkeeping update,
# letting keys fire again (see enable_periodic_logging()).
_periodic_log = False
# time.time() of the last log_once() bookkeeping update (new key added,
# or the set cleared in periodic mode).
_last_logged = 0.0
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@DeveloperAPI
def log_once(key):
    """Returns True if this is the "first" call for a given key.

    Various logging settings can adjust the definition of "first".

    Example:

    .. testcode::

        import logging
        from ray.util.debug import log_once

        logger = logging.getLogger(__name__)
        if log_once("some_key"):
            logger.info("Some verbose logging statement")
    """

    global _last_logged

    # Globally silenced: never report "first".
    if _disabled:
        return False

    # Brand-new key: remember it and report True exactly once.
    if key not in _logged:
        _logged.add(key)
        _last_logged = time.time()
        return True

    # Already-seen key. In periodic mode, once 60s pass without any
    # bookkeeping update, forget all keys so each can fire again on
    # its next call (this call itself still reports False).
    if _periodic_log and time.time() - _last_logged > 60.0:
        _logged.clear()
        _last_logged = time.time()

    return False
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@DeveloperAPI
def disable_log_once_globally():
    """Make log_once() return False in this process."""
    global _disabled
    # One-way switch; there is no public API to re-enable.
    _disabled = True
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@DeveloperAPI
def enable_periodic_logging():
    """Make log_once() periodically return True in this process."""
    global _periodic_log
    # With this flag set, log_once() expires its seen-key set 60s after
    # the last update, so keys can fire repeatedly.
    _periodic_log = True
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@DeveloperAPI
def reset_log_once(key: Optional[str] = None):
    """Resets log_once for the provided key.

    If you don't provide a key, resets log_once for all keys.
    """
    if key is None:
        _logged.clear()
        return
    # discard() (unlike remove()) tolerates keys that never logged.
    _logged.discard(key)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# A suspicious memory-allocating stack-trace that we should re-test
# to make sure it's not a false positive.
# (Wrapped in DeveloperAPI so the record type is exposed as developer API.)
Suspect = DeveloperAPI(
    namedtuple(
        "Suspect",
        [
            # The stack trace of the allocation, going back n frames, depending
            # on the tracemalloc.start(n) call.
            "traceback",
            # The amount of memory taken by this particular stack trace
            # over the course of the experiment.
            "memory_increase",
            # The slope of the scipy linear regression (x=iteration; y=memory size).
            "slope",
            # The rvalue of the scipy linear regression.
            "rvalue",
            # The memory size history (list of all memory sizes over all iterations).
            "hist",
        ],
    )
)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _test_some_code_for_memory_leaks(
    desc: str,
    init: Optional[Callable[[], None]],
    code: Callable[[], None],
    repeats: int,
    max_num_trials: int = 1,
) -> List[Suspect]:
    """Runs given code (and init code) n times and checks for memory leaks.

    Args:
        desc: A descriptor of the test.
        init: Optional code to be executed initially.
        code: The actual code to be checked for producing memory leaks.
        repeats: How many times to repeatedly execute `code`.
        max_num_trials: The maximum number of trials to run. A new trial is only
            run, if the previous one produced a memory leak. For all non-1st trials,
            `repeats` calculates as: actual_repeats = `repeats` * (trial + 1), where
            the first trial is 0.

    Returns:
        A list of Suspect objects, describing possible memory leaks. If list
        is empty, no leaks have been found.
    """

    def _i_print(i):
        # Progress indicator: one dot per 10 iterations, count per 100.
        if (i + 1) % 10 == 0:
            print(".", end="" if (i + 1) % 100 else f" {i + 1}\n", flush=True)

    # Do n trials to make sure a found leak is really one.
    suspicious = set()
    suspicious_stats = []
    for trial in range(max_num_trials):
        # Store up to n frames of each call stack.
        tracemalloc.start(20)

        # Maps traceback -> list of total allocation sizes (one per snapshot).
        table = defaultdict(list)

        # Repeat running code for n times.
        # Increase repeat value with each trial to make sure stats are more
        # solid each time (avoiding false positives).
        actual_repeats = repeats * (trial + 1)

        print(f"{desc} {actual_repeats} times.")

        # Initialize if necessary.
        if init is not None:
            init()
        # Run `code` n times, each time taking a memory snapshot.
        for i in range(actual_repeats):
            _i_print(i)
            # Manually trigger garbage collection before and after code runs
            # in order to make tracemalloc snapshots as accurate as possible.
            gc.collect()
            code()
            gc.collect()
            _take_snapshot(table, suspicious)
        print("\n")

        # Check, which traces have moved up in their memory consumption
        # constantly over time.
        suspicious.clear()
        suspicious_stats.clear()
        # Suspicious memory allocation found?
        suspects = _find_memory_leaks_in_table(table)
        for suspect in sorted(suspects, key=lambda s: s.memory_increase, reverse=True):
            # Only print out the biggest offender:
            if len(suspicious) == 0:
                _pprint_suspect(suspect)
                print("-> added to retry list")
            suspicious.add(suspect.traceback)
            suspicious_stats.append(suspect)

        tracemalloc.stop()

        # Some suspicious memory allocations found.
        if len(suspicious) > 0:
            print(f"{len(suspicious)} suspects found. Top-ten:")
            for i, s in enumerate(suspicious_stats):
                # Stop after ten entries. (Bugfix: the original used `i > 10`,
                # which printed eleven entries, i = 0..10, despite the
                # "Top-ten" label.)
                if i >= 10:
                    break
                print(
                    f"{i}) line={s.traceback[-1]} mem-increase={s.memory_increase}B "
                    f"slope={s.slope}B/detection rval={s.rvalue}"
                )
        # Nothing suspicious found -> Exit trial loop and return.
        else:
            print("No remaining suspects found -> returning")
            break

    # Print out final top offender.
    if len(suspicious_stats) > 0:
        _pprint_suspect(suspicious_stats[0])

    return suspicious_stats
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _take_snapshot(table, suspicious=None):
    """Record current per-traceback allocation sizes into ``table``.

    Takes a tracemalloc snapshot, groups live allocations by stack trace
    (as deep as the earlier ``tracemalloc.start(n)`` call allows), and
    appends each group's total size to ``table[traceback]``. When
    ``suspicious`` is a non-empty set, only tracebacks already in it are
    recorded (used on re-test trials to track known offenders only).
    """
    stats = tracemalloc.take_snapshot().statistics("traceback")
    # Only the first 100 (largest) allocation groups are considered.
    for entry in stats[:100]:
        if suspicious and entry.traceback not in suspicious:
            continue
        table[entry.traceback].append(entry.size)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _find_memory_leaks_in_table(table):
    """Scan a {traceback: [sizes]} table for steadily growing allocations.

    A traceback becomes a Suspect when its total memory grew by more than
    1000B between the first and last snapshot AND a linear regression over
    its size history shows a clearly positive slope with high confidence.
    """
    import scipy.stats
    import numpy as np

    suspects = []

    # Frames originating from measurement machinery itself are expected to
    # grow and must be ignored (including this very module).
    drive_separator = "\\\\" if os.name == "nt" else "/"
    ignore_markers = [
        "tracemalloc",
        "pycharm",
        "thirdparty_files/psutil",
        re.sub("\\.", drive_separator, __name__) + ".py",
    ]

    for traceback, hist in table.items():
        # Quick check: overall delta between first and last snapshot.
        memory_increase = hist[-1] - hist[0]
        # Only if memory increased, do we check further.
        if memory_increase <= 0.0:
            continue

        top_stack = str(traceback[-1])
        if any(marker in top_stack for marker in ignore_markers):
            continue

        # Linear regression over the size history (x=iteration, y=size).
        line = scipy.stats.linregress(x=np.arange(len(hist)), y=np.array(hist))

        # Escalating slope thresholds trade off against required confidence:
        # a weaker slope needs a higher r-value to count as a leak.
        looks_like_leak = memory_increase > 1000 and (
            (line.slope > 60.0 and line.rvalue > 0.875)
            or (line.slope > 20.0 and line.rvalue > 0.9)
            or (line.slope > 10.0 and line.rvalue > 0.95)
        )
        if looks_like_leak:
            suspects.append(
                Suspect(
                    traceback=traceback,
                    memory_increase=memory_increase,
                    slope=line.slope,
                    rvalue=line.rvalue,
                    hist=hist,
                )
            )

    return suspects
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _pprint_suspect(suspect):
    """Print a human-readable report for a single Suspect record."""
    header = (
        "Most suspicious memory allocation in traceback "
        "(only printing out this one, but all (less suspicious)"
        " suspects will be investigated as well):"
    )
    print(header)
    print("\n".join(suspect.traceback.format()))
    print(f"Increase total={suspect.memory_increase}B")
    print(f"Slope={suspect.slope} B/detection")
    print(f"Rval={suspect.rvalue}")
|
.venv/lib/python3.11/site-packages/ray/util/debugpy.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import threading
|
| 5 |
+
import importlib
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray.util.annotations import DeveloperAPI
|
| 9 |
+
|
| 10 |
+
# Module-level logger for the Ray debugpy integration.
log = logging.getLogger(__name__)

# Sentinel breakpoint id: when passed to set_trace(), the session is treated
# as a post-mortem stop on an unhandled exception (see _post_mortem()).
POST_MORTEM_ERROR_UUID = "post_mortem_error_uuid"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _try_import_debugpy():
    """Import and return the ``debugpy`` module if version >= 1.8.0.

    Returns:
        The imported ``debugpy`` module, or None (with an error logged)
        when debugpy is missing or too old.
    """

    def _version_tuple(version):
        # Compare versions numerically. The original code compared the raw
        # strings, which wrongly treats e.g. "1.10.0" as older than "1.8.0".
        nums = []
        for piece in version.split(".")[:3]:
            digits = ""
            for ch in piece:
                if not ch.isdigit():
                    break  # strip pre-release suffixes like "0b1"
                digits += ch
            nums.append(int(digits) if digits else 0)
        return tuple(nums)

    try:
        debugpy = importlib.import_module("debugpy")
        version = getattr(debugpy, "__version__", None)
        if version is None or _version_tuple(version) < (1, 8, 0):
            raise ImportError()
        return debugpy
    except (ModuleNotFoundError, ImportError):
        log.error(
            "Module 'debugpy>=1.8.0' cannot be loaded. "
            "Ray Debugpy Debugger will not work without 'debugpy>=1.8.0' installed. "
            "Install this module using 'pip install debugpy==1.8.0' "
        )
        return None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# A lock to ensure that only one thread can open the debugger port.
# Acquired by _ensure_debugger_port_open_thread_safe() around the
# check-then-listen sequence.
debugger_port_lock = threading.Lock()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _override_breakpoint_hooks():
    """
    This method overrides the breakpoint() function to set_trace()
    so that other threads can reuse the same setup logic.
    This is based on: https://github.com/microsoft/debugpy/blob/ef9a67fe150179ee4df9997f9273723c26687fab/src/debugpy/_vendored/pydevd/pydev_sitecustomize/sitecustomize.py#L87 # noqa: E501
    """
    # Replace both the default and the active hook, so that even a later
    # `sys.breakpointhook = sys.__breakpointhook__` reset lands on set_trace.
    sys.__breakpointhook__ = set_trace
    sys.breakpointhook = set_trace
    import builtins as __builtin__

    # Rebind the builtin breakpoint() itself as well.
    __builtin__.breakpoint = set_trace
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _ensure_debugger_port_open_thread_safe():
    """
    This is a thread safe method that ensure that the debugger port
    is open, and if not, open it.
    """

    # The lock is acquired before checking the debugger port so only
    # one thread can open the debugger port.
    with debugger_port_lock:
        debugpy = _try_import_debugpy()
        if not debugpy:
            # debugpy unavailable; the import helper already logged an error.
            return

        debugger_port = ray._private.worker.global_worker.debugger_port
        if not debugger_port:
            # Port 0 asks the OS to pick a free port on this node's IP.
            (host, port) = debugpy.listen(
                (ray._private.worker.global_worker.node_ip_address, 0)
            )
            # Record the chosen port on the worker so later calls (and other
            # threads) see the debugger as already open.
            ray._private.worker.global_worker.set_debugger_port(port)
            log.info(f"Ray debugger is listening on {host}:{port}")
        else:
            log.info(f"Ray debugger is already open on {debugger_port}")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@DeveloperAPI
def set_trace(breakpoint_uuid=None):
    """Interrupt the flow of the program and drop into the Ray debugger.

    Can be used within a Ray task or actor.

    Args:
        breakpoint_uuid: Optional breakpoint id. When equal to
            POST_MORTEM_ERROR_UUID, the debugger stops post-mortem on the
            current unhandled exception instead of on a plain breakpoint.
    """
    debugpy = _try_import_debugpy()
    if not debugpy:
        # Without debugpy installed this is a no-op (error already logged).
        return

    # Open (or reuse) the per-worker debugger listen port.
    _ensure_debugger_port_open_thread_safe()

    # debugpy overrides the breakpoint() function, so we need to set it back
    # so other threads can reuse it.
    _override_breakpoint_hooks()

    # Mark the worker as paused while we block waiting for a client.
    with ray._private.worker.global_worker.worker_paused_by_debugger():
        msg = (
            "Waiting for debugger to attach (see "
            "https://docs.ray.io/en/latest/ray-observability/"
            "ray-distributed-debugger.html)..."
        )
        log.info(msg)
        debugpy.wait_for_client()

    log.info("Debugger client is connected")
    if breakpoint_uuid == POST_MORTEM_ERROR_UUID:
        _debugpy_excepthook()
    else:
        _debugpy_breakpoint()
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _debugpy_breakpoint():
    """
    Drop the user into the debugger on a breakpoint.
    """
    import pydevd

    # Stop in the caller's frame (the user's code), not inside this helper.
    pydevd.settrace(stop_at_frame=sys._getframe().f_back)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def _debugpy_excepthook():
    """
    Drop the user into the debugger on an unhandled exception.

    NOTE(review): relies on pydevd internals (get_global_debugger,
    set_additional_thread_info, stop_on_unhandled_exception); assumes a
    pydevd debugger session is active — confirm against the attached
    debugpy version.
    """
    import threading

    import pydevd

    py_db = pydevd.get_global_debugger()
    thread = threading.current_thread()
    additional_info = py_db.set_additional_thread_info(thread)
    # Flag this thread as tracing so pydevd treats the stop as deliberate.
    additional_info.is_tracing += 1
    try:
        # The (type, value, traceback) of the exception currently handled.
        error = sys.exc_info()
        py_db.stop_on_unhandled_exception(py_db, thread, additional_info, error)
        # Re-raise through the excepthook so normal reporting still happens.
        sys.excepthook(error[0], error[1], error[2])
    finally:
        additional_info.is_tracing -= 1
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _is_ray_debugger_post_mortem_enabled():
    """Return True when RAY_DEBUG_POST_MORTEM=1 is set in the environment."""
    flag = os.environ.get("RAY_DEBUG_POST_MORTEM", "0")
    return flag == "1"
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _post_mortem():
    """Enter the debugger post-mortem on the current unhandled exception."""
    # The sentinel uuid routes set_trace() to _debugpy_excepthook().
    return set_trace(POST_MORTEM_ERROR_UUID)
|
.venv/lib/python3.11/site-packages/ray/util/iter_metrics.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import collections
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
from ray.util.annotations import Deprecated
|
| 5 |
+
from ray.util.timer import _Timer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@Deprecated
class MetricsContext:
    """Metrics context object for a local iterator.

    This object is accessible by all operators of a local iterator. It can be
    used to store and retrieve global execution metrics for the iterator.
    It can be accessed by calling LocalIterator.get_metrics(), which is only
    allowable inside iterator functions.

    Attributes:
        counters: dict storing increasing metrics.
        timers: dict storing latency timers.
        info: dict storing misc metric values.
        current_actor: reference to the actor handle that
            produced the current iterator output. This is automatically set
            for gather_async().
    """

    def __init__(self):
        self.counters = collections.defaultdict(int)
        self.timers = collections.defaultdict(_Timer)
        self.info = {}
        self.current_actor = None

    def save(self):
        """Return a serializable copy of this context."""
        snapshot = {
            "counters": dict(self.counters),
            "info": dict(self.info),
            # TODO(ekl) consider persisting timers too
            "timers": None,
        }
        return snapshot

    def restore(self, values):
        """Restores state given the output of save()."""
        # Counters are repopulated in place; timers are simply reset since
        # save() does not persist them.
        self.counters.clear()
        self.counters.update(values["counters"])
        self.timers.clear()
        self.info = values["info"]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@Deprecated
class SharedMetrics:
    """Holds an indirect reference to a (shared) metrics context.

    This is used by LocalIterator.union() to point the metrics contexts of
    entirely separate iterator chains to the same underlying context."""

    def __init__(
        self, metrics: MetricsContext = None, parents: List["SharedMetrics"] = None
    ):
        self.metrics = metrics or MetricsContext()
        self.parents = parents or []
        # Immediately propagate our context to every parent chain.
        self.set(self.metrics)

    def set(self, metrics):
        """Recursively point self and all parents at the same metrics."""
        self.metrics = metrics
        for ancestor in self.parents:
            ancestor.set(metrics)

    def get(self):
        """Return the currently shared MetricsContext."""
        return self.metrics
|
.venv/lib/python3.11/site-packages/ray/util/lightgbm/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fail fast at import time: this integration was removed in Ray 2.0.
# NOTE(review): raising DeprecationWarning (a Warning subclass) as an
# exception appears deliberate here, so any import of this package errors
# loudly rather than merely warning.
raise DeprecationWarning(
    "ray.util.lightgbm has been removed as of Ray 2.0. Instead, use the `lightgbm-ray` "
    "library directly or the `LightGBMTrainer` in Ray Train."
)
|
.venv/lib/python3.11/site-packages/ray/util/lightgbm/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (379 Bytes). View file
|
|
|