diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd81608a2a81f704565bc2850167fe8af0490232
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/aliyun/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b433c02aab1792016c2217e255963acc68eb375
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/autoscaler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/autoscaler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80583b4ccb15784ea96a7935b55b4b2e692a4a5b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/autoscaler.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/event_logger.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/event_logger.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d05d6671d9d91109ca0c3863c2211b151d5a724
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/event_logger.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/metrics_reporter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/metrics_reporter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a7de65f27e9ae4184e7e95da0b94af3a7d4e3f1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/metrics_reporter.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/monitor.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/monitor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05f65fdea0d584742bbd9a4f6281d8b9bba2e0c0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/monitor.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/scheduler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/scheduler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a4dd9d9132907cf6cad234bda5d5c8cd9886bd3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/scheduler.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/schema.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/schema.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a36f83846647e265a7cb73a74bd53f1265c1ae84
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/schema.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/sdk.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/sdk.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e29c321038d7c82695a8f6c4855300fdc38159b5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/sdk.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..684b0b7d567952eadf69163dc9809d5b7c4a669c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__pycache__/utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..587c712a1b5d1d9d4f985fabb4ebc653928c3aff
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/common.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a453620b5ea52c6aeb76b676edcc805e798d4ed1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/common.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d2e2bde4b8a7e89a20f7df2446e9020284897a6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/config.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..afb17ef0a9c599a255e12b1e0edf877376125e82
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_manager.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_storage.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_storage.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9475d9a31495d256ca9b3989148f190efee9b23b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/instance_storage.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/node_provider.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee79bb14fecbf0a604fea8faab1e69c38bde072b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/node_provider.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/ray_installer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/ray_installer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85c05d8c1bcba1647ee32418e75097309fb70bdf
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/ray_installer.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/reconciler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/reconciler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb41c421095b86d31384f574fd6abd59e095335d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/reconciler.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/storage.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/storage.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..091ebc2662011986089f5c79097aa4bb656ee4e4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__pycache__/storage.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b5c4c0589a42c4a2799a4d5852023f607277969
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ab19179e2ac43b269fb761e25b03450755374da
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/cloud_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/cloud_provider.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..096bb7d5325b7cb2e9c3c129ffbc60916b99c2de
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/__pycache__/cloud_provider.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1b8ddc2a31b91853ef505a52893a0ee4f6ab9fb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/kuberay/cloud_provider.py
@@ -0,0 +1,571 @@
+import copy
+import logging
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+import requests
+
+# TODO(rickyx): We should eventually remove these imports
+# when we deprecate the v1 kuberay node provider.
+from ray.autoscaler._private.kuberay.node_provider import (
+    KUBERAY_KIND_HEAD,
+    KUBERAY_KIND_WORKER,
+    KUBERAY_LABEL_KEY_KIND,
+    KUBERAY_LABEL_KEY_TYPE,
+    RAY_HEAD_POD_NAME,
+    IKubernetesHttpApiClient,
+    KubernetesHttpApiClient,
+    _worker_group_index,
+    _worker_group_max_replicas,
+    _worker_group_replicas,
+    worker_delete_patch,
+    worker_replica_patch,
+)
+from ray.autoscaler.v2.instance_manager.node_provider import (
+    CloudInstance,
+    CloudInstanceId,
+    CloudInstanceProviderError,
+    ICloudInstanceProvider,
+    LaunchNodeError,
+    NodeKind,
+    TerminateNodeError,
+)
+from ray.autoscaler.v2.schema import NodeType
+
+logger = logging.getLogger(__name__)
+
+
+class KubeRayProvider(ICloudInstanceProvider):
+    """
+    This class is a thin wrapper around the Kubernetes API client. It modifies
+    the RayCluster resource spec on the Kubernetes API server to scale the
+    cluster: it launches new instances/nodes by submitting patches to the
+    Kubernetes API to update the RayCluster CRD.
+    """
+
+    def __init__(
+        self,
+        cluster_name: str,
+        provider_config: Dict[str, Any],
+        k8s_api_client: Optional[IKubernetesHttpApiClient] = None,
+    ):
+        """
+        Args:
+            cluster_name: The name of the RayCluster resource.
+            provider_config: The provider config, which must contain the
+                namespace of the RayCluster.
+            k8s_api_client: The client to the Kubernetes API server.
+                This could be used to mock the Kubernetes API server for
+                testing.
+        """
+        self._cluster_name = cluster_name
+        self._namespace = provider_config["namespace"]
+
+        self._k8s_api_client = k8s_api_client or KubernetesHttpApiClient(
+            namespace=self._namespace
+        )
+
+        # Below are states that are cached locally.
+        self._requests = set()
+        self._launch_errors_queue = []
+        self._terminate_errors_queue = []
+
+        # Below are states that are fetched from the Kubernetes API server.
+        self._ray_cluster = None
+        self._cached_instances: Dict[CloudInstanceId, CloudInstance] = {}
+
+    @dataclass
+    class ScaleRequest:
+        """Represents a scale request that contains the current states and
+        target states for the ray cluster.
+
+        This class will be converted to patches to be submitted to the
+        Kubernetes API server:
+        - For launching new instances, it will adjust the `replicas` field in
+          the workerGroupSpecs.
+        - For terminating instances, it will adjust the `workersToDelete`
+          field in the workerGroupSpecs.
+        """
+
+        # The desired number of workers for each node type.
+        desired_num_workers: Dict[NodeType, int] = field(default_factory=dict)
+        # The workers to delete for each node type.
+        workers_to_delete: Dict[NodeType, List[CloudInstanceId]] = field(
+            default_factory=dict
+        )
+        # The worker groups with an empty workersToDelete field.
+        # This is needed since we will also need to clear the workersToDelete
+        # field for the worker groups that have finished deletes.
+        worker_groups_without_pending_deletes: Set[NodeType] = field(
+            default_factory=set
+        )
+        # The worker groups that still have workers to be deleted.
+        worker_groups_with_pending_deletes: Set[NodeType] = field(default_factory=set)
+
+    ################################
+    # Interface for ICloudInstanceProvider
+    ################################
+
+    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
+        self._sync_with_api_server()
+        return copy.deepcopy(
+            {id: instance for id, instance in self._cached_instances.items()}
+        )
+
+    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
+        if request_id in self._requests:
+            # This request has already been processed.
+            logger.warning(f"Request {request_id} is already processed for: {ids}")
+            return
+        self._requests.add(request_id)
+        logger.info("Terminating worker pods: {}".format(ids))
+
+        scale_request = self._initialize_scale_request(
+            to_launch={}, to_delete_instances=ids
+        )
+        if scale_request.worker_groups_with_pending_deletes:
+            error_msg = (
+                "There are workers to be deleted from: "
+                f"{scale_request.worker_groups_with_pending_deletes}. "
+                "Waiting for them to be deleted before adding new workers "
+                "to be deleted."
+            )
+            logger.warning(error_msg)
+            self._add_terminate_errors(
+                ids,
+                request_id,
+                details=error_msg,
+            )
+            return
+
+        try:
+            self._submit_scale_request(scale_request)
+        except Exception as e:
+            logger.exception(f"Error terminating nodes: {scale_request}")
+            self._add_terminate_errors(ids, request_id, details=str(e), e=e)
+
+    def launch(self, shape: Dict[NodeType, int], request_id: str) -> None:
+        if request_id in self._requests:
+            # This request has already been processed.
+            return
+        self._requests.add(request_id)
+
+        scale_request = self._initialize_scale_request(
+            to_launch=shape, to_delete_instances=[]
+        )
+
+        if scale_request.worker_groups_with_pending_deletes:
+            error_msg = (
+                "There are workers to be deleted from: "
+                f"{scale_request.worker_groups_with_pending_deletes}. "
+                "Waiting for them to be deleted before creating new workers."
+            )
+            logger.warning(error_msg)
+            self._add_launch_errors(
+                shape,
+                request_id,
+                details=error_msg,
+            )
+            return
+
+        try:
+            self._submit_scale_request(scale_request)
+        except Exception as e:
+            logger.exception(f"Error launching nodes: {scale_request}")
+            self._add_launch_errors(shape, request_id, details=str(e), e=e)
+
+    def poll_errors(self) -> List[CloudInstanceProviderError]:
+        errors = []
+        errors += self._launch_errors_queue
+        errors += self._terminate_errors_queue
+        self._launch_errors_queue = []
+        self._terminate_errors_queue = []
+        return errors
+
+    ############################
+    # Private
+    ############################
+
+    def _initialize_scale_request(
+        self, to_launch: Dict[NodeType, int], to_delete_instances: List[CloudInstanceId]
+    ) -> "KubeRayProvider.ScaleRequest":
+        """
+        Initialize the scale request based on the current state of the cluster
+        and the desired state (to launch, to delete).
+
+        Args:
+            to_launch: The desired number of workers to launch for each node type.
+            to_delete_instances: The instances to delete.
+
+        Returns:
+            The scale request.
+        """
+
+        # Update the cached states.
+        self._sync_with_api_server()
+        ray_cluster = self.ray_cluster
+        cur_instances = self.instances
+
+        # Get the worker groups that have pending deletes and the worker groups
+        # that have finished deletes, and the set of workers included in the
+        # workersToDelete field of any worker group.
+        (
+            worker_groups_with_pending_deletes,
+            worker_groups_without_pending_deletes,
+            worker_to_delete_set,
+        ) = self._get_workers_delete_info(ray_cluster, set(cur_instances.keys()))
+
+        # Calculate the desired number of workers by type.
+        num_workers_dict = defaultdict(int)
+        worker_groups = ray_cluster["spec"].get("workerGroupSpecs", [])
+        for worker_group in worker_groups:
+            node_type = worker_group["groupName"]
+            # Handle the case where users manually increase `minReplicas`
+            # to scale up the number of worker Pods. In this scenario,
+            # `replicas` will be smaller than `minReplicas`.
+            num_workers_dict[node_type] = max(
+                worker_group["replicas"], worker_group["minReplicas"]
+            )
+
+        # Add to launch nodes.
+        for node_type, count in to_launch.items():
+            num_workers_dict[node_type] += count
+
+        to_delete_instances_by_type = defaultdict(list)
+        # Update the number of workers with to_delete_instances
+        # and group them by type.
+        for to_delete_id in to_delete_instances:
+            to_delete_instance = cur_instances.get(to_delete_id, None)
+            if to_delete_instance is None:
+                # This instance has already been deleted.
+                continue
+
+            if to_delete_instance.node_kind == NodeKind.HEAD:
+                # It is not possible to delete the head node.
+                continue
+
+            if to_delete_instance.cloud_instance_id in worker_to_delete_set:
+                # If the instance is already in the workersToDelete field of
+                # any worker group, skip it.
+                continue
+
+            num_workers_dict[to_delete_instance.node_type] -= 1
+            assert num_workers_dict[to_delete_instance.node_type] >= 0
+            to_delete_instances_by_type[to_delete_instance.node_type].append(
+                to_delete_instance
+            )
+
+        scale_request = KubeRayProvider.ScaleRequest(
+            desired_num_workers=num_workers_dict,
+            workers_to_delete=to_delete_instances_by_type,
+            worker_groups_without_pending_deletes=worker_groups_without_pending_deletes,
+            worker_groups_with_pending_deletes=worker_groups_with_pending_deletes,
+        )
+
+        return scale_request
+
+    def _submit_scale_request(
+        self, scale_request: "KubeRayProvider.ScaleRequest"
+    ) -> None:
+        """Submits a scale request to the Kubernetes API server.
+
+        This method will convert the scale request to patches and submit the
+        patches to the Kubernetes API server.
+
+        Args:
+            scale_request: The scale request.
+
+        Raises:
+            Exception: An exception is raised if the Kubernetes API server
+                returns an error.
+        """
+        # Get the current ray cluster spec.
+        patch_payload = []
+
+        raycluster = self.ray_cluster
+
+        # Collect patches for replica counts.
+        for node_type, target_replicas in scale_request.desired_num_workers.items():
+            group_index = _worker_group_index(raycluster, node_type)
+            group_max_replicas = _worker_group_max_replicas(raycluster, group_index)
+            # Cap the replica count to maxReplicas.
+            if group_max_replicas is not None and group_max_replicas < target_replicas:
+                logger.warning(
+                    "Autoscaler attempted to create "
+                    + "more than maxReplicas pods of type {}.".format(node_type)
+                )
+                target_replicas = group_max_replicas
+            # Check if we need to change the target count.
+            if target_replicas == _worker_group_replicas(raycluster, group_index):
+                # No patch required.
+                continue
+            # Need to patch replica count. Format the patch and add it to the
+            # payload.
+            patch = worker_replica_patch(group_index, target_replicas)
+            patch_payload.append(patch)
+
+        # Maps node_type to nodes to delete for that group.
+        for (
+            node_type,
+            workers_to_delete_of_type,
+        ) in scale_request.workers_to_delete.items():
+            group_index = _worker_group_index(raycluster, node_type)
+            worker_ids_to_delete = [
+                worker.cloud_instance_id for worker in workers_to_delete_of_type
+            ]
+            patch = worker_delete_patch(group_index, worker_ids_to_delete)
+            patch_payload.append(patch)
+
+        # Clear the workersToDelete field for the worker groups that have
+        # finished deletes.
+        for node_type in scale_request.worker_groups_without_pending_deletes:
+            if node_type in scale_request.workers_to_delete:
+                # This node type still has workers being deleted.
+                continue
+            group_index = _worker_group_index(raycluster, node_type)
+            patch = worker_delete_patch(group_index, [])
+            patch_payload.append(patch)
+
+        if len(patch_payload) == 0:
+            # No patch required.
+            return
+
+        logger.info(f"Submitting a scale request: {scale_request}")
+        self._patch(f"rayclusters/{self._cluster_name}", patch_payload)
+
+    def _add_launch_errors(
+        self,
+        shape: Dict[NodeType, int],
+        request_id: str,
+        details: str,
+        e: Optional[Exception] = None,
+    ) -> None:
+        """
+        Adds launch errors to the error queue.
+
+        Args:
+            shape: The shape of the nodes that failed to launch.
+            request_id: The request id of the launch request.
+            details: The details of the error.
+            e: The exception that caused the error.
+        """
+        for node_type, count in shape.items():
+            self._launch_errors_queue.append(
+                LaunchNodeError(
+                    node_type=node_type,
+                    timestamp_ns=time.time_ns(),
+                    count=count,
+                    request_id=request_id,
+                    details=details,
+                    cause=e,
+                )
+            )
+
+    def _add_terminate_errors(
+        self,
+        ids: List[CloudInstanceId],
+        request_id: str,
+        details: str,
+        e: Optional[Exception] = None,
+    ) -> None:
+        """
+        Adds terminate errors to the error queue.
+
+        Args:
+            ids: The ids of the nodes that failed to terminate.
+            request_id: The request id of the terminate request.
+            details: The details of the error.
+            e: The exception that caused the error.
+        """
+        for id in ids:
+            self._terminate_errors_queue.append(
+                TerminateNodeError(
+                    cloud_instance_id=id,
+                    timestamp_ns=time.time_ns(),
+                    request_id=request_id,
+                    details=details,
+                    cause=e,
+                )
+            )
+
+    def _sync_with_api_server(self) -> None:
+        """Fetches the RayCluster resource from the Kubernetes API server."""
+        self._ray_cluster = self._get(f"rayclusters/{self._cluster_name}")
+        self._cached_instances = self._fetch_instances()
+
+    @property
+    def ray_cluster(self) -> Dict[str, Any]:
+        return copy.deepcopy(self._ray_cluster)
+
+    @property
+    def instances(self) -> Dict[CloudInstanceId, CloudInstance]:
+        return copy.deepcopy(self._cached_instances)
+
+    @staticmethod
+    def _get_workers_delete_info(
+        ray_cluster_spec: Dict[str, Any], node_set: Set[CloudInstanceId]
+    ) -> Tuple[Set[NodeType], Set[NodeType], Set[CloudInstanceId]]:
+        """
+        Gets the worker groups that have pending deletes and the worker groups
+        that have finished deletes.
+
+        Returns:
+            worker_groups_with_pending_deletes: The worker groups that have
+                pending deletes.
+            worker_groups_with_finished_deletes: The worker groups that have
+                finished deletes.
+            worker_to_delete_set: A set of Pods that are included in the
+                workersToDelete field of any worker group.
+        """
+
+        worker_groups_with_pending_deletes = set()
+        worker_groups_with_deletes = set()
+        worker_to_delete_set = set()
+
+        worker_groups = ray_cluster_spec["spec"].get("workerGroupSpecs", [])
+        for worker_group in worker_groups:
+            workersToDelete = worker_group.get("scaleStrategy", {}).get(
+                "workersToDelete", []
+            )
+            if not workersToDelete:
+                # No workers to delete in this group.
+                continue
+
+            node_type = worker_group["groupName"]
+            worker_groups_with_deletes.add(node_type)
+
+            for worker in workersToDelete:
+                worker_to_delete_set.add(worker)
+                if worker in node_set:
+                    worker_groups_with_pending_deletes.add(node_type)
+                    break
+
+        worker_groups_with_finished_deletes = (
+            worker_groups_with_deletes - worker_groups_with_pending_deletes
+        )
+        return (
+            worker_groups_with_pending_deletes,
+            worker_groups_with_finished_deletes,
+            worker_to_delete_set,
+        )
+
+    def _fetch_instances(self) -> Dict[CloudInstanceId, CloudInstance]:
+        """
+        Fetches the pods from the Kubernetes API server and converts them to
+        Ray CloudInstances.
+
+        Returns:
+            A dict of CloudInstanceId to CloudInstance.
+        """
+        # Get the pods resource version.
+        # Specifying a resource version in list requests is important for
+        # scalability:
+        # https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list
+        resource_version = self._get_head_pod_resource_version()
+        if resource_version:
+            logger.info(
+                f"Listing pods for RayCluster {self._cluster_name}"
+                f" in namespace {self._namespace}"
+                f" at pods resource version >= {resource_version}."
+            )
+
+        # Filter pods by cluster_name.
+        label_selector = requests.utils.quote(f"ray.io/cluster={self._cluster_name}")
+
+        resource_path = f"pods?labelSelector={label_selector}"
+        if resource_version:
+            resource_path += (
+                f"&resourceVersion={resource_version}"
+                + "&resourceVersionMatch=NotOlderThan"
+            )
+
+        pod_list = self._get(resource_path)
+        fetched_resource_version = pod_list["metadata"]["resourceVersion"]
+        logger.info(f"Fetched pod data at resource version {fetched_resource_version}.")
+
+        # Extract node data from the pod list.
+        cloud_instances = {}
+        for pod in pod_list["items"]:
+            # Kubernetes sets metadata.deletionTimestamp immediately after
+            # admitting a request to delete an object. Full removal of the
+            # object may take some time after the deletion timestamp is set.
+            # See link for details:
+            # https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion
+            if "deletionTimestamp" in pod["metadata"]:
+                # Ignore pods marked for termination.
+                continue
+            pod_name = pod["metadata"]["name"]
+            cloud_instance = self._cloud_instance_from_pod(pod)
+            if cloud_instance:
+                cloud_instances[pod_name] = cloud_instance
+        return cloud_instances
+
+    @staticmethod
+    def _cloud_instance_from_pod(pod: Dict[str, Any]) -> Optional[CloudInstance]:
+        """
+        Converts a pod to a Ray CloudInstance.
+
+        Args:
+            pod: The pod resource dict.
+        """
+        labels = pod["metadata"]["labels"]
+        if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD:
+            kind = NodeKind.HEAD
+            type = labels[KUBERAY_LABEL_KEY_TYPE]
+        elif labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_WORKER:
+            kind = NodeKind.WORKER
+            type = labels[KUBERAY_LABEL_KEY_TYPE]
+        else:
+            # Other ray node types defined by KubeRay,
+            # e.g. this could also be `redis-cleanup`.
+            # We will not track these nodes.
+            return None
+
+        # TODO: we should probably get this from the pod's env var
+        # (RAY_CLOUD_INSTANCE_ID) directly.
+        cloud_instance_id = pod["metadata"]["name"]
+        return CloudInstance(
+            cloud_instance_id=cloud_instance_id,
+            node_type=type,
+            node_kind=kind,
+            is_running=KubeRayProvider._is_running(pod),
+        )
+
+    @staticmethod
+    def _is_running(pod) -> bool:
+        """Converts the pod state to a running flag.
+
+        A cloud instance is considered running if the pod is in the running
+        state; otherwise it could be pending or have terminated containers.
+
+        When it disappears from the list, it is considered terminated.
+        """
+        if (
+            "containerStatuses" not in pod["status"]
+            or not pod["status"]["containerStatuses"]
+        ):
+            return False
+
+        state = pod["status"]["containerStatuses"][0]["state"]
+        if "running" in state:
+            return True
+
+        return False
+
+    def _get(self, remote_path: str) -> Dict[str, Any]:
+        """Get a resource from the Kubernetes API server."""
+        return self._k8s_api_client.get(remote_path)
+
+    def _patch(self, remote_path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Patch a resource on the Kubernetes API server."""
+        return self._k8s_api_client.patch(remote_path, payload)
+
+    def _get_head_pod_resource_version(self) -> Optional[str]:
+        """
+        Extracts a recent pods resource version by reading
+        metadata.resourceVersion from the head pod's response.
+        """
+        if not RAY_HEAD_POD_NAME:
+            return None
+        pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}")
+        return pod_resp["metadata"]["resourceVersion"]
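For reference, the scale request above is ultimately serialized into JSON Patch operations against the RayCluster custom resource (via the worker_replica_patch and worker_delete_patch helpers imported at the top of the file). A rough, hand-written sketch of a combined launch-and-terminate submission; the group index, replica count, and pod name are hypothetical, and the exact shape of each patch is owned by those helpers:

    # Illustrative payload for PATCH rayclusters/<cluster_name> (a sketch,
    # not the helpers' verbatim output).
    patch_payload = [
        {
            "op": "replace",
            "path": "/spec/workerGroupSpecs/0/replicas",
            "value": 5,
        },
        {
            "op": "replace",
            "path": "/spec/workerGroupSpecs/0/scaleStrategy",
            "value": {"workersToDelete": ["raycluster-small-group-worker-abc12"]},
        },
    ]
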
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e0355d291b7c118a299f654e41ca26c53331712
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/cloud_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/cloud_provider.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8baff162ce050711af4135f2b25028e5ac43f9e8
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/__pycache__/cloud_provider.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/cloud_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/cloud_provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..7630d111f807ce2a85705be47c9a0d5b3dcc1caa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/cloud_providers/read_only/cloud_provider.py
@@ -0,0 +1,73 @@
+from typing import Dict, List
+
+from ray._private.utils import binary_to_hex
+from ray._raylet import GcsClient
+from ray.autoscaler._private.util import format_readonly_node_type
+from ray.autoscaler.v2.instance_manager.node_provider import (
+    CloudInstance,
+    CloudInstanceId,
+    CloudInstanceProviderError,
+    ICloudInstanceProvider,
+    NodeKind,
+)
+from ray.autoscaler.v2.sdk import get_cluster_resource_state
+from ray.autoscaler.v2.utils import is_head_node
+from ray.core.generated.autoscaler_pb2 import NodeStatus
+
+
+class ReadOnlyProvider(ICloudInstanceProvider):
+    """
+    A read-only provider that uses the ray node states from the GCS as the
+    cloud instances.
+
+    This is used for laptop mode / manual cluster setup modes, in order to
+    provide status reporting in the same way for users.
+    """
+
+    def __init__(self, provider_config: dict):
+        self._provider_config = provider_config
+        self._gcs_address = provider_config["gcs_address"]
+
+        self._gcs_client = GcsClient(address=self._gcs_address)
+
+    def get_non_terminated(self) -> Dict[str, CloudInstance]:
+        cluster_resource_state = get_cluster_resource_state(self._gcs_client)
+        cloud_instances = {}
+        for gcs_node_state in cluster_resource_state.node_states:
+            if gcs_node_state.status == NodeStatus.DEAD:
+                # Skip dead nodes.
+                continue
+
+            # Use the node's node id if the instance id is not available.
+            cloud_instance_id = (
+                gcs_node_state.instance_id
+                if gcs_node_state.instance_id
+                else binary_to_hex(gcs_node_state.node_id)
+            )
+
+            # TODO: we should add a field to the proto to indicate if the node
+            # is head or not.
+            is_head = is_head_node(gcs_node_state)
+
+            cloud_instances[cloud_instance_id] = CloudInstance(
+                cloud_instance_id=cloud_instance_id,
+                node_kind=NodeKind.HEAD if is_head else NodeKind.WORKER,
+                node_type=format_readonly_node_type(
+                    binary_to_hex(gcs_node_state.node_id)  # Legacy behavior.
+                ),
+                is_running=True,
+                request_id="",
+            )
+
+        return cloud_instances
+
+    def terminate(self, instance_id: CloudInstanceId) -> None:
+        raise NotImplementedError("Cannot terminate instances in read-only mode.")
+
+    def launch(
+        self, shape: Dict[CloudInstanceId, int], request_id: CloudInstanceId
+    ) -> None:
+        raise NotImplementedError("Cannot launch instances in read-only mode.")
+
+    def poll_errors(self) -> List[CloudInstanceProviderError]:
+        return []
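Since every mutating method raises NotImplementedError, the only interesting call on this provider is get_non_terminated(). A minimal usage sketch, assuming a local GCS at a hypothetical address:

    provider = ReadOnlyProvider(provider_config={"gcs_address": "127.0.0.1:6379"})
    for cloud_instance_id, instance in provider.get_non_terminated().items():
        # e.g. "<node-id-hex> NodeKind.HEAD node_<node-id-hex>"
        print(cloud_instance_id, instance.node_kind, instance.node_type)
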
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..106130c8ec995f5fbf0fa2e1d8b49816401d09d1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/cloud_instance_updater.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/cloud_instance_updater.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e26dfadf029b8a18f5ec8b12caf7347da1913fe7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/cloud_instance_updater.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/ray_stopper.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/ray_stopper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2be080984d9f631db5aedc04a40e3ae637970006
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/ray_stopper.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/threaded_ray_installer.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/threaded_ray_installer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..070f96c6c0d34d369e07dfceca9c80feeb53cfbc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/__pycache__/threaded_ray_installer.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/cloud_instance_updater.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/cloud_instance_updater.py
new file mode 100644
index 0000000000000000000000000000000000000000..747525439d2d1c157d6d276ec942744ba50a8fb5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/cloud_instance_updater.py
@@ -0,0 +1,93 @@
+import logging
+import uuid
+from collections import defaultdict
+from typing import List
+
+from ray.autoscaler.v2.instance_manager.instance_manager import (
+    InstanceUpdatedSubscriber,
+)
+from ray.autoscaler.v2.instance_manager.node_provider import ICloudInstanceProvider
+from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
+
+logger = logging.getLogger(__name__)
+
+
+class CloudInstanceUpdater(InstanceUpdatedSubscriber):
+    """CloudInstanceUpdater is responsible for launching new instances and
+    terminating cloud instances.
+
+    It requests the cloud instance provider to launch new instances when
+    there are new instance requests (with REQUESTED status change).
+
+    It requests the cloud instance provider to terminate instances when
+    there are new instance terminations (with TERMINATING status change).
+
+    The cloud instance APIs are async and non-blocking.
+    """
+
+    def __init__(
+        self,
+        cloud_provider: ICloudInstanceProvider,
+    ) -> None:
+        self._cloud_provider = cloud_provider
+
+    def notify(self, events: List[InstanceUpdateEvent]) -> None:
+        new_requests = [
+            event for event in events if event.new_instance_status == Instance.REQUESTED
+        ]
+        new_terminations = [
+            event
+            for event in events
+            if event.new_instance_status == Instance.TERMINATING
+        ]
+        self._launch_new_instances(new_requests)
+        self._terminate_instances(new_terminations)
+
+    def _terminate_instances(self, new_terminations: List[InstanceUpdateEvent]):
+        """
+        Terminate cloud instances through the cloud provider.
+
+        Args:
+            new_terminations: List of new instance terminations.
+        """
+        if not new_terminations:
+            logger.debug("No instances to terminate.")
+            return
+
+        # Terminate the instances.
+        cloud_instance_ids = [event.cloud_instance_id for event in new_terminations]
+
+        # This is an async call.
+        self._cloud_provider.terminate(
+            ids=cloud_instance_ids, request_id=str(uuid.uuid4())
+        )
+
+    def _launch_new_instances(self, new_requests: List[InstanceUpdateEvent]):
+        """
+        Launches new instances by requesting the cloud provider.
+
+        Args:
+            new_requests: List of new instance requests.
+        """
+        if not new_requests:
+            logger.debug("No instances to launch.")
+            return
+
+        # Group new requests by launch request id.
+        requests_by_launch_request_id = defaultdict(list)
+
+        for event in new_requests:
+            assert (
+                event.launch_request_id
+            ), "Launch request id should have been set by the reconciler"
+            requests_by_launch_request_id[event.launch_request_id].append(event)
+
+        for launch_request_id, events in requests_by_launch_request_id.items():
+            request_shape = defaultdict(int)
+            for event in events:
+                request_shape[event.instance_type] += 1
+            # Make requests to the cloud provider.
+            self._cloud_provider.launch(
+                shape=request_shape, request_id=launch_request_id
+            )
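Concretely, notify() collapses REQUESTED events that share a launch_request_id into a single launch call whose shape counts instances per type. With three hypothetical events for request "req-1" (two of type "worker-small", one of type "worker-gpu"), the provider would receive:

    cloud_provider.launch(
        shape={"worker-small": 2, "worker-gpu": 1},  # built as a defaultdict(int)
        request_id="req-1",
    )
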
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/ray_stopper.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/ray_stopper.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f00cf63dfbd1b1369b4de478d80d1b7ce46d87f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/ray_stopper.py
@@ -0,0 +1,154 @@
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from queue import Queue
+from typing import List
+
+from ray._private.utils import hex_to_binary
+from ray._raylet import GcsClient
+from ray.autoscaler.v2.instance_manager.instance_manager import (
+    InstanceUpdatedSubscriber,
+)
+from ray.core.generated.autoscaler_pb2 import DrainNodeReason
+from ray.core.generated.instance_manager_pb2 import (
+    Instance,
+    InstanceUpdateEvent,
+    TerminationRequest,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class RayStopError:
+    # Instance manager's instance id.
+    im_instance_id: str
+
+
+class RayStopper(InstanceUpdatedSubscriber):
+    """RayStopper is responsible for stopping ray on instances.
+
+    It will drain the ray node if it's an idle termination; for other
+    terminations (e.g. scale down), it will stop the ray node.
+
+    If any failures happen when stopping/draining the node, we will not retry
+    and rely on the reconciler to handle the failure.
+
+    TODO: we could also surface the errors back to the reconciler for
+    quicker failure detection.
+    """
+
+    def __init__(self, gcs_client: GcsClient, error_queue: Queue) -> None:
+        self._gcs_client = gcs_client
+        self._error_queue = error_queue
+        self._executor = ThreadPoolExecutor(max_workers=1)
+
+    def notify(self, events: List[InstanceUpdateEvent]) -> None:
+        for event in events:
+            if event.new_instance_status == Instance.RAY_STOP_REQUESTED:
+                fut = self._executor.submit(self._stop_or_drain_ray, event)
+
+                def _log_on_error(fut):
+                    try:
+                        fut.result()
+                    except Exception:
+                        logger.exception("Error stopping/draining ray.")
+
+                fut.add_done_callback(_log_on_error)
+
+    def _stop_or_drain_ray(self, event: InstanceUpdateEvent) -> None:
+        """
+        Stops or drains the ray node based on the termination request.
+        """
+        assert event.HasField("termination_request"), "Termination request is required."
+        termination_request = event.termination_request
+        ray_node_id = termination_request.ray_node_id
+        instance_id = event.instance_id
+
+        if termination_request.cause == TerminationRequest.Cause.IDLE:
+            reason = DrainNodeReason.DRAIN_NODE_REASON_IDLE_TERMINATION
+            reason_str = "Termination of node that's idle for {} seconds.".format(
+                termination_request.idle_duration_ms / 1000
+            )
+            self._drain_ray_node(
+                self._gcs_client,
+                self._error_queue,
+                ray_node_id,
+                instance_id,
+                reason,
+                reason_str,
+            )
+            return
+
+        # If it's not an idle termination, we stop the ray node.
+        self._stop_ray_node(
+            self._gcs_client, self._error_queue, ray_node_id, instance_id
+        )
+
+    @staticmethod
+    def _drain_ray_node(
+        gcs_client: GcsClient,
+        error_queue: Queue,
+        ray_node_id: str,
+        instance_id: str,
+        reason: DrainNodeReason,
+        reason_str: str,
+    ):
+        """
+        Drains the ray node.
+
+        Args:
+            gcs_client: The gcs client to use.
+            error_queue: The queue to report drain failures to.
+            ray_node_id: The ray node id to drain.
+            instance_id: The instance manager's instance id.
+            reason: The reason to drain the node.
+            reason_str: The reason message to drain the node.
+        """
+        try:
+            accepted, reject_msg_str = gcs_client.drain_node(
+                node_id=ray_node_id,
+                reason=reason,
+                reason_message=reason_str,
+                # TODO: we could probably add a deadline here that's derived
+                # from the stuck instance reconciliation configs.
+                deadline_timestamp_ms=0,
+            )
+            logger.info(
+                f"Drained ray on {ray_node_id} (success={accepted}, "
+                f"msg={reject_msg_str})"
+            )
+            if not accepted:
+                error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
+        except Exception:
+            logger.exception(f"Error draining ray on {ray_node_id}")
+            error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
+
+    @staticmethod
+    def _stop_ray_node(
+        gcs_client: GcsClient,
+        error_queue: Queue,
+        ray_node_id: str,
+        instance_id: str,
+    ):
+        """
+        Stops the ray node.
+
+        Args:
+            gcs_client: The gcs client to use.
+            error_queue: The queue to report stop failures to.
+            ray_node_id: The ray node id to stop.
+            instance_id: The instance manager's instance id.
+        """
+        try:
+            drained = gcs_client.drain_nodes(node_ids=[hex_to_binary(ray_node_id)])
+            success = len(drained) > 0
+            logger.info(
+                f"Stopping ray on {ray_node_id} (instance={instance_id}): "
+                f"success={success}"
+            )
+
+            if not success:
+                error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
+        except Exception:
+            logger.exception(
+                f"Error stopping ray on {ray_node_id} (instance={instance_id})"
+            )
+            error_queue.put_nowait(RayStopError(im_instance_id=instance_id))
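To make the two paths concrete: an update event carrying an IDLE termination_request takes the rejectable drain_node() path, while every other cause goes through drain_nodes(), which stops the node unconditionally. A sketch with hypothetical ids, built only from the proto fields used above (the ray_stopper instance is assumed to exist):

    from ray.core.generated.instance_manager_pb2 import (
        Instance,
        InstanceUpdateEvent,
        TerminationRequest,
    )

    event = InstanceUpdateEvent(
        instance_id="im-instance-0",  # hypothetical instance manager id
        new_instance_status=Instance.RAY_STOP_REQUESTED,
        termination_request=TerminationRequest(
            cause=TerminationRequest.Cause.IDLE,
            idle_duration_ms=600_000,
            ray_node_id="0123abcd",  # hypothetical hex ray node id
        ),
    )
    ray_stopper.notify([event])  # drains, rather than force-stops, the node
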
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aaf32f816a13ead29d3de1f721c77f24b8a291c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/subscribers/threaded_ray_installer.py
@@ -0,0 +1,95 @@
+import logging
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import List
+
+from ray.autoscaler.v2.instance_manager.instance_manager import (
+    InstanceUpdatedSubscriber,
+)
+from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
+from ray.autoscaler.v2.instance_manager.ray_installer import RayInstaller
+from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
+
+logger = logging.getLogger(__name__)
+
+
+class ThreadedRayInstaller(InstanceUpdatedSubscriber):
+    """ThreadedRayInstaller is responsible for installing ray on new nodes."""
+
+    def __init__(
+        self,
+        head_node_ip: str,
+        instance_storage: InstanceStorage,
+        ray_installer: RayInstaller,
+        max_install_attempts: int = 3,
+        install_retry_interval: int = 10,
+        max_concurrent_installs: int = 50,
+    ) -> None:
+        self._head_node_ip = head_node_ip
+        self._instance_storage = instance_storage
+        self._ray_installer = ray_installer
+        self._max_concurrent_installs = max_concurrent_installs
+        self._max_install_attempts = max_install_attempts
+        self._install_retry_interval = install_retry_interval
+        self._ray_installation_executor = ThreadPoolExecutor(
+            max_workers=self._max_concurrent_installs
+        )
+
+    def notify(self, events: List[InstanceUpdateEvent]) -> None:
+        for event in events:
+            if event.new_instance_status == Instance.ALLOCATED:
+                self._install_ray_on_new_nodes(event.instance_id)
+
+    def _install_ray_on_new_nodes(self, instance_id: str) -> None:
+        allocated_instance, _ = self._instance_storage.get_instances(
+            instance_ids={instance_id},
+            status_filter={Instance.ALLOCATED},
+        )
+        for instance in allocated_instance.values():
+            self._ray_installation_executor.submit(
+                self._install_ray_on_single_node, instance
+            )
+
+    def _install_ray_on_single_node(self, instance: Instance) -> None:
+        assert instance.status == Instance.ALLOCATED
+        # Mark the instance as RAY_INSTALLING before starting the install,
+        # so the status transition below is actually persisted.
+        instance.status = Instance.RAY_INSTALLING
+        success, version = self._instance_storage.upsert_instance(
+            instance, expected_instance_version=instance.version
+        )
+        if not success:
+            logger.warning(
+                f"Failed to update instance {instance.instance_id} to RAY_INSTALLING"
+            )
+            # We do not need to handle failures here; they will be covered by
+            # garbage collection.
+            return
+
+        # Install Ray with exponential backoff between retries.
+        installed = False
+        backoff_factor = 1
+        for _ in range(self._max_install_attempts):
+            installed = self._ray_installer.install_ray(instance, self._head_node_ip)
+            if installed:
+                break
+            logger.warning("Failed to install ray, retrying...")
+            time.sleep(self._install_retry_interval * backoff_factor)
+            backoff_factor *= 2
+
+        if not installed:
+            instance.status = Instance.RAY_INSTALL_FAILED
+        else:
+            instance.status = Instance.RAY_RUNNING
+        success, version = self._instance_storage.upsert_instance(
+            instance,
+            expected_instance_version=version,
+        )
+        if not success:
+            logger.warning(
+                f"Failed to update instance {instance.instance_id} to {instance.status}"
+            )
+            # We do not need to handle failures here; they will be covered by
+            # garbage collection.
+            return
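With the defaults above (max_install_attempts=3, install_retry_interval=10), a node that keeps failing sees sleeps of 10s, 20s, and 40s; note that the loop sleeps once more after the final failed attempt before the node is marked RAY_INSTALL_FAILED. The schedule, spelled out as a sketch:

    interval, backoff_factor = 10, 1
    for attempt in range(3):
        # install attempt happens here; on failure:
        print(f"attempt {attempt + 1} failed, sleeping {interval * backoff_factor}s")
        backoff_factor *= 2  # 10s -> 20s -> 40s
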
diff --git a/.venv/lib/python3.11/site-packages/ray/util/__init__.py b/.venv/lib/python3.11/site-packages/ray/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..257cfd3494d0fcdfbb2ce621c9c8e360395f7b86
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/__init__.py
@@ -0,0 +1,74 @@
+from typing import List
+
+import ray
+from ray._private.client_mode_hook import client_mode_hook
+from ray._private.auto_init_hook import wrap_auto_init
+from ray._private.services import get_node_ip_address
+from ray.util import iter
+from ray.util import rpdb as pdb
+from ray.util import debugpy as ray_debugpy
+from ray.util.actor_pool import ActorPool
+from ray.util import accelerators
+from ray.util.annotations import PublicAPI
+from ray.util.check_serialize import inspect_serializability
+from ray.util.client_connect import connect, disconnect
+from ray.util.debug import disable_log_once_globally, enable_periodic_logging, log_once
+from ray.util.placement_group import (
+    get_current_placement_group,
+    get_placement_group,
+    placement_group,
+    placement_group_table,
+    remove_placement_group,
+)
+from ray.util.serialization import deregister_serializer, register_serializer
+
+
+@PublicAPI(stability="beta")
+@wrap_auto_init
+@client_mode_hook
+def list_named_actors(all_namespaces: bool = False) -> List[str]:
+    """List all named actors in the system.
+
+    Actors must have been created with Actor.options(name="name").remote().
+    This works for both detached & non-detached actors.
+
+    By default, only actors in the current namespace will be returned
+    and the returned entries will simply be their name.
+
+    If `all_namespaces` is set to True, all actors in the cluster will be
+    returned regardless of namespace, and the returned entries will be of the
+    form {"namespace": namespace, "name": name}.
+    """
+    worker = ray._private.worker.global_worker
+    worker.check_connected()
+
+    actors = worker.core_worker.list_named_actors(all_namespaces)
+    if all_namespaces:
+        return [{"name": name, "namespace": namespace} for namespace, name in actors]
+    else:
+        return [name for _, name in actors]
+
+
+__all__ = [
+    "accelerators",
+    "ActorPool",
+    "disable_log_once_globally",
+    "enable_periodic_logging",
+    "iter",
+    "log_once",
+    "pdb",
+    "placement_group",
+    "placement_group_table",
+    "get_placement_group",
+    "get_current_placement_group",
+    "get_node_ip_address",
+    "remove_placement_group",
+    "ray_debugpy",
+    "inspect_serializability",
+    "collective",
+    "connect",
+    "disconnect",
+    "register_serializer",
+    "deregister_serializer",
+    "list_named_actors",
+]
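A minimal usage sketch for list_named_actors(), following its docstring; the actor name and namespace are illustrative:

    import ray

    @ray.remote
    class Counter:
        pass

    ray.init(namespace="demo")
    Counter.options(name="counter", lifetime="detached").remote()

    assert "counter" in ray.util.list_named_actors()
    # e.g. [{'namespace': 'demo', 'name': 'counter'}]
    print(ray.util.list_named_actors(all_namespaces=True))
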
diff --git a/.venv/lib/python3.11/site-packages/ray/util/actor_group.py b/.venv/lib/python3.11/site-packages/ray/util/actor_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..03ffcb1184c29ca703b8639713df4eac745bf933
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/actor_group.py
@@ -0,0 +1,230 @@
+import weakref
+from dataclasses import dataclass
+import logging
+from typing import List, TypeVar, Optional, Dict, Type, Tuple
+
+import ray
+from ray.actor import ActorHandle
+from ray.util.annotations import Deprecated
+from ray._private.utils import get_ray_doc_version
+
+T = TypeVar("T")
+ActorMetadata = TypeVar("ActorMetadata")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ActorWrapper:
+    """Class containing an actor and its metadata."""
+
+    actor: ActorHandle
+    metadata: ActorMetadata
+
+
+@dataclass
+class ActorConfig:
+    num_cpus: float
+    num_gpus: float
+    resources: Optional[Dict[str, float]]
+    init_args: Tuple
+    init_kwargs: Dict
+
+
+class ActorGroupMethod:
+    def __init__(self, actor_group: "ActorGroup", method_name: str):
+        self.actor_group = weakref.ref(actor_group)
+        self._method_name = method_name
+
+    def __call__(self, *args, **kwargs):
+        raise TypeError(
+            "ActorGroup methods cannot be called directly. Instead "
+            f"of running 'object.{self._method_name}()', try "
+            f"'object.{self._method_name}.remote()'."
+        )
+
+    def remote(self, *args, **kwargs):
+        return [
+            getattr(a.actor, self._method_name).remote(*args, **kwargs)
+            for a in self.actor_group().actors
+        ]
+
+
+@Deprecated(
+    message="For stateless/task processing, use ray.util.multiprocessing, see details "
+    f"in https://docs.ray.io/en/{get_ray_doc_version()}/ray-more-libs/multiprocessing.html. "  # noqa: E501
+    "For stateful/actor processing such as batch prediction, use "
+    "Datasets.map_batches(compute=ActorPoolStrategy, ...), see details in "
+    f"https://docs.ray.io/en/{get_ray_doc_version()}/data/api/dataset.html#ray.data.Dataset.map_batches.",  # noqa: E501
+    warning=True,
+)
+class ActorGroup:
+    """Group of Ray Actors that can execute arbitrary functions.
+
+    ``ActorGroup`` launches Ray actors according to the given
+    specification. It can then execute arbitrary Python functions in each of
+    these actors.
+
+    If not enough resources are available to launch the actors, the Ray
+    cluster will automatically scale up if autoscaling is enabled.
+
+    Args:
+        actor_cls: The class to use as the remote actors.
+        num_actors: The number of the provided Ray actors to
+            launch. Defaults to 1.
+        num_cpus_per_actor: The number of CPUs to reserve for each
+            actor. Fractional values are allowed. Defaults to 1.
+        num_gpus_per_actor: The number of GPUs to reserve for each
+            actor. Fractional values are allowed. Defaults to 0.
+        resources_per_actor (Optional[Dict[str, float]]):
+            Dictionary specifying the resources that will be
+            requested for each actor in addition to ``num_cpus_per_actor``
+            and ``num_gpus_per_actor``.
+        init_args, init_kwargs: If ``actor_cls`` is provided,
+            these args will be used for the actor initialization.
+    """
+
+    def __init__(
+        self,
+        actor_cls: Type,
+        num_actors: int = 1,
+        num_cpus_per_actor: float = 1,
+        num_gpus_per_actor: float = 0,
+        resources_per_actor: Optional[Dict[str, float]] = None,
+        init_args: Optional[Tuple] = None,
+        init_kwargs: Optional[Dict] = None,
+    ):
+        from ray._private.usage.usage_lib import record_library_usage
+
+        record_library_usage("util.ActorGroup")
+
+        if num_actors <= 0:
+            raise ValueError(
+                "The provided `num_actors` must be greater "
+                f"than 0. Received num_actors={num_actors} "
+                f"instead."
+            )
+        if num_cpus_per_actor < 0 or num_gpus_per_actor < 0:
+            raise ValueError(
+                "The number of CPUs and GPUs per actor must "
+                "not be negative. Received "
+                f"num_cpus_per_actor={num_cpus_per_actor} and "
+                f"num_gpus_per_actor={num_gpus_per_actor}."
+            )
+
+        self.actors = []
+
+        self.num_actors = num_actors
+
+        self.actor_config = ActorConfig(
+            num_cpus=num_cpus_per_actor,
+            num_gpus=num_gpus_per_actor,
+            resources=resources_per_actor,
+            init_args=init_args or (),
+            init_kwargs=init_kwargs or {},
+        )
+
+        self._remote_cls = ray.remote(
+            num_cpus=self.actor_config.num_cpus,
+            num_gpus=self.actor_config.num_gpus,
+            resources=self.actor_config.resources,
+        )(actor_cls)
+
+        self.start()
+
+    def __getattr__(self, item):
+        if len(self.actors) == 0:
+            raise RuntimeError(
+                "This ActorGroup has been shut down. Please start it again."
+            )
+        # Same implementation as actor.py
+        return ActorGroupMethod(self, item)
+
+    def __len__(self):
+        return len(self.actors)
+
+    def __getitem__(self, item):
+        return self.actors[item]
+
+    def start(self):
+        """Starts all the actors in this actor group."""
+        if self.actors and len(self.actors) > 0:
+            raise RuntimeError(
+                "The actors have already been started. "
+                "Please call `shutdown` first if you want to "
+                "restart them."
+            )
+
+        logger.debug(f"Starting {self.num_actors} actors.")
+        self.add_actors(self.num_actors)
+        logger.debug(f"{len(self.actors)} actors have successfully started.")
+
+    def shutdown(self, patience_s: float = 5):
+        """Shuts down all the actors in this actor group.
+
+        Args:
+            patience_s: Attempt a graceful shutdown
+                of the actors for this many seconds. Fall back to force kill
+                if graceful shutdown is not complete after this time. If
+                this is less than or equal to 0, immediately force kill all
+                actors.
+        """
+        logger.debug(f"Shutting down {len(self.actors)} actors.")
+        if patience_s <= 0:
+            for actor in self.actors:
+                ray.kill(actor.actor)
+        else:
+            done_refs = [w.actor.__ray_terminate__.remote() for w in self.actors]
+            # Wait for actors to die gracefully.
+            done, not_done = ray.wait(done_refs, timeout=patience_s)
+            if not_done:
+                logger.debug("Graceful termination failed. Falling back to force kill.")
+                # If the actors were not able to die gracefully, kill them.
+                for actor in self.actors:
+                    ray.kill(actor.actor)
+
+        logger.debug("Shutdown successful.")
+        self.actors = []
+
+    def remove_actors(self, actor_indexes: List[int]):
+        """Removes the actors with the specified indexes.
+
+        Args:
+            actor_indexes (List[int]): The indexes of the actors to remove.
+        """
+        new_actors = []
+        for i in range(len(self.actors)):
+            if i not in actor_indexes:
+                new_actors.append(self.actors[i])
+        self.actors = new_actors
+
+    def add_actors(self, num_actors: int):
+        """Adds ``num_actors`` to this ActorGroup.
+
+        Args:
+            num_actors: The number of actors to add.
+        """
+        new_actors = []
+        new_actor_metadata = []
+        for _ in range(num_actors):
+            actor = self._remote_cls.remote(
+                *self.actor_config.init_args, **self.actor_config.init_kwargs
+            )
+            new_actors.append(actor)
+            if hasattr(actor, "get_actor_metadata"):
+                new_actor_metadata.append(actor.get_actor_metadata.remote())
+
+        # Get metadata from all actors.
+        metadata = ray.get(new_actor_metadata)
+
+        if len(metadata) == 0:
+            metadata = [None] * len(new_actors)
+
+        for i in range(len(new_actors)):
+            self.actors.append(ActorWrapper(actor=new_actors[i], metadata=metadata[i]))
+
+    @property
+    def actor_metadata(self):
+        return [a.metadata for a in self.actors]
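A short usage sketch of the (deprecated) ActorGroup API documented above; the Doubler class is hypothetical:

    import ray
    from ray.util.actor_group import ActorGroup

    class Doubler:
        def double(self, v):
            return 2 * v

    # ActorGroup wraps the class with ray.remote() itself, so a plain class
    # is passed in. Each method call fans out to every actor and returns one
    # ObjectRef per actor.
    group = ActorGroup(actor_cls=Doubler, num_actors=2)
    print(ray.get(group.double.remote(4)))  # [8, 8]
    group.shutdown()
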
+ """ + new_actors = [] + for i in range(len(self.actors)): + if i not in actor_indexes: + new_actors.append(self.actors[i]) + self.actors = new_actors + + def add_actors(self, num_actors: int): + """Adds ``num_actors`` to this ActorGroup. + + Args: + num_actors: The number of actors to add. + """ + new_actors = [] + new_actor_metadata = [] + for _ in range(num_actors): + actor = self._remote_cls.remote( + *self.actor_config.init_args, **self.actor_config.init_kwargs + ) + new_actors.append(actor) + if hasattr(actor, "get_actor_metadata"): + new_actor_metadata.append(actor.get_actor_metadata.remote()) + + # Get metadata from all actors. + metadata = ray.get(new_actor_metadata) + + if len(metadata) == 0: + metadata = [None] * len(new_actors) + + for i in range(len(new_actors)): + self.actors.append(ActorWrapper(actor=new_actors[i], metadata=metadata[i])) + + @property + def actor_metadata(self): + return [a.metadata for a in self.actors] diff --git a/.venv/lib/python3.11/site-packages/ray/util/actor_pool.py b/.venv/lib/python3.11/site-packages/ray/util/actor_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..96eedfe29af1fc2a8289602ec2f37a722e967570 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/actor_pool.py @@ -0,0 +1,463 @@ +from typing import TYPE_CHECKING, Any, Callable, List, TypeVar + +import ray +from ray.util.annotations import DeveloperAPI + +if TYPE_CHECKING: + import ray.actor + +V = TypeVar("V") + + +@DeveloperAPI +class ActorPool: + """Utility class to operate on a fixed pool of actors. + + Arguments: + actors: List of Ray actor handles to use in this pool. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + print(list(pool.map(lambda a, v: a.double.remote(v), + [1, 2, 3, 4]))) + + .. testoutput:: + + [2, 4, 6, 8] + """ + + def __init__(self, actors: list): + from ray._private.usage.usage_lib import record_library_usage + + record_library_usage("util.ActorPool") + + # actors to be used + self._idle_actors = list(actors) + + # get actor from future + self._future_to_actor = {} + + # get future from index + self._index_to_future = {} + + # next task to do + self._next_task_index = 0 + + # next task to return + self._next_return_index = 0 + + # next work depending when actors free + self._pending_submits = [] + + def map(self, fn: Callable[["ray.actor.ActorHandle", V], Any], values: List[V]): + """Apply the given function in parallel over the actors and values. + + This returns an ordered iterator that will return results of the map + as they finish. Note that you must iterate over the iterator to force + the computation to finish. + + Arguments: + fn: Function that takes (actor, value) as argument and + returns an ObjectRef computing the result over the value. The + actor will be considered busy until the ObjectRef completes. + values: List of values that fn(actor, value) should be + applied to. + + Returns: + Iterator over results from applying fn to the actors and values. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + print(list(pool.map(lambda a, v: a.double.remote(v), + [1, 2, 3, 4]))) + + .. 
testoutput:: + + [2, 4, 6, 8] + """ + # Ignore/Cancel all the previous submissions + # by calling `has_next` and `get_next` repeatedly. + while self.has_next(): + try: + self.get_next(timeout=0, ignore_if_timedout=True) + except TimeoutError: + pass + + for v in values: + self.submit(fn, v) + + def get_generator(): + while self.has_next(): + yield self.get_next() + + return get_generator() + + def map_unordered( + self, fn: Callable[["ray.actor.ActorHandle", V], Any], values: List[V] + ): + """Similar to map(), but returning an unordered iterator. + + This returns an unordered iterator that will return results of the map + as they finish. This can be more efficient than map() if some results + take longer to compute than others. + + Arguments: + fn: Function that takes (actor, value) as argument and + returns an ObjectRef computing the result over the value. The + actor will be considered busy until the ObjectRef completes. + values: List of values that fn(actor, value) should be + applied to. + + Returns: + Iterator over results from applying fn to the actors and values. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + print(list(pool.map_unordered(lambda a, v: a.double.remote(v), + [1, 2, 3, 4]))) + + .. testoutput:: + :options: +MOCK + + [6, 8, 4, 2] + """ + # Ignore/Cancel all the previous submissions + # by calling `has_next` and `get_next_unordered` repeatedly. + while self.has_next(): + try: + self.get_next_unordered(timeout=0) + except TimeoutError: + pass + + for v in values: + self.submit(fn, v) + + def get_generator(): + while self.has_next(): + yield self.get_next_unordered() + + return get_generator() + + def submit(self, fn, value): + """Schedule a single task to run in the pool. + + This has the same argument semantics as map(), but takes a single + value instead of a list of values. The result can be retrieved using + get_next() / get_next_unordered(). + + Arguments: + fn: Function that takes (actor, value) as argument and + returns an ObjectRef computing the result over the value. The + actor will be considered busy until the ObjectRef completes. + value: Value to compute a result for. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + pool.submit(lambda a, v: a.double.remote(v), 1) + pool.submit(lambda a, v: a.double.remote(v), 2) + print(pool.get_next(), pool.get_next()) + + .. testoutput:: + + 2 4 + """ + if self._idle_actors: + actor = self._idle_actors.pop() + future = fn(actor, value) + future_key = tuple(future) if isinstance(future, list) else future + self._future_to_actor[future_key] = (self._next_task_index, actor) + self._index_to_future[self._next_task_index] = future + self._next_task_index += 1 + else: + self._pending_submits.append((fn, value)) + + def has_next(self): + """Returns whether there are any pending results to return. + + Returns: + True if there are any pending results not yet returned. + + Examples: + ..
testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + pool.submit(lambda a, v: a.double.remote(v), 1) + print(pool.has_next()) + print(pool.get_next()) + print(pool.has_next()) + + .. testoutput:: + + True + 2 + False + """ + return bool(self._future_to_actor) + + def get_next(self, timeout=None, ignore_if_timedout=False): + """Returns the next pending result in order. + + This returns the next result produced by submit(), blocking for up to + the specified timeout until it is available. + + Returns: + The next result. + + Raises: + TimeoutError: if the timeout is reached. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + pool.submit(lambda a, v: a.double.remote(v), 1) + print(pool.get_next()) + + .. testoutput:: + + 2 + """ + if not self.has_next(): + raise StopIteration("No more results to get") + if self._next_return_index >= self._next_task_index: + raise ValueError( + "It is not allowed to call get_next() after get_next_unordered()." + ) + future = self._index_to_future[self._next_return_index] + timeout_msg = "Timed out waiting for result" + raise_timeout_after_ignore = False + if timeout is not None: + res, _ = ray.wait([future], timeout=timeout) + if not res: + if not ignore_if_timedout: + raise TimeoutError(timeout_msg) + else: + raise_timeout_after_ignore = True + del self._index_to_future[self._next_return_index] + self._next_return_index += 1 + + future_key = tuple(future) if isinstance(future, list) else future + i, a = self._future_to_actor.pop(future_key) + + self._return_actor(a) + if raise_timeout_after_ignore: + raise TimeoutError( + timeout_msg + ". The task {} has been ignored.".format(future) + ) + return ray.get(future) + + def get_next_unordered(self, timeout=None, ignore_if_timedout=False): + """Returns any of the next pending results. + + This returns some result produced by submit(), blocking for up to + the specified timeout until it is available. Unlike get_next(), the + results are not always returned in same order as submitted, which can + improve performance. + + Returns: + The next result. + + Raises: + TimeoutError: if the timeout is reached. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1, a2]) + pool.submit(lambda a, v: a.double.remote(v), 1) + pool.submit(lambda a, v: a.double.remote(v), 2) + print(pool.get_next_unordered()) + print(pool.get_next_unordered()) + + .. 
testoutput:: + :options: +MOCK + + 4 + 2 + """ + if not self.has_next(): + raise StopIteration("No more results to get") + # TODO(ekl) bulk wait for performance + res, _ = ray.wait(list(self._future_to_actor), num_returns=1, timeout=timeout) + timeout_msg = "Timed out waiting for result" + raise_timeout_after_ignore = False + if res: + [future] = res + else: + if not ignore_if_timedout: + raise TimeoutError(timeout_msg) + else: + raise_timeout_after_ignore = True + i, a = self._future_to_actor.pop(future) + self._return_actor(a) + del self._index_to_future[i] + self._next_return_index = max(self._next_return_index, i + 1) + if raise_timeout_after_ignore: + raise TimeoutError( + timeout_msg + ". The task {} has been ignored.".format(future) + ) + return ray.get(future) + + def _return_actor(self, actor): + self._idle_actors.append(actor) + if self._pending_submits: + self.submit(*self._pending_submits.pop(0)) + + def has_free(self): + """Returns whether there are any idle actors available. + + Returns: + True if there are any idle actors and no pending submits. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1 = Actor.remote() + pool = ActorPool([a1]) + pool.submit(lambda a, v: a.double.remote(v), 1) + print(pool.has_free()) + print(pool.get_next()) + print(pool.has_free()) + + .. testoutput:: + + False + 2 + True + """ + return len(self._idle_actors) > 0 and len(self._pending_submits) == 0 + + def pop_idle(self): + """Removes an idle actor from the pool. + + Returns: + An idle actor if one is available. + None if no actor was free to be removed. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1 = Actor.remote() + pool = ActorPool([a1]) + pool.submit(lambda a, v: a.double.remote(v), 1) + assert pool.pop_idle() is None + assert pool.get_next() == 2 + assert pool.pop_idle() == a1 + + """ + if self.has_free(): + return self._idle_actors.pop() + return None + + def push(self, actor): + """Pushes a new actor into the current list of idle actors. + + Examples: + .. testcode:: + + import ray + from ray.util.actor_pool import ActorPool + + @ray.remote + class Actor: + def double(self, v): + return 2 * v + + a1, a2 = Actor.remote(), Actor.remote() + pool = ActorPool([a1]) + pool.push(a2) + """ + busy_actors = [] + if self._future_to_actor.values(): + _, busy_actors = zip(*self._future_to_actor.values()) + if actor in self._idle_actors or actor in busy_actors: + raise ValueError("Actor already belongs to current ActorPool") + else: + self._return_actor(actor) diff --git a/.venv/lib/python3.11/site-packages/ray/util/check_open_ports.py b/.venv/lib/python3.11/site-packages/ray/util/check_open_ports.py new file mode 100644 index 0000000000000000000000000000000000000000..29c9e03e47405dd85b5cf281e479fad684baea1e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/check_open_ports.py @@ -0,0 +1,179 @@ +"""A CLI utility for check open ports in the Ray cluster. + +See https://www.anyscale.com/blog/update-on-ray-cve-2023-48022-new-verification-tooling-available # noqa: E501 +for more details. 
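# A sketch (not part of the patch) of how submit()/get_next_unordered() combine
# with push()/pop_idle() above to resize an ActorPool at runtime. Assumes a
# running Ray cluster; the `Squarer` actor is hypothetical.
import ray
from ray.util.actor_pool import ActorPool

@ray.remote
class Squarer:
    def square(self, x):
        return x * x

ray.init()
pool = ActorPool([Squarer.remote()])
for i in range(4):
    pool.submit(lambda a, v: a.square.remote(v), i)  # excess submits queue up

pool.push(Squarer.remote())  # add capacity; a pending submit drains onto it
results = sorted(pool.get_next_unordered() for _ in range(4))
assert results == [0, 1, 4, 9]

idle = pool.pop_idle()  # reclaim an actor once nothing is pending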
+""" +from typing import List, Tuple +import subprocess +import click +import psutil +import urllib +import json + +import ray +from ray.util.annotations import PublicAPI +from ray.autoscaler._private.cli_logger import add_click_logging_options, cli_logger +from ray.autoscaler._private.constants import RAY_PROCESSES +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + + +def _get_ray_ports() -> List[int]: + unique_ports = set() + + process_infos = [] + for proc in psutil.process_iter(["name", "cmdline"]): + try: + process_infos.append((proc, proc.name(), proc.cmdline())) + except psutil.Error: + pass + + for keyword, filter_by_cmd in RAY_PROCESSES: + for candidate in process_infos: + proc, proc_cmd, proc_args = candidate + corpus = proc_cmd if filter_by_cmd else subprocess.list2cmdline(proc_args) + if keyword in corpus: + try: + for connection in proc.connections(): + if connection.status == psutil.CONN_LISTEN: + unique_ports.add(connection.laddr.port) + except psutil.AccessDenied: + cli_logger.info( + "Access denied to process connections for process," + " worker process probably restarted", + proc, + ) + + return sorted(unique_ports) + + +def _check_for_open_ports_from_internet( + service_url: str, ports: List[int] +) -> Tuple[List[int], List[int]]: + request = urllib.request.Request( + method="POST", + url=service_url, + headers={ + "Content-Type": "application/json", + "X-Ray-Open-Port-Check": "1", + }, + data=json.dumps({"ports": ports}).encode("utf-8"), + ) + + response = urllib.request.urlopen(request) + if response.status != 200: + raise RuntimeError( + f"Failed to check with Ray Open Port Service: {response.status}" + ) + response_body = json.load(response) + + publicly_open_ports = response_body.get("open_ports", []) + checked_ports = response_body.get("checked_ports", []) + + return publicly_open_ports, checked_ports + + +def _check_if_exposed_to_internet( + service_url: str, +) -> Tuple[List[int], List[int]]: + return _check_for_open_ports_from_internet(service_url, _get_ray_ports()) + + +def _check_ray_cluster( + service_url: str, +) -> List[Tuple[str, Tuple[List[int], List[int]]]]: + ray.init(ignore_reinit_error=True) + + @ray.remote(num_cpus=0) + def check(node_id, service_url): + return node_id, _check_if_exposed_to_internet(service_url) + + ray_node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"]] + cli_logger.info( + f"Cluster has {len(ray_node_ids)} node(s)." + " Scheduling tasks on each to check for exposed ports", + ) + + per_node_tasks = { + node_id: ( + check.options( + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=node_id, soft=False + ) + ).remote(node_id, service_url) + ) + for node_id in ray_node_ids + } + + results = [] + for node_id, per_node_task in per_node_tasks.items(): + try: + results.append(ray.get(per_node_task)) + except Exception as e: + cli_logger.info(f"Failed to check on node {node_id}: {e}") + + return results + + +@click.command() +@click.option( + "--yes", "-y", is_flag=True, default=False, help="Don't ask for confirmation." 
+) +@click.option( + "--service-url", + required=False, + type=str, + default="https://ray-open-port-checker.uc.r.appspot.com/open-port-check", + help="The URL of the service that checks whether submitted ports are open.", +) +@add_click_logging_options +@PublicAPI +def check_open_ports(yes, service_url): + """Check open ports in the local Ray cluster.""" + if not cli_logger.confirm( + yes=yes, + msg=( + "Do you want to check the local Ray cluster" + " for any nodes with ports accessible to the internet?" + ), + _default=True, + ): + cli_logger.info("Exiting without checking as instructed") + return + + cluster_open_ports = _check_ray_cluster(service_url) + + public_nodes = [] + for node_id, (open_ports, checked_ports) in cluster_open_ports: + if open_ports: + cli_logger.info( + f"[🛑] open ports detected open_ports={open_ports!r} node={node_id!r}" + ) + public_nodes.append((node_id, open_ports, checked_ports)) + else: + cli_logger.info( + f"[🟢] No open ports detected " + f"checked_ports={checked_ports!r} node={node_id!r}" + ) + + cli_logger.info("Check complete, results:") + + if public_nodes: + cli_logger.info( + """ +[🛑] A server on the internet was able to open a connection to one of this Ray +cluster's public IPs on one of Ray's internal ports. If this is not a false +positive, this is an extremely unsafe configuration for Ray to be running in. +Ray is not meant to be exposed to untrusted clients and will allow them to run +arbitrary code on your machine. + +You should take immediate action to validate this result and, if confirmed, +shut down your Ray cluster immediately and take appropriate action to remediate +its exposure. Anything either running on this Ray cluster or that this cluster +has had access to could be at risk. + +For guidance on how to operate Ray safely, please review [Ray's security +documentation](https://docs.ray.io/en/latest/ray-security/index.html). +""".strip() + ) + else: + cli_logger.info("[🟢] No open ports detected from any Ray nodes") diff --git a/.venv/lib/python3.11/site-packages/ray/util/check_serialize.py b/.venv/lib/python3.11/site-packages/ray/util/check_serialize.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a8377b3a77242fbadabb9684cbf5cb90208875 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/check_serialize.py @@ -0,0 +1,265 @@ +"""A utility for debugging serialization issues.""" +import inspect +from contextlib import contextmanager +from typing import Any, Optional, Set, Tuple + +# Import ray first to use the bundled colorama +import ray # noqa: F401 +import colorama +import ray.cloudpickle as cp +from ray.util.annotations import DeveloperAPI + + +@contextmanager +def _indent(printer): + printer.level += 1 + yield + printer.level -= 1 + + +class _Printer: + def __init__(self, print_file): + self.level = 0 + self.print_file = print_file + + def indent(self): + return _indent(self) + + def print(self, msg): + indent = " " * self.level + print(indent + msg, file=self.print_file) + + +@DeveloperAPI +class FailureTuple: + """Represents the serialization 'frame'. + + Attributes: + obj: The object that fails serialization. + name: The variable name of the object. + parent: The object that references the `obj`.
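# A standalone sketch of the scan that _get_ray_ports performs in
# check_open_ports.py above: enumerate the TCP ports this machine is listening
# on via psutil. The filtering down to Ray processes (RAY_PROCESSES) is omitted
# here, so this lists every local listener.
import psutil

def listening_ports() -> list:
    ports = set()
    for proc in psutil.process_iter():
        try:
            for conn in proc.connections():
                if conn.status == psutil.CONN_LISTEN:
                    ports.add(conn.laddr.port)
        except psutil.Error:
            pass  # process exited or access was denied; skip it
    return sorted(ports)

print(listening_ports())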
+ """ + + def __init__(self, obj: Any, name: str, parent: Any): + self.obj = obj + self.name = name + self.parent = parent + + def __repr__(self): + return f"FailTuple({self.name} [obj={self.obj}, parent={self.parent}])" + + +def _inspect_func_serialization(base_obj, depth, parent, failure_set, printer): + """Adds the first-found non-serializable element to the failure_set.""" + assert inspect.isfunction(base_obj) + closure = inspect.getclosurevars(base_obj) + found = False + if closure.globals: + printer.print( + f"Detected {len(closure.globals)} global variables. " + "Checking serializability..." + ) + + with printer.indent(): + for name, obj in closure.globals.items(): + serializable, _ = _inspect_serializability( + obj, + name=name, + depth=depth - 1, + parent=parent, + failure_set=failure_set, + printer=printer, + ) + found = found or not serializable + if found: + break + + if closure.nonlocals: + printer.print( + f"Detected {len(closure.nonlocals)} nonlocal variables. " + "Checking serializability..." + ) + with printer.indent(): + for name, obj in closure.nonlocals.items(): + serializable, _ = _inspect_serializability( + obj, + name=name, + depth=depth - 1, + parent=parent, + failure_set=failure_set, + printer=printer, + ) + found = found or not serializable + if found: + break + if not found: + printer.print( + f"WARNING: Did not find non-serializable object in {base_obj}. " + "This may be an oversight." + ) + return found + + +def _inspect_generic_serialization(base_obj, depth, parent, failure_set, printer): + """Adds the first-found non-serializable element to the failure_set.""" + assert not inspect.isfunction(base_obj) + functions = inspect.getmembers(base_obj, predicate=inspect.isfunction) + found = False + with printer.indent(): + for name, obj in functions: + serializable, _ = _inspect_serializability( + obj, + name=name, + depth=depth - 1, + parent=parent, + failure_set=failure_set, + printer=printer, + ) + found = found or not serializable + if found: + break + + with printer.indent(): + members = inspect.getmembers(base_obj) + for name, obj in members: + if name.startswith("__") and name.endswith("__") or inspect.isbuiltin(obj): + continue + serializable, _ = _inspect_serializability( + obj, + name=name, + depth=depth - 1, + parent=parent, + failure_set=failure_set, + printer=printer, + ) + found = found or not serializable + if found: + break + if not found: + printer.print( + f"WARNING: Did not find non-serializable object in {base_obj}. " + "This may be an oversight." + ) + return found + + +@DeveloperAPI +def inspect_serializability( + base_obj: Any, + name: Optional[str] = None, + depth: int = 3, + print_file: Optional[Any] = None, +) -> Tuple[bool, Set[FailureTuple]]: + """Identifies what objects are preventing serialization. + + Args: + base_obj: Object to be serialized. + name: Optional name of string. + depth: Depth of the scope stack to walk through. Defaults to 3. + print_file: file argument that will be passed to print(). + + Returns: + bool: True if serializable. + set[FailureTuple]: Set of unserializable objects. + + .. 
versionadded:: 1.1.0 + + """ + printer = _Printer(print_file) + return _inspect_serializability(base_obj, name, depth, None, None, printer) + + +def _inspect_serializability( + base_obj, name, depth, parent, failure_set, printer +) -> Tuple[bool, Set[FailureTuple]]: + colorama.init() + top_level = False + declaration = "" + found = False + if failure_set is None: + top_level = True + failure_set = set() + declaration = f"Checking Serializability of {base_obj}" + printer.print("=" * min(len(declaration), 80)) + printer.print(declaration) + printer.print("=" * min(len(declaration), 80)) + + if name is None: + name = str(base_obj) + else: + printer.print(f"Serializing '{name}' {base_obj}...") + try: + cp.dumps(base_obj) + return True, failure_set + except Exception as e: + printer.print( + f"{colorama.Fore.RED}!!! FAIL{colorama.Fore.RESET} " f"serialization: {e}" + ) + found = True + try: + if depth == 0: + failure_set.add(FailureTuple(base_obj, name, parent)) + # Some objects may not be hashable, so we skip adding this to the set. + except Exception: + pass + + if depth <= 0: + return False, failure_set + + # TODO: we only differentiate between 'function' and 'object' + # but we should do a better job of diving into something + # more specific like a Type, Object, etc. + if inspect.isfunction(base_obj): + _inspect_func_serialization( + base_obj, + depth=depth, + parent=base_obj, + failure_set=failure_set, + printer=printer, + ) + else: + _inspect_generic_serialization( + base_obj, + depth=depth, + parent=base_obj, + failure_set=failure_set, + printer=printer, + ) + + if not failure_set: + failure_set.add(FailureTuple(base_obj, name, parent)) + + if top_level: + printer.print("=" * min(len(declaration), 80)) + if not failure_set: + printer.print( + "Nothing failed the inspect_serialization test, though " + "serialization did not succeed." + ) + else: + fail_vars = ( + f"\n\n\t{colorama.Style.BRIGHT}" + + "\n".join(str(k) for k in failure_set) + + f"{colorama.Style.RESET_ALL}\n\n" + ) + printer.print( + f"Variable: {fail_vars}was found to be non-serializable. " + "There may be multiple other undetected variables that were " + "non-serializable. " + ) + printer.print( + "Consider either removing the " + "instantiation/imports of these variables or moving the " + "instantiation into the scope of the function/class. " + ) + printer.print("=" * min(len(declaration), 80)) + printer.print( + "Check https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting for more information." 
# noqa + ) + printer.print( + "If you have any suggestions on how to improve " + "this error message, please reach out to the " + "Ray developers on github.com/ray-project/ray/issues/" + ) + printer.print("=" * min(len(declaration), 80)) + return not found, failure_set diff --git a/.venv/lib/python3.11/site-packages/ray/util/client_connect.py b/.venv/lib/python3.11/site-packages/ray/util/client_connect.py new file mode 100644 index 0000000000000000000000000000000000000000..c88b86457b0ac0613a1a9c7313041e9a6dca0667 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/client_connect.py @@ -0,0 +1,76 @@ +from typing import Any, Dict, List, Optional, Tuple +import logging + +from ray._private.client_mode_hook import ( + _explicitly_enable_client_mode, + _set_client_hook_status, +) +from ray.job_config import JobConfig +from ray.util.annotations import Deprecated +from ray.util.client import ray +from ray._private.utils import get_ray_doc_version + +logger = logging.getLogger(__name__) + + +@Deprecated( + message="Use ray.init(ray://<head_node_host>:<port>) " + "instead. See detailed usage at {}.".format( + f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-init" # noqa: E501 + ) +) +def connect( + conn_str: str, + secure: bool = False, + metadata: List[Tuple[str, str]] = None, + connection_retries: int = 3, + job_config: JobConfig = None, + namespace: str = None, + *, + ignore_version: bool = False, + _credentials: Optional["grpc.ChannelCredentials"] = None, # noqa: F821 + ray_init_kwargs: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + if ray.is_connected(): + ignore_reinit_error = ray_init_kwargs.get("ignore_reinit_error", False) + if ignore_reinit_error: + logger.info( + "Calling ray.init() again after it has already been called. " + "Reusing the existing Ray client connection." + ) + return ray.get_context().client_worker.connection_info() + raise RuntimeError( + "Ray Client is already connected. Maybe you called " + 'ray.init("ray://
") twice by accident?' + ) + + # Enable the same hooks that RAY_CLIENT_MODE does, as calling + # ray.init("ray://
") is specifically for using client mode. + _set_client_hook_status(True) + _explicitly_enable_client_mode() + + # TODO(barakmich): https://github.com/ray-project/ray/issues/13274 + # for supporting things like cert_path, ca_path, etc and creating + # the correct metadata + conn = ray.connect( + conn_str, + job_config=job_config, + secure=secure, + metadata=metadata, + connection_retries=connection_retries, + namespace=namespace, + ignore_version=ignore_version, + _credentials=_credentials, + ray_init_kwargs=ray_init_kwargs, + ) + return conn + + +@Deprecated( + message="Use ray.shutdown() instead. See detailed usage at {}.".format( + f"https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/package-ref.html#ray-shutdown" # noqa: E501 + ) +) +def disconnect(): + """Disconnects from server; is idempotent.""" + return ray.disconnect() diff --git a/.venv/lib/python3.11/site-packages/ray/util/dask/scheduler_utils.py b/.venv/lib/python3.11/site-packages/ray/util/dask/scheduler_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..efb7b18bd9111f163ad23936253964b625ad8765 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/dask/scheduler_utils.py @@ -0,0 +1,371 @@ +""" +The following is adapted from Dask release 2021.03.1: + https://github.com/dask/dask/blob/2021.03.1/dask/local.py +""" + +import os +from queue import Queue, Empty + +from dask import config +from dask.callbacks import local_callbacks, unpack_callbacks +from dask.core import _execute_task, flatten, get_dependencies, has_tasks, reverse_dict +from dask.order import order + +if os.name == "nt": + # Python 3 windows Queue.get doesn't handle interrupts properly. To + # workaround this we poll at a sufficiently large interval that it + # shouldn't affect performance, but small enough that users trying to kill + # an application shouldn't care. 
+ def queue_get(q): + while True: + try: + return q.get(block=True, timeout=0.1) + except Empty: + pass + +else: + + def queue_get(q): + return q.get() + + +def start_state_from_dask(dsk, cache=None, sortkey=None): + """Start state from a dask + Examples + -------- + >>> dsk = { + 'x': 1, + 'y': 2, + 'z': (inc, 'x'), + 'w': (add, 'z', 'y')} # doctest: +SKIP + >>> from pprint import pprint # doctest: +SKIP + >>> pprint(start_state_from_dask(dsk)) # doctest: +SKIP + {'cache': {'x': 1, 'y': 2}, + 'dependencies': {'w': {'z', 'y'}, 'x': set(), 'y': set(), 'z': {'x'}}, + 'dependents': {'w': set(), 'x': {'z'}, 'y': {'w'}, 'z': {'w'}}, + 'finished': set(), + 'ready': ['z'], + 'released': set(), + 'running': set(), + 'waiting': {'w': {'z'}}, + 'waiting_data': {'x': {'z'}, 'y': {'w'}, 'z': {'w'}}} + """ + if sortkey is None: + sortkey = order(dsk).get + if cache is None: + cache = config.get("cache", None) + if cache is None: + cache = dict() + data_keys = set() + for k, v in dsk.items(): + if not has_tasks(dsk, v): + cache[k] = v + data_keys.add(k) + + dsk2 = dsk.copy() + dsk2.update(cache) + + dependencies = {k: get_dependencies(dsk2, k) for k in dsk} + waiting = {k: v.copy() for k, v in dependencies.items() if k not in data_keys} + + dependents = reverse_dict(dependencies) + for a in cache: + for b in dependents.get(a, ()): + waiting[b].remove(a) + waiting_data = {k: v.copy() for k, v in dependents.items() if v} + + ready_set = {k for k, v in waiting.items() if not v} + ready = sorted(ready_set, key=sortkey, reverse=True) + waiting = {k: v for k, v in waiting.items() if v} + + state = { + "dependencies": dependencies, + "dependents": dependents, + "waiting": waiting, + "waiting_data": waiting_data, + "cache": cache, + "ready": ready, + "running": set(), + "finished": set(), + "released": set(), + } + + return state + + +def execute_task(key, task_info, dumps, loads, get_id, pack_exception): + """ + Compute task and handle all administration + See Also + -------- + _execute_task : actually execute task + """ + try: + task, data = loads(task_info) + result = _execute_task(task, data) + id = get_id() + result = dumps((result, id)) + failed = False + except BaseException as e: + result = pack_exception(e, dumps) + failed = True + return key, result, failed + + +def release_data(key, state, delete=True): + """Remove data from temporary storage + See Also + -------- + finish_task + """ + if key in state["waiting_data"]: + assert not state["waiting_data"][key] + del state["waiting_data"][key] + + state["released"].add(key) + + if delete: + del state["cache"][key] + + +DEBUG = False + + +def finish_task( + dsk, key, state, results, sortkey, delete=True, release_data=release_data +): + """ + Update execution state after a task finishes + Mutates. This should run atomically (with a lock). 
+ """ + for dep in sorted(state["dependents"][key], key=sortkey, reverse=True): + s = state["waiting"][dep] + s.remove(key) + if not s: + del state["waiting"][dep] + state["ready"].append(dep) + + for dep in state["dependencies"][key]: + if dep in state["waiting_data"]: + s = state["waiting_data"][dep] + s.remove(key) + if not s and dep not in results: + if DEBUG: + from chest.core import nbytes + + print( + "Key: %s\tDep: %s\t NBytes: %.2f\t Release" + % (key, dep, sum(map(nbytes, state["cache"].values()) / 1e6)) + ) + release_data(dep, state, delete=delete) + elif delete and dep not in results: + release_data(dep, state, delete=delete) + + state["finished"].add(key) + state["running"].remove(key) + + return state + + +def nested_get(ind, coll): + """Get nested index from collection + Examples + -------- + >>> nested_get(1, 'abc') + 'b' + >>> nested_get([1, 0], 'abc') + ('b', 'a') + >>> nested_get([[1, 0], [0, 1]], 'abc') + (('b', 'a'), ('a', 'b')) + """ + if isinstance(ind, list): + return tuple(nested_get(i, coll) for i in ind) + else: + return coll[ind] + + +def default_get_id(): + """Default get_id""" + return None + + +def default_pack_exception(e, dumps): + raise + + +def reraise(exc, tb=None): + if exc.__traceback__ is not tb: + raise exc.with_traceback(tb) + raise exc + + +def identity(x): + """Identity function. Returns x. + >>> identity(3) + 3 + """ + return x + + +def get_async( + apply_async, + num_workers, + dsk, + result, + cache=None, + get_id=default_get_id, + rerun_exceptions_locally=None, + pack_exception=default_pack_exception, + raise_exception=reraise, + callbacks=None, + dumps=identity, + loads=identity, + **kwargs +): + """Asynchronous get function + This is a general version of various asynchronous schedulers for dask. It + takes a an apply_async function as found on Pool objects to form a more + specific ``get`` method that walks through the dask array with parallel + workers, avoiding repeat computation and minimizing memory use. + Parameters + ---------- + apply_async : function + Asynchronous apply function as found on Pool or ThreadPool + num_workers : int + The number of active tasks we should have at any one time + dsk : dict + A dask dictionary specifying a workflow + result : key or list of keys + Keys corresponding to desired data + cache : dict-like, optional + Temporary storage of results + get_id : callable, optional + Function to return the worker id, takes no arguments. Examples are + `threading.current_thread` and `multiprocessing.current_process`. + rerun_exceptions_locally : bool, optional + Whether to rerun failing tasks in local process to enable debugging + (False by default) + pack_exception : callable, optional + Function to take an exception and ``dumps`` method, and return a + serialized tuple of ``(exception, traceback)`` to send back to the + scheduler. Default is to just raise the exception. + raise_exception : callable, optional + Function that takes an exception and a traceback, and raises an error. + dumps: callable, optional + Function to serialize task data and results to communicate between + worker and parent. Defaults to identity. + loads: callable, optional + Inverse function of `dumps`. Defaults to identity. + callbacks : tuple or list of tuples, optional + Callbacks are passed in as tuples of length 5. Multiple sets of + callbacks may be passed in as a list of tuples. For more information, + see the dask.diagnostics documentation. 
+ See Also + -------- + threaded.get + """ + queue = Queue() + + if isinstance(result, list): + result_flat = set(flatten(result)) + else: + result_flat = {result} + results = set(result_flat) + + dsk = dict(dsk) + with local_callbacks(callbacks) as callbacks: + _, _, pretask_cbs, posttask_cbs, _ = unpack_callbacks(callbacks) + started_cbs = [] + succeeded = False + # if start_state_from_dask fails, we will have something + # to pass to the final block. + state = {} + try: + for cb in callbacks: + if cb[0]: + cb[0](dsk) + started_cbs.append(cb) + + keyorder = order(dsk) + + state = start_state_from_dask(dsk, cache=cache, sortkey=keyorder.get) + + for _, start_state, _, _, _ in callbacks: + if start_state: + start_state(dsk, state) + + if rerun_exceptions_locally is None: + rerun_exceptions_locally = config.get("rerun_exceptions_locally", False) + + if state["waiting"] and not state["ready"]: + raise ValueError("Found no accessible jobs in dask") + + def fire_task(): + """Fire off a task to the thread pool""" + # Choose a good task to compute + key = state["ready"].pop() + state["running"].add(key) + for f in pretask_cbs: + f(key, dsk, state) + + # Prep data to send + data = {dep: state["cache"][dep] for dep in get_dependencies(dsk, key)} + # Submit + apply_async( + execute_task, + args=( + key, + dumps((dsk[key], data)), + dumps, + loads, + get_id, + pack_exception, + ), + callback=queue.put, + ) + + # Seed initial tasks into the thread pool + while state["ready"] and len(state["running"]) < num_workers: + fire_task() + + # Main loop, wait on tasks to finish, insert new ones + while state["waiting"] or state["ready"] or state["running"]: + key, res_info, failed = queue_get(queue) + if failed: + exc, tb = loads(res_info) + if rerun_exceptions_locally: + data = { + dep: state["cache"][dep] + for dep in get_dependencies(dsk, key) + } + task = dsk[key] + _execute_task(task, data) # Re-execute locally + else: + raise_exception(exc, tb) + res, worker_id = loads(res_info) + state["cache"][key] = res + finish_task(dsk, key, state, results, keyorder.get) + for f in posttask_cbs: + f(key, res, dsk, state, worker_id) + + while state["ready"] and len(state["running"]) < num_workers: + fire_task() + + succeeded = True + + finally: + for _, _, _, _, finish in started_cbs: + if finish: + finish(dsk, state, not succeeded) + + return nested_get(result, state["cache"]) + + +def apply_sync(func, args=(), kwds=None, callback=None): + """A naive synchronous version of apply_async""" + if kwds is None: + kwds = {} + + res = func(*args, **kwds) + if callback is not None: + callback(res) diff --git a/.venv/lib/python3.11/site-packages/ray/util/debug.py b/.venv/lib/python3.11/site-packages/ray/util/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..e5482c7b6d8c73095fd09e0f3f5b4583549d2151 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/debug.py @@ -0,0 +1,274 @@ +from collections import defaultdict, namedtuple +import gc +import os +import re +import time +import tracemalloc +from typing import Callable, List, Optional +from ray.util.annotations import DeveloperAPI + +_logged = set() +_disabled = False +_periodic_log = False +_last_logged = 0.0 + + +@DeveloperAPI +def log_once(key): + """Returns True if this is the "first" call for a given key. + + Various logging settings can adjust the definition of "first". + + Example: + + .. 
testcode:: + + import logging + from ray.util.debug import log_once + + logger = logging.getLogger(__name__) + if log_once("some_key"): + logger.info("Some verbose logging statement") + """ + + global _last_logged + + if _disabled: + return False + elif key not in _logged: + _logged.add(key) + _last_logged = time.time() + return True + elif _periodic_log and time.time() - _last_logged > 60.0: + _logged.clear() + _last_logged = time.time() + return False + else: + return False + + +@DeveloperAPI +def disable_log_once_globally(): + """Make log_once() return False in this process.""" + + global _disabled + _disabled = True + + +@DeveloperAPI +def enable_periodic_logging(): + """Make log_once() periodically return True in this process.""" + + global _periodic_log + _periodic_log = True + + +@DeveloperAPI +def reset_log_once(key: Optional[str] = None): + """Resets log_once for the provided key. + + If you don't provide a key, resets log_once for all keys. + """ + if key is None: + _logged.clear() + else: + _logged.discard(key) + + +# A suspicious memory-allocating stack-trace that we should re-test +# to make sure it's not a false positive. +Suspect = DeveloperAPI( + namedtuple( + "Suspect", + [ + # The stack trace of the allocation, going back n frames, depending + # on the tracemalloc.start(n) call. + "traceback", + # The amount of memory taken by this particular stack trace + # over the course of the experiment. + "memory_increase", + # The slope of the scipy linear regression (x=iteration; y=memory size). + "slope", + # The rvalue of the scipy linear regression. + "rvalue", + # The memory size history (list of all memory sizes over all iterations). + "hist", + ], + ) +) + + +def _test_some_code_for_memory_leaks( + desc: str, + init: Optional[Callable[[], None]], + code: Callable[[], None], + repeats: int, + max_num_trials: int = 1, +) -> List[Suspect]: + """Runs given code (and init code) n times and checks for memory leaks. + + Args: + desc: A descriptor of the test. + init: Optional code to be executed initially. + code: The actual code to be checked for producing memory leaks. + repeats: How many times to repeatedly execute `code`. + max_num_trials: The maximum number of trials to run. A new trial is only + run, if the previous one produced a memory leak. For all non-1st trials, + `repeats` calculates as: actual_repeats = `repeats` * (trial + 1), where + the first trial is 0. + + Returns: + A list of Suspect objects, describing possible memory leaks. If list + is empty, no leaks have been found. + """ + + def _i_print(i): + if (i + 1) % 10 == 0: + print(".", end="" if (i + 1) % 100 else f" {i + 1}\n", flush=True) + + # Do n trials to make sure a found leak is really one. + suspicious = set() + suspicious_stats = [] + for trial in range(max_num_trials): + # Store up to n frames of each call stack. + tracemalloc.start(20) + + table = defaultdict(list) + + # Repeat running code for n times. + # Increase repeat value with each trial to make sure stats are more + # solid each time (avoiding false positives). + actual_repeats = repeats * (trial + 1) + + print(f"{desc} {actual_repeats} times.") + + # Initialize if necessary. + if init is not None: + init() + # Run `code` n times, each time taking a memory snapshot. + for i in range(actual_repeats): + _i_print(i) + # Manually trigger garbage collection before and after code runs in order to + # make tracemalloc snapshots as accurate as possible. 
+ gc.collect() + code() + gc.collect() + _take_snapshot(table, suspicious) + print("\n") + + # Check, which traces have moved up in their memory consumption + # constantly over time. + suspicious.clear() + suspicious_stats.clear() + # Suspicious memory allocation found? + suspects = _find_memory_leaks_in_table(table) + for suspect in sorted(suspects, key=lambda s: s.memory_increase, reverse=True): + # Only print out the biggest offender: + if len(suspicious) == 0: + _pprint_suspect(suspect) + print("-> added to retry list") + suspicious.add(suspect.traceback) + suspicious_stats.append(suspect) + + tracemalloc.stop() + + # Some suspicious memory allocations found. + if len(suspicious) > 0: + print(f"{len(suspicious)} suspects found. Top-ten:") + for i, s in enumerate(suspicious_stats): + if i > 10: + break + print( + f"{i}) line={s.traceback[-1]} mem-increase={s.memory_increase}B " + f"slope={s.slope}B/detection rval={s.rvalue}" + ) + # Nothing suspicious found -> Exit trial loop and return. + else: + print("No remaining suspects found -> returning") + break + + # Print out final top offender. + if len(suspicious_stats) > 0: + _pprint_suspect(suspicious_stats[0]) + + return suspicious_stats + + +def _take_snapshot(table, suspicious=None): + # Take a memory snapshot. + snapshot = tracemalloc.take_snapshot() + # Group all memory allocations by their stacktrace (going n frames + # deep as defined above in tracemalloc.start(n)). + # Then sort groups by size, then count, then trace. + top_stats = snapshot.statistics("traceback") + + # For the first m largest increases, keep only, if a) first trial or b) those + # that are already in the `suspicious` set. + for stat in top_stats[:100]: + if not suspicious or stat.traceback in suspicious: + table[stat.traceback].append(stat.size) + + +def _find_memory_leaks_in_table(table): + import scipy.stats + import numpy as np + + suspects = [] + + for traceback, hist in table.items(): + # Do a quick mem increase check. + memory_increase = hist[-1] - hist[0] + + # Only if memory increased, do we check further. + if memory_increase <= 0.0: + continue + + # Ignore this very module here (we are collecting lots of data + # so an increase is expected). + top_stack = str(traceback[-1]) + drive_separator = "\\\\" if os.name == "nt" else "/" + if any( + s in top_stack + for s in [ + "tracemalloc", + "pycharm", + "thirdparty_files/psutil", + re.sub("\\.", drive_separator, __name__) + ".py", + ] + ): + continue + + # Do a linear regression to get the slope and R-value. + line = scipy.stats.linregress(x=np.arange(len(hist)), y=np.array(hist)) + + # - If weak positive slope and some confidence and + # increase > n bytes -> error. + # - If stronger positive slope -> error. 
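# Put differently: flag a leak only when total growth exceeds ~1 KB AND the
# regression fit is convincing. `slope` is bytes gained per snapshot and
# `rvalue` is the confidence of the linear fit; steeper slopes are accepted
# at lower confidence, while shallow slopes require a near-perfect fit.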
+ if memory_increase > 1000 and ( + (line.slope > 60.0 and line.rvalue > 0.875) + or (line.slope > 20.0 and line.rvalue > 0.9) + or (line.slope > 10.0 and line.rvalue > 0.95) + ): + suspects.append( + Suspect( + traceback=traceback, + memory_increase=memory_increase, + slope=line.slope, + rvalue=line.rvalue, + hist=hist, + ) + ) + + return suspects + + +def _pprint_suspect(suspect): + print( + "Most suspicious memory allocation in traceback " + "(only printing out this one, but all (less suspicious)" + " suspects will be investigated as well):" + ) + print("\n".join(suspect.traceback.format())) + print(f"Increase total={suspect.memory_increase}B") + print(f"Slope={suspect.slope} B/detection") + print(f"Rval={suspect.rvalue}") diff --git a/.venv/lib/python3.11/site-packages/ray/util/debugpy.py b/.venv/lib/python3.11/site-packages/ray/util/debugpy.py new file mode 100644 index 0000000000000000000000000000000000000000..32b265d1d45161cccaaaf996e7dd80a791e4ce58 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/debugpy.py @@ -0,0 +1,136 @@ +import logging +import os +import sys +import threading +import importlib + +import ray +from ray.util.annotations import DeveloperAPI + +log = logging.getLogger(__name__) + +POST_MORTEM_ERROR_UUID = "post_mortem_error_uuid" + + +def _try_import_debugpy(): + try: + debugpy = importlib.import_module("debugpy") + if not hasattr(debugpy, "__version__") or debugpy.__version__ < "1.8.0": + raise ImportError() + return debugpy + except (ModuleNotFoundError, ImportError): + log.error( + "Module 'debugpy>=1.8.0' cannot be loaded. " + "Ray Debugpy Debugger will not work without 'debugpy>=1.8.0' installed. " + "Install this module using 'pip install debugpy==1.8.0' " + ) + return None + + +# A lock to ensure that only one thread can open the debugger port. +debugger_port_lock = threading.Lock() + + +def _override_breakpoint_hooks(): + """ + This method overrides the breakpoint() function to set_trace() + so that other threads can reuse the same setup logic. + This is based on: https://github.com/microsoft/debugpy/blob/ef9a67fe150179ee4df9997f9273723c26687fab/src/debugpy/_vendored/pydevd/pydev_sitecustomize/sitecustomize.py#L87 # noqa: E501 + """ + sys.__breakpointhook__ = set_trace + sys.breakpointhook = set_trace + import builtins as __builtin__ + + __builtin__.breakpoint = set_trace + + +def _ensure_debugger_port_open_thread_safe(): + """ + This is a thread safe method that ensure that the debugger port + is open, and if not, open it. + """ + + # The lock is acquired before checking the debugger port so only + # one thread can open the debugger port. + with debugger_port_lock: + debugpy = _try_import_debugpy() + if not debugpy: + return + + debugger_port = ray._private.worker.global_worker.debugger_port + if not debugger_port: + (host, port) = debugpy.listen( + (ray._private.worker.global_worker.node_ip_address, 0) + ) + ray._private.worker.global_worker.set_debugger_port(port) + log.info(f"Ray debugger is listening on {host}:{port}") + else: + log.info(f"Ray debugger is already open on {debugger_port}") + + +@DeveloperAPI +def set_trace(breakpoint_uuid=None): + """Interrupt the flow of the program and drop into the Ray debugger. + Can be used within a Ray task or actor. + """ + debugpy = _try_import_debugpy() + if not debugpy: + return + + _ensure_debugger_port_open_thread_safe() + + # debugpy overrides the breakpoint() function, so we need to set it back + # so other threads can reuse it. 
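# A usage sketch for the set_trace() being defined here: drop a breakpoint in
# a remote task and wait for a debugpy client (e.g. VS Code) to attach.
# Assumes debugpy>=1.8.0 is installed; the task below is hypothetical.
import ray
from ray.util.debugpy import set_trace

@ray.remote
def buggy(x):
    set_trace()  # blocks until a debugger attaches, then drops into it
    return x + 1

ray.init()
print(ray.get(buggy.remote(41)))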
+ _override_breakpoint_hooks() + + with ray._private.worker.global_worker.worker_paused_by_debugger(): + msg = ( + "Waiting for debugger to attach (see " + "https://docs.ray.io/en/latest/ray-observability/" + "ray-distributed-debugger.html)..." + ) + log.info(msg) + debugpy.wait_for_client() + + log.info("Debugger client is connected") + if breakpoint_uuid == POST_MORTEM_ERROR_UUID: + _debugpy_excepthook() + else: + _debugpy_breakpoint() + + +def _debugpy_breakpoint(): + """ + Drop the user into the debugger on a breakpoint. + """ + import pydevd + + pydevd.settrace(stop_at_frame=sys._getframe().f_back) + + +def _debugpy_excepthook(): + """ + Drop the user into the debugger on an unhandled exception. + """ + import threading + + import pydevd + + py_db = pydevd.get_global_debugger() + thread = threading.current_thread() + additional_info = py_db.set_additional_thread_info(thread) + additional_info.is_tracing += 1 + try: + error = sys.exc_info() + py_db.stop_on_unhandled_exception(py_db, thread, additional_info, error) + sys.excepthook(error[0], error[1], error[2]) + finally: + additional_info.is_tracing -= 1 + + +def _is_ray_debugger_post_mortem_enabled(): + return os.environ.get("RAY_DEBUG_POST_MORTEM", "0") == "1" + + +def _post_mortem(): + return set_trace(POST_MORTEM_ERROR_UUID) diff --git a/.venv/lib/python3.11/site-packages/ray/util/iter_metrics.py b/.venv/lib/python3.11/site-packages/ray/util/iter_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..eb06a97c3ace41d230e1d16b329126272d71a3ff --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/iter_metrics.py @@ -0,0 +1,69 @@ +import collections +from typing import List + +from ray.util.annotations import Deprecated +from ray.util.timer import _Timer + + +@Deprecated +class MetricsContext: + """Metrics context object for a local iterator. + + This object is accessible by all operators of a local iterator. It can be + used to store and retrieve global execution metrics for the iterator. + It can be accessed by calling LocalIterator.get_metrics(), which is only + allowable inside iterator functions. + + Attributes: + counters: dict storing increasing metrics. + timers: dict storing latency timers. + info: dict storing misc metric values. + current_actor: reference to the actor handle that + produced the current iterator output. This is automatically set + for gather_async(). + """ + + def __init__(self): + self.counters = collections.defaultdict(int) + self.timers = collections.defaultdict(_Timer) + self.info = {} + self.current_actor = None + + def save(self): + """Return a serializable copy of this context.""" + return { + "counters": dict(self.counters), + "info": dict(self.info), + "timers": None, # TODO(ekl) consider persisting timers too + } + + def restore(self, values): + """Restores state given the output of save().""" + self.counters.clear() + self.counters.update(values["counters"]) + self.timers.clear() + self.info = values["info"] + + +@Deprecated +class SharedMetrics: + """Holds an indirect reference to a (shared) metrics context. 
+ + This is used by LocalIterator.union() to point the metrics contexts of + entirely separate iterator chains to the same underlying context.""" + + def __init__( + self, metrics: MetricsContext = None, parents: List["SharedMetrics"] = None + ): + self.metrics = metrics or MetricsContext() + self.parents = parents or [] + self.set(self.metrics) + + def set(self, metrics): + """Recursively set self and parents to point to the same metrics.""" + self.metrics = metrics + for parent in self.parents: + parent.set(metrics) + + def get(self): + return self.metrics diff --git a/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__init__.py b/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9a46aefde633525e93463c500b0a229d05391449 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__init__.py @@ -0,0 +1,4 @@ +raise DeprecationWarning( + "ray.util.lightgbm has been removed as of Ray 2.0. Instead, use the `lightgbm-ray` " + "library directly or the `LightGBMTrainer` in Ray Train." +) diff --git a/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f8bb6bf66f821578a793e83819ff17fba36755 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/lightgbm/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/metrics.py b/.venv/lib/python3.11/site-packages/ray/util/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..e838e30a651c681a5a8035c59f40273047c4ad17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/metrics.py @@ -0,0 +1,313 @@ +import logging + +from typing import Dict, Any, List, Optional, Tuple, Union + +from ray._raylet import ( + Sum as CythonCount, + Histogram as CythonHistogram, + Gauge as CythonGauge, +) # noqa: E402 + +# Sum is used for CythonCount because it allows incrementing by positive +# values that are different from one. +from ray.util.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class Metric: + """The parent class of custom metrics. + + Ray's custom metrics APIs are rooted from this class and share + the same public methods. + """ + + def __init__( + self, + name: str, + description: str = "", + tag_keys: Optional[Tuple[str, ...]] = None, + ): + if len(name) == 0: + raise ValueError("Empty name is not allowed. Please provide a metric name.") + self._name = name + self._description = description + # The default tags key-value pair. + self._default_tags = {} + # Keys of tags. + self._tag_keys = tag_keys or tuple() + # The Cython metric class. This should be set in the child class. + self._metric = None + + if not isinstance(self._tag_keys, tuple): + raise TypeError( + "tag_keys should be a tuple type, got: " f"{type(self._tag_keys)}" + ) + + for key in self._tag_keys: + if not isinstance(key, str): + raise TypeError(f"Tag keys must be str, got {type(key)}.") + + def set_default_tags(self, default_tags: Dict[str, str]): + """Set default tags of metrics. + + Example: + >>> from ray.util.metrics import Counter + >>> # Note that set_default_tags returns the instance itself. 
+ >>> counter = Counter("name", tag_keys=("a",)) + >>> counter2 = counter.set_default_tags({"a": "b"}) + >>> assert counter is counter2 + >>> # this means you can instantiate it in this way. + >>> counter = Counter("name", tag_keys=("a",)).set_default_tags({"a": "b"}) + + Args: + default_tags: Default tags that are + used for every record method. + + Returns: + Metric: it returns the instance itself. + """ + for key, val in default_tags.items(): + if key not in self._tag_keys: + raise ValueError(f"Unrecognized tag key {key}.") + if not isinstance(val, str): + raise TypeError(f"Tag values must be str, got {type(val)}.") + + self._default_tags = default_tags + return self + + def _record( + self, + value: Union[int, float], + tags: Optional[Dict[str, str]] = None, + ) -> None: + """Record the metric point of the metric. + + Tags passed in will take precedence over the metric's default tags. + + Args: + value: The value to be recorded as a metric point. + """ + assert self._metric is not None + + final_tags = self._get_final_tags(tags) + self._validate_tags(final_tags) + self._metric.record(value, tags=final_tags) + + def _get_final_tags(self, tags): + if not tags: + return self._default_tags + + for val in tags.values(): + if not isinstance(val, str): + raise TypeError(f"Tag values must be str, got {type(val)}.") + + return {**self._default_tags, **tags} + + def _validate_tags(self, final_tags): + missing_tags = [] + for tag_key in self._tag_keys: + # Prefer passed tags over default tags. + if tag_key not in final_tags: + missing_tags.append(tag_key) + + if missing_tags: + raise ValueError(f"Missing value for tag key(s): {','.join(missing_tags)}.") + + @property + def info(self) -> Dict[str, Any]: + """Return the information of this metric. + + Example: + >>> from ray.util.metrics import Counter + >>> counter = Counter("name", description="desc") + >>> print(counter.info) + {'name': 'name', 'description': 'desc', 'tag_keys': (), 'default_tags': {}} + """ + return { + "name": self._name, + "description": self._description, + "tag_keys": self._tag_keys, + "default_tags": self._default_tags, + } + + +@DeveloperAPI +class Counter(Metric): + """A cumulative metric that is monotonically increasing. + + This corresponds to Prometheus' counter metric: + https://prometheus.io/docs/concepts/metric_types/#counter + + Before Ray 2.10, this exports a Prometheus gauge metric instead of + a counter metric, which is wrong. + Since 2.10, this exports both counter (with a suffix "_total") and + gauge metrics (for bug compatibility). + Use `RAY_EXPORT_COUNTER_AS_GAUGE=0` to disable exporting the gauge metric. + + Args: + name: Name of the metric. + description: Description of the metric. + tag_keys: Tag keys of the metric. + """ + + def __init__( + self, + name: str, + description: str = "", + tag_keys: Optional[Tuple[str, ...]] = None, + ): + super().__init__(name, description, tag_keys) + self._metric = CythonCount(self._name, self._description, self._tag_keys) + + def __reduce__(self): + deserializer = self.__class__ + serialized_data = (self._name, self._description, self._tag_keys) + return deserializer, serialized_data + + def inc(self, value: Union[int, float] = 1.0, tags: Dict[str, str] = None): + """Increment the counter by `value` (defaults to 1). + + Tags passed in will take precedence over the metric's default tags. + + Args: + value(int, float): Value to increment the counter by (default=1). + tags(Dict[str, str]): Tags to set or override for this counter. 
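# A sketch of tag precedence for Counter, matching the docstrings above:
# per-call tags override set_default_tags values, and every declared tag key
# must resolve to a value at record time. Assumes a started Ray worker or
# driver so the metric can be exported; names are illustrative.
import ray
from ray.util.metrics import Counter

ray.init()
requests = Counter(
    "request_count",
    description="Number of requests handled.",
    tag_keys=("endpoint",),
).set_default_tags({"endpoint": "/"})

requests.inc()                              # recorded with endpoint="/"
requests.inc(2, tags={"endpoint": "/api"})  # per-call override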
+ """ + if not isinstance(value, (int, float)): + raise TypeError(f"value must be int or float, got {type(value)}.") + if value <= 0: + raise ValueError(f"value must be >0, got {value}") + + self._record(value, tags=tags) + + +@DeveloperAPI +class Histogram(Metric): + """Tracks the size and number of events in buckets. + + Histograms allow you to calculate aggregate quantiles + such as 25, 50, 95, 99 percentile latency for an RPC. + + This corresponds to Prometheus' histogram metric: + https://prometheus.io/docs/concepts/metric_types/#histogram + + Args: + name: Name of the metric. + description: Description of the metric. + boundaries: Boundaries of histogram buckets. + tag_keys: Tag keys of the metric. + """ + + def __init__( + self, + name: str, + description: str = "", + boundaries: List[float] = None, + tag_keys: Optional[Tuple[str, ...]] = None, + ): + super().__init__(name, description, tag_keys) + if boundaries is None or len(boundaries) == 0: + raise ValueError( + "boundaries argument should be provided when using " + "the Histogram class. e.g., " + 'Histogram("name", boundaries=[1.0, 2.0])' + ) + for i, boundary in enumerate(boundaries): + if boundary <= 0: + raise ValueError( + "Invalid `boundaries` argument at index " + f"{i}, {boundaries}. Use positive values for the arguments." + ) + + self.boundaries = boundaries + self._metric = CythonHistogram( + self._name, self._description, self.boundaries, self._tag_keys + ) + + def observe(self, value: Union[int, float], tags: Dict[str, str] = None): + """Observe a given `value` and add it to the appropriate bucket. + + Tags passed in will take precedence over the metric's default tags. + + Args: + value(int, float): Value to set the gauge to. + tags(Dict[str, str]): Tags to set or override for this gauge. + """ + if not isinstance(value, (int, float)): + raise TypeError(f"value must be int or float, got {type(value)}.") + + self._record(value, tags) + + def __reduce__(self): + deserializer = Histogram + serialized_data = ( + self._name, + self._description, + self.boundaries, + self._tag_keys, + ) + return deserializer, serialized_data + + @property + def info(self): + """Return information about histogram metric.""" + info = super().info + info.update({"boundaries": self.boundaries}) + return info + + +@DeveloperAPI +class Gauge(Metric): + """Gauges keep the last recorded value and drop everything before. + + Unlike counters, gauges can go up or down over time. + + This corresponds to Prometheus' gauge metric: + https://prometheus.io/docs/concepts/metric_types/#gauge + + Args: + name: Name of the metric. + description: Description of the metric. + tag_keys: Tag keys of the metric. + """ + + def __init__( + self, + name: str, + description: str = "", + tag_keys: Optional[Tuple[str, ...]] = None, + ): + super().__init__(name, description, tag_keys) + self._metric = CythonGauge(self._name, self._description, self._tag_keys) + + def set(self, value: Optional[Union[int, float]], tags: Dict[str, str] = None): + """Set the gauge to the given `value`. + + Tags passed in will take precedence over the metric's default tags. + + Args: + value(int, float): Value to set the gauge to. If `None`, this method is a + no-op. + tags(Dict[str, str]): Tags to set or override for this gauge. 
+ """ + if value is None: + return + + if not isinstance(value, (int, float)): + raise TypeError(f"value must be int or float, got {type(value)}.") + + self._record(value, tags) + + def __reduce__(self): + deserializer = Gauge + serialized_data = (self._name, self._description, self._tag_keys) + return deserializer, serialized_data + + +__all__ = [ + "Counter", + "Histogram", + "Gauge", +] diff --git a/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb0d155ec381fc2dca68cc57a1d16ac833babd66 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/pool.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/pool.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a512613b3fba8a4b8c7aed2579b5d6352bd780e6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/__pycache__/pool.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/pool.py b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/pool.py new file mode 100644 index 0000000000000000000000000000000000000000..182afabdf3e186cab6036a95ba46875661560b24 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/multiprocessing/pool.py @@ -0,0 +1,995 @@ +import collections +import copy +import gc +import itertools +import logging +import os +import queue +import sys +import threading +import time +from multiprocessing import TimeoutError +from typing import Any, Callable, Dict, Hashable, Iterable, List, Optional, Tuple + +import ray +from ray._private.usage import usage_lib +from ray.util import log_once + +try: + from joblib._parallel_backends import SafeFunction + from joblib.parallel import BatchedCalls, parallel_backend +except ImportError: + BatchedCalls = None + parallel_backend = None + SafeFunction = None + + +logger = logging.getLogger(__name__) + +RAY_ADDRESS_ENV = "RAY_ADDRESS" + + +def _put_in_dict_registry( + obj: Any, registry_hashable: Dict[Hashable, ray.ObjectRef] +) -> ray.ObjectRef: + if obj not in registry_hashable: + ret = ray.put(obj) + registry_hashable[obj] = ret + else: + ret = registry_hashable[obj] + return ret + + +def _put_in_list_registry( + obj: Any, registry: List[Tuple[Any, ray.ObjectRef]] +) -> ray.ObjectRef: + try: + ret = next((ref for o, ref in registry if o is obj)) + except StopIteration: + ret = ray.put(obj) + registry.append((obj, ret)) + return ret + + +def ray_put_if_needed( + obj: Any, + registry: Optional[List[Tuple[Any, ray.ObjectRef]]] = None, + registry_hashable: Optional[Dict[Hashable, ray.ObjectRef]] = None, +) -> ray.ObjectRef: + """ray.put obj in object store if it's not an ObjRef and bigger than 100 bytes, + with support for list and dict registries""" + if isinstance(obj, ray.ObjectRef) or sys.getsizeof(obj) < 100: + return obj + ret = obj + if registry_hashable is not None: + try: + ret = _put_in_dict_registry(obj, registry_hashable) + except TypeError: + if registry is not None: + ret = _put_in_list_registry(obj, registry) + elif registry is not None: + ret = _put_in_list_registry(obj, registry) + return ret + + +def ray_get_if_needed(obj: Any) -> Any: + """If obj 
is an ObjectRef, do ray.get, otherwise return obj""" + if isinstance(obj, ray.ObjectRef): + return ray.get(obj) + return obj + + +if BatchedCalls is not None: + + class RayBatchedCalls(BatchedCalls): + """Joblib's BatchedCalls with basic Ray object store management + + This functionality is provided through the put_items_in_object_store, + which uses external registries (list and dict) containing objects + and their ObjectRefs.""" + + def put_items_in_object_store( + self, + registry: Optional[List[Tuple[Any, ray.ObjectRef]]] = None, + registry_hashable: Optional[Dict[Hashable, ray.ObjectRef]] = None, + ): + """Puts all applicable (kw)args in self.items in object store + + Takes two registries - list for unhashable objects and dict + for hashable objects. The registries are a part of a Pool object. + The method iterates through all entries in items list (usually, + there will be only one, but the number depends on joblib Parallel + settings) and puts all of the args and kwargs into the object + store, updating the registries. + If an arg or kwarg is already in a registry, it will not be + put again, and instead, the cached object ref will be used.""" + new_items = [] + for func, args, kwargs in self.items: + args = [ + ray_put_if_needed(arg, registry, registry_hashable) for arg in args + ] + kwargs = { + k: ray_put_if_needed(v, registry, registry_hashable) + for k, v in kwargs.items() + } + new_items.append((func, args, kwargs)) + self.items = new_items + + def __call__(self): + # Exactly the same as in BatchedCalls, with the + # difference being that it gets args and kwargs from + # object store (which have been put in there by + # put_items_in_object_store) + + # Set the default nested backend to self._backend but do + # not change the default number of processes to -1 + with parallel_backend(self._backend, n_jobs=self._n_jobs): + return [ + func( + *[ray_get_if_needed(arg) for arg in args], + **{k: ray_get_if_needed(v) for k, v in kwargs.items()}, + ) + for func, args, kwargs in self.items + ] + + def __reduce__(self): + # Exactly the same as in BatchedCalls, with the + # difference being that it returns RayBatchedCalls + # instead + if self._reducer_callback is not None: + self._reducer_callback() + # no need to pickle the callback. + return ( + RayBatchedCalls, + (self.items, (self._backend, self._n_jobs), None, self._pickle_cache), + ) + +else: + RayBatchedCalls = None + + +# Helper function to divide a by b and round the result up. +def div_round_up(a, b): + return -(-a // b) + + +class PoolTaskError(Exception): + def __init__(self, underlying): + self.underlying = underlying + + +class ResultThread(threading.Thread): + """Thread that collects results from distributed actors. + + It winds down when either: + - A pre-specified number of objects has been processed, or + - The END_SENTINEL (submitted through self.add_object_ref()) + has been received and all objects received before that have been + processed. + + Initialize the thread with total_object_refs = float('inf') to wait for the + END_SENTINEL. + + Args: + object_refs (List[RayActorObjectRefs]): ObjectRefs to Ray Actor calls. + Thread tracks whether they are ready. More ObjectRefs may be added + with add_object_ref (or _add_object_ref internally) until the object + count reaches total_object_refs. + single_result: Should be True if the thread is managing a function + with a single result (like apply_async). False if the thread is managing + a function with a List of results.
+ callback: called only once at the end of the thread + if no results were errors. If single_result=True, and result is + not an error, callback is invoked with the result as the only + argument. If single_result=False, callback is invoked with + a list of all the results as the only argument. + error_callback: called only once on the first result + that errors. Should take an Exception as the only argument. + If no result errors, this callback is not called. + total_object_refs: Number of ObjectRefs that this thread + expects to be ready. May be more than len(object_refs) since + more ObjectRefs can be submitted after the thread starts. + If None, defaults to len(object_refs). If float("inf"), thread runs + until END_SENTINEL (submitted through self.add_object_ref()) + has been received and all objects received before that have + been processed. + """ + + END_SENTINEL = None + + def __init__( + self, + object_refs: list, + single_result: bool = False, + callback: callable = None, + error_callback: callable = None, + total_object_refs: Optional[int] = None, + ): + threading.Thread.__init__(self, daemon=True) + self._got_error = False + self._object_refs = [] + self._num_ready = 0 + self._results = [] + self._ready_index_queue = queue.Queue() + self._single_result = single_result + self._callback = callback + self._error_callback = error_callback + self._total_object_refs = total_object_refs or len(object_refs) + self._indices = {} + # Thread-safe queue used to add ObjectRefs to fetch after creating + # this thread (used to lazily submit for imap and imap_unordered). + self._new_object_refs = queue.Queue() + for object_ref in object_refs: + self._add_object_ref(object_ref) + + def _add_object_ref(self, object_ref): + self._indices[object_ref] = len(self._object_refs) + self._object_refs.append(object_ref) + self._results.append(None) + + def add_object_ref(self, object_ref): + self._new_object_refs.put(object_ref) + + def run(self): + unready = copy.copy(self._object_refs) + aggregated_batch_results = [] + + # Run for a specific number of objects if self._total_object_refs is finite. + # Otherwise, process all objects received prior to the stop signal, given by + # self.add_object_ref(END_SENTINEL). + while self._num_ready < self._total_object_refs: + # Get as many new IDs from the queue as possible without blocking, + # unless we have no IDs to wait on, in which case we block. + while True: + try: + block = len(unready) == 0 + new_object_ref = self._new_object_refs.get(block=block) + if new_object_ref is self.END_SENTINEL: + # Receiving the END_SENTINEL object is the signal to stop. + # Store the total number of objects. + self._total_object_refs = len(self._object_refs) + else: + self._add_object_ref(new_object_ref) + unready.append(new_object_ref) + except queue.Empty: + # queue.Empty means no result was retrieved if block=False. + break + + [ready_id], unready = ray.wait(unready, num_returns=1) + try: + batch = ray.get(ready_id) + except ray.exceptions.RayError as e: + batch = [e] + + # The exception callback is called only once on the first result + # that errors. If no result errors, it is never called.
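+ # Successful results from this batch are accumulated below so that + # the regular callback can later receive the full flattened list.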
+ if not self._got_error: + for result in batch: + if isinstance(result, Exception): + self._got_error = True + if self._error_callback is not None: + self._error_callback(result) + break + else: + aggregated_batch_results.append(result) + + self._num_ready += 1 + self._results[self._indices[ready_id]] = batch + self._ready_index_queue.put(self._indices[ready_id]) + + # The regular callback is called only once on the entire List of + # results as long as none of the results were errors. If any results + # were errors, the regular callback is never called; instead, the + # exception callback is called on the first erroring result. + # + # This callback is called outside the while loop to ensure that it's + # called on the entire list of results, not just a single batch. + if not self._got_error and self._callback is not None: + if not self._single_result: + self._callback(aggregated_batch_results) + else: + # On a thread handling a function with a single result + # (e.g. apply_async), we call the callback on just that result + # instead of on a list encapsulating that result + self._callback(aggregated_batch_results[0]) + + def got_error(self): + # Should only be called after the thread finishes. + return self._got_error + + def result(self, index): + # Should only be called on results that are ready. + return self._results[index] + + def results(self): + # Should only be called after the thread finishes. + return self._results + + def next_ready_index(self, timeout=None): + try: + return self._ready_index_queue.get(timeout=timeout) + except queue.Empty: + # queue.Queue signals a timeout by raising queue.Empty. + raise TimeoutError + + +class AsyncResult: + """An asynchronous interface to task results. + + This should not be constructed directly. + """ + + def __init__( + self, chunk_object_refs, callback=None, error_callback=None, single_result=False + ): + self._single_result = single_result + self._result_thread = ResultThread( + chunk_object_refs, single_result, callback, error_callback + ) + self._result_thread.start() + + def wait(self, timeout=None): + """ + Returns once the result is ready or the timeout expires (does not + raise TimeoutError). + + Args: + timeout: timeout in seconds. + """ + + self._result_thread.join(timeout) + + def get(self, timeout=None): + self.wait(timeout) + if self._result_thread.is_alive(): + raise TimeoutError + + results = [] + for batch in self._result_thread.results(): + for result in batch: + if isinstance(result, PoolTaskError): + raise result.underlying + elif isinstance(result, Exception): + raise result + results.extend(batch) + + if self._single_result: + return results[0] + + return results + + def ready(self): + """ + Returns true if the result is ready, else false if the tasks are still + running. + """ + + return not self._result_thread.is_alive() + + def successful(self): + """ + Returns true if none of the submitted tasks errored, else false. Should + only be called once the result is ready (can be checked using `ready`). + """ + + if not self.ready(): + raise ValueError(f"{self!r} not ready") + return not self._result_thread.got_error() + + +class IMapIterator: + """Base class for OrderedIMapIterator and UnorderedIMapIterator.""" + + def __init__(self, pool, func, iterable, chunksize=None): + self._pool = pool + self._func = func + self._next_chunk_index = 0 + self._finished_iterating = False + # List of bools indicating if the given chunk is ready or not for all + # submitted chunks. Ordering mirrors that in the ResultThread.
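+ # (A chunk is marked ready once the result thread reports its index.)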
+ self._submitted_chunks = [] + self._ready_objects = collections.deque() + self._iterator = iter(iterable) + if isinstance(iterable, collections.abc.Iterator): + # Got iterator (which has no len() function). + # Make default chunksize 1 instead of using _calculate_chunksize(). + # Indicate unknown queue length, requiring explicit stopping. + self._chunksize = chunksize or 1 + result_list_size = float("inf") + else: + self._chunksize = chunksize or pool._calculate_chunksize(iterable) + result_list_size = div_round_up(len(iterable), self._chunksize) + + self._result_thread = ResultThread([], total_object_refs=result_list_size) + self._result_thread.start() + + for _ in range(len(self._pool._actor_pool)): + self._submit_next_chunk() + + def _submit_next_chunk(self): + # The full iterable has already been submitted, so no-op. + if self._finished_iterating: + return + + actor_index = len(self._submitted_chunks) % len(self._pool._actor_pool) + chunk_iterator = itertools.islice(self._iterator, self._chunksize) + + # Check whether we have run out of samples. + # This consumes the original iterator, so we convert to a list and back + chunk_list = list(chunk_iterator) + if len(chunk_list) < self._chunksize: + # Reached end of self._iterator + self._finished_iterating = True + if len(chunk_list) == 0: + # Nothing to do, return. + return + chunk_iterator = iter(chunk_list) + + new_chunk_id = self._pool._submit_chunk( + self._func, chunk_iterator, self._chunksize, actor_index + ) + self._submitted_chunks.append(False) + # Wait for the result + self._result_thread.add_object_ref(new_chunk_id) + # If we submitted the final chunk, notify the result thread + if self._finished_iterating: + self._result_thread.add_object_ref(ResultThread.END_SENTINEL) + + def __iter__(self): + return self + + def __next__(self): + return self.next() + + def next(self): + # Should be implemented by subclasses. + raise NotImplementedError + + +class OrderedIMapIterator(IMapIterator): + """Iterator to the results of tasks submitted using `imap`. + + The results are returned in the same order that they were submitted, even + if they don't finish in that order. Only one batch of tasks per actor + process is submitted at a time - the rest are submitted as results come in. + + Should not be constructed directly. + """ + + def next(self, timeout=None): + if len(self._ready_objects) == 0: + if self._finished_iterating and ( + self._next_chunk_index == len(self._submitted_chunks) + ): + # Finish when all chunks have been dispatched and processed + # Notify the calling process that the work is done. + raise StopIteration + + # This loop will break when the next index in order is ready or + # self._result_thread.next_ready_index() raises a timeout. + index = -1 + while index != self._next_chunk_index: + start = time.time() + index = self._result_thread.next_ready_index(timeout=timeout) + self._submit_next_chunk() + self._submitted_chunks[index] = True + if timeout is not None: + timeout = max(0, timeout - (time.time() - start)) + + while ( + self._next_chunk_index < len(self._submitted_chunks) + and self._submitted_chunks[self._next_chunk_index] + ): + for result in self._result_thread.result(self._next_chunk_index): + self._ready_objects.append(result) + self._next_chunk_index += 1 + + return self._ready_objects.popleft() + + +class UnorderedIMapIterator(IMapIterator): + """Iterator to the results of tasks submitted using `imap_unordered`. + + The results are returned in the order that they finish.
Only one batch of + tasks per actor process is submitted at a time - the rest are submitted as + results come in. + + Should not be constructed directly. + """ + + def next(self, timeout=None): + if len(self._ready_objects) == 0: + if self._finished_iterating and ( + self._next_chunk_index == len(self._submitted_chunks) + ): + # Finish when all chunks have been dispatched and processed + # Notify the calling process that the work is done. + raise StopIteration + + index = self._result_thread.next_ready_index(timeout=timeout) + self._submit_next_chunk() + + for result in self._result_thread.result(index): + self._ready_objects.append(result) + self._next_chunk_index += 1 + + return self._ready_objects.popleft() + + +@ray.remote(num_cpus=0) +class PoolActor: + """Actor used to process tasks submitted to a Pool.""" + + def __init__(self, initializer=None, initargs=None): + if initializer: + initargs = initargs or () + initializer(*initargs) + + def ping(self): + # Used to wait for this actor to be initialized. + pass + + def run_batch(self, func, batch): + results = [] + for args, kwargs in batch: + args = args or () + kwargs = kwargs or {} + try: + results.append(func(*args, **kwargs)) + except Exception as e: + results.append(PoolTaskError(e)) + return results + + +# https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool +class Pool: + """A pool of actor processes that is used to process tasks in parallel. + + Args: + processes: number of actor processes to start in the pool. Defaults to + the number of cores in the Ray cluster if one is already running, + otherwise the number of cores on this machine. + initializer: function to be run in each actor when it starts up. + initargs: iterable of arguments to the initializer function. + maxtasksperchild: maximum number of tasks to run in each actor process. + After a process has executed this many tasks, it will be killed and + replaced with a new one. + ray_address: address of the Ray cluster to run on. If None, a new local + Ray cluster will be started on this machine. Otherwise, this will + be passed to `ray.init()` to connect to a running cluster. This may + also be specified using the `RAY_ADDRESS` environment variable. + ray_remote_args: arguments used to configure the Ray Actors making up + the pool. See :func:`ray.remote` for details. + """ + + def __init__( + self, + processes: Optional[int] = None, + initializer: Optional[Callable] = None, + initargs: Optional[Iterable] = None, + maxtasksperchild: Optional[int] = None, + context: Any = None, + ray_address: Optional[str] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, + ): + usage_lib.record_library_usage("util.multiprocessing.Pool") + + self._closed = False + self._initializer = initializer + self._initargs = initargs + self._maxtasksperchild = maxtasksperchild or -1 + self._actor_deletion_ids = [] + self._registry: List[Tuple[Any, ray.ObjectRef]] = [] + self._registry_hashable: Dict[Hashable, ray.ObjectRef] = {} + self._current_index = 0 + self._ray_remote_args = ray_remote_args or {} + self._pool_actor = None + + if context and log_once("context_argument_warning"): + logger.warning( + "The 'context' argument is not supported using " + "ray. Please refer to the documentation for how " + "to control ray initialization." + ) + + processes = self._init_ray(processes, ray_address) + self._start_actor_pool(processes) + + def _init_ray(self, processes=None, ray_address=None): + # Initialize ray. If ray is already initialized, we do nothing. 
+ # Else, the priority is: + # ray_address argument > RAY_ADDRESS > start new local cluster. + if not ray.is_initialized(): + # Cluster mode. + if ray_address is None and ( + RAY_ADDRESS_ENV in os.environ + or ray._private.utils.read_ray_address() is not None + ): + ray.init() + elif ray_address is not None: + init_kwargs = {} + if ray_address == "local": + init_kwargs["num_cpus"] = processes + ray.init(address=ray_address, **init_kwargs) + # Local mode. + else: + ray.init(num_cpus=processes) + + ray_cpus = int(ray._private.state.cluster_resources()["CPU"]) + if processes is None: + processes = ray_cpus + if processes <= 0: + raise ValueError("Processes in the pool must be >0.") + if ray_cpus < processes: + raise ValueError( + "Tried to start a pool with {} processes on an " + "existing ray cluster, but there are only {} " + "CPUs in the ray cluster.".format(processes, ray_cpus) + ) + + return processes + + def _start_actor_pool(self, processes): + self._pool_actor = None + self._actor_pool = [self._new_actor_entry() for _ in range(processes)] + ray.get([actor.ping.remote() for actor, _ in self._actor_pool]) + + def _wait_for_stopping_actors(self, timeout=None): + if len(self._actor_deletion_ids) == 0: + return + if timeout is not None: + timeout = float(timeout) + + _, deleting = ray.wait( + self._actor_deletion_ids, + num_returns=len(self._actor_deletion_ids), + timeout=timeout, + ) + self._actor_deletion_ids = deleting + + def _stop_actor(self, actor): + # Check and clean up any outstanding IDs corresponding to deletions. + self._wait_for_stopping_actors(timeout=0.0) + # The deletion task will block until the actor has finished executing + # all pending tasks. + self._actor_deletion_ids.append(actor.__ray_terminate__.remote()) + + def _new_actor_entry(self): + # NOTE(edoakes): The initializer function can't currently be used to + # modify the global namespace (e.g., import packages or set globals) + # due to a limitation in cloudpickle. + # Cache the PoolActor with options + if not self._pool_actor: + self._pool_actor = PoolActor.options(**self._ray_remote_args) + return (self._pool_actor.remote(self._initializer, self._initargs), 0) + + def _next_actor_index(self): + if self._current_index == len(self._actor_pool) - 1: + self._current_index = 0 + else: + self._current_index += 1 + return self._current_index + + # Batch should be a list of tuples: (args, kwargs). + def _run_batch(self, actor_index, func, batch): + actor, count = self._actor_pool[actor_index] + object_ref = actor.run_batch.remote(func, batch) + count += 1 + assert self._maxtasksperchild == -1 or count <= self._maxtasksperchild + if count == self._maxtasksperchild: + self._stop_actor(actor) + actor, count = self._new_actor_entry() + self._actor_pool[actor_index] = (actor, count) + return object_ref + + def apply( + self, + func: Callable, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + ): + """Run the given function on a random actor process and return the + result synchronously. + + Args: + func: function to run. + args: optional arguments to the function. + kwargs: optional keyword arguments to the function. + + Returns: + The result. 
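+ + Example (a minimal sketch; assumes Ray can start or connect locally): + + .. testcode:: + + from ray.util.multiprocessing import Pool + + pool = Pool(processes=2) + assert pool.apply(max, (1, 2)) == 2 + pool.terminate()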
+ """ + + return self.apply_async(func, args, kwargs).get() + + def apply_async( + self, + func: Callable, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + callback: Callable[[Any], None] = None, + error_callback: Callable[[Exception], None] = None, + ): + """Run the given function on a random actor process and return an + asynchronous interface to the result. + + Args: + func: function to run. + args: optional arguments to the function. + kwargs: optional keyword arguments to the function. + callback: callback to be executed on the result once it is finished + only if it succeeds. + error_callback: callback to be executed the result once it is + finished only if the task errors. The exception raised by the + task will be passed as the only argument to the callback. + + Returns: + AsyncResult containing the result. + """ + + self._check_running() + func = self._convert_to_ray_batched_calls_if_needed(func) + object_ref = self._run_batch(self._next_actor_index(), func, [(args, kwargs)]) + return AsyncResult([object_ref], callback, error_callback, single_result=True) + + def _convert_to_ray_batched_calls_if_needed(self, func: Callable) -> Callable: + """Convert joblib's BatchedCalls to RayBatchedCalls for ObjectRef caching. + + This converts joblib's BatchedCalls callable, which is a collection of + functions with their args and kwargs to be ran sequentially in an + Actor, to a RayBatchedCalls callable, which provides identical + functionality in addition to a method which ensures that common + args and kwargs are put into the object store just once, saving time + and memory. That method is then ran. + + If func is not a BatchedCalls instance, it is returned without changes. + + The ObjectRefs are cached inside two registries (_registry and + _registry_hashable), which are common for the entire Pool and are + cleaned on close.""" + if RayBatchedCalls is None: + return func + orginal_func = func + # SafeFunction is a Python 2 leftover and can be + # safely removed. + if isinstance(func, SafeFunction): + func = func.func + if isinstance(func, BatchedCalls): + func = RayBatchedCalls( + func.items, + (func._backend, func._n_jobs), + func._reducer_callback, + func._pickle_cache, + ) + # go through all the items and replace args and kwargs with + # ObjectRefs, caching them in registries + func.put_items_in_object_store(self._registry, self._registry_hashable) + else: + func = orginal_func + return func + + def _calculate_chunksize(self, iterable): + chunksize, extra = divmod(len(iterable), len(self._actor_pool) * 4) + if extra: + chunksize += 1 + return chunksize + + def _submit_chunk(self, func, iterator, chunksize, actor_index, unpack_args=False): + chunk = [] + while len(chunk) < chunksize: + try: + args = next(iterator) + if not unpack_args: + args = (args,) + chunk.append((args, {})) + except StopIteration: + break + + # Nothing to submit. The caller should prevent this. 
+ assert len(chunk) > 0 + + return self._run_batch(actor_index, func, chunk) + + def _chunk_and_run(self, func, iterable, chunksize=None, unpack_args=False): + if not hasattr(iterable, "__len__"): + iterable = list(iterable) + + if chunksize is None: + chunksize = self._calculate_chunksize(iterable) + + iterator = iter(iterable) + chunk_object_refs = [] + while len(chunk_object_refs) * chunksize < len(iterable): + actor_index = len(chunk_object_refs) % len(self._actor_pool) + chunk_object_refs.append( + self._submit_chunk( + func, iterator, chunksize, actor_index, unpack_args=unpack_args + ) + ) + + return chunk_object_refs + + def _map_async( + self, + func, + iterable, + chunksize=None, + unpack_args=False, + callback=None, + error_callback=None, + ): + self._check_running() + object_refs = self._chunk_and_run( + func, iterable, chunksize=chunksize, unpack_args=unpack_args + ) + return AsyncResult(object_refs, callback, error_callback) + + def map(self, func: Callable, iterable: Iterable, chunksize: Optional[int] = None): + """Run the given function on each element in the iterable round-robin + on the actor processes and return the results synchronously. + + Args: + func: function to run. + iterable: iterable of objects to be passed as the sole argument to + func. + chunksize: number of tasks to submit as a batch to each actor + process. If unspecified, a suitable chunksize will be chosen. + + Returns: + A list of results. + """ + + return self._map_async( + func, iterable, chunksize=chunksize, unpack_args=False + ).get() + + def map_async( + self, + func: Callable, + iterable: Iterable, + chunksize: Optional[int] = None, + callback: Callable[[List], None] = None, + error_callback: Callable[[Exception], None] = None, + ): + """Run the given function on each element in the iterable round-robin + on the actor processes and return an asynchronous interface to the + results. + + Args: + func: function to run. + iterable: iterable of objects to be passed as the only argument to + func. + chunksize: number of tasks to submit as a batch to each actor + process. If unspecified, a suitable chunksize will be chosen. + callback: Will only be called if none of the results were errors, + and will only be called once after all results are finished. + A Python List of all the finished results will be passed as the + only argument to the callback. + error_callback: callback executed on the first errored result. + The Exception raised by the task will be passed as the only + argument to the callback. + + Returns: + AsyncResult + """ + return self._map_async( + func, + iterable, + chunksize=chunksize, + unpack_args=False, + callback=callback, + error_callback=error_callback, + ) + + def starmap(self, func, iterable, chunksize=None): + """Same as `map`, but unpacks each element of the iterable as the + arguments to func like: [func(*args) for args in iterable]. + """ + + return self._map_async( + func, iterable, chunksize=chunksize, unpack_args=True + ).get() + + def starmap_async( + self, + func: Callable, + iterable: Iterable, + callback: Callable[[List], None] = None, + error_callback: Callable[[Exception], None] = None, + ): + """Same as `map_async`, but unpacks each element of the iterable as the + arguments to func like: [func(*args) for args in iterable]. 
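+ + Example (a minimal sketch; assumes an open pool named ``pool``): + + .. testcode:: + + from operator import add + + async_result = pool.starmap_async(add, [(1, 2), (3, 4)]) + assert async_result.get() == [3, 7]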
+ """ + + return self._map_async( + func, + iterable, + unpack_args=True, + callback=callback, + error_callback=error_callback, + ) + + def imap(self, func: Callable, iterable: Iterable, chunksize: Optional[int] = 1): + """Same as `map`, but only submits one batch of tasks to each actor + process at a time. + + This can be useful if the iterable of arguments is very large or each + task's arguments consumes a large amount of resources. + + The results are returned in the order corresponding to their arguments + in the iterable. + + Returns: + OrderedIMapIterator + """ + + self._check_running() + return OrderedIMapIterator(self, func, iterable, chunksize=chunksize) + + def imap_unordered( + self, func: Callable, iterable: Iterable, chunksize: Optional[int] = 1 + ): + """Same as `map`, but only submits one batch of tasks to each actor + process at a time. + + This can be useful if the iterable of arguments is very large or each + task's arguments consumes a large amount of resources. + + The results are returned in the order that they finish. + + Returns: + UnorderedIMapIterator + """ + + self._check_running() + return UnorderedIMapIterator(self, func, iterable, chunksize=chunksize) + + def _check_running(self): + if self._closed: + raise ValueError("Pool not running") + + def __enter__(self): + self._check_running() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.terminate() + + def close(self): + """Close the pool. + + Prevents any more tasks from being submitted on the pool but allows + outstanding work to finish. + """ + + self._registry.clear() + self._registry_hashable.clear() + for actor, _ in self._actor_pool: + self._stop_actor(actor) + self._closed = True + gc.collect() + + def terminate(self): + """Close the pool. + + Prevents any more tasks from being submitted on the pool and stops + outstanding work. + """ + + if not self._closed: + self.close() + for actor, _ in self._actor_pool: + ray.kill(actor) + + def join(self): + """Wait for the actors in a closed pool to exit. + + If the pool was closed using `close`, this will return once all + outstanding work is completed. + + If the pool was closed using `terminate`, this will return quickly. + """ + + if not self._closed: + raise ValueError("Pool is still running") + self._wait_for_stopping_actors() diff --git a/.venv/lib/python3.11/site-packages/ray/util/placement_group.py b/.venv/lib/python3.11/site-packages/ray/util/placement_group.py new file mode 100644 index 0000000000000000000000000000000000000000..21ca50176d75a028f899a7a90a31728175e9c23d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/placement_group.py @@ -0,0 +1,555 @@ +import warnings +from typing import Dict, List, Optional, Union + +import ray +from ray._private.auto_init_hook import auto_init_ray +from ray._private.client_mode_hook import client_mode_should_convert, client_mode_wrap +from ray._private.utils import hex_to_binary, get_ray_doc_version +from ray._raylet import PlacementGroupID +from ray.util.annotations import DeveloperAPI, PublicAPI +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +import ray._private.ray_constants as ray_constants + +bundle_reservation_check = None + +VALID_PLACEMENT_GROUP_STRATEGIES = { + "PACK", + "SPREAD", + "STRICT_PACK", + "STRICT_SPREAD", +} + + +# We need to import this method to use for ready API. 
+# But ray.remote is only available in runtime, and +# if we define this method inside ready method, this function is +# exported whenever ready is called, which can impact performance, +# https://github.com/ray-project/ray/issues/6240. +def _export_bundle_reservation_check_method_if_needed(): + global bundle_reservation_check + if bundle_reservation_check: + return + + @ray.remote(num_cpus=0) + def bundle_reservation_check_func(placement_group): + return placement_group + + bundle_reservation_check = bundle_reservation_check_func + + +@PublicAPI +class PlacementGroup: + """A handle to a placement group.""" + + @staticmethod + def empty() -> "PlacementGroup": + return PlacementGroup(PlacementGroupID.nil()) + + def __init__( + self, + id: "ray._raylet.PlacementGroupID", + bundle_cache: Optional[List[Dict]] = None, + ): + self.id = id + self.bundle_cache = bundle_cache + + @property + def is_empty(self): + return self.id.is_nil() + + def ready(self) -> "ray._raylet.ObjectRef": + """Returns an ObjectRef to check ready status. + + This API runs a small dummy task to wait for placement group creation. + It is compatible to ray.get and ray.wait. + + Example: + .. testcode:: + + import ray + + pg = ray.util.placement_group([{"CPU": 1}]) + ray.get(pg.ready()) + + pg = ray.util.placement_group([{"CPU": 1}]) + ray.wait([pg.ready()]) + + """ + self._fill_bundle_cache_if_needed() + + _export_bundle_reservation_check_method_if_needed() + + assert len(self.bundle_cache) != 0, ( + "ready() cannot be called on placement group object with a " + "bundle length == 0, current bundle length: " + f"{len(self.bundle_cache)}" + ) + + return bundle_reservation_check.options( + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=self), + ).remote(self) + + def wait(self, timeout_seconds: Union[float, int] = 30) -> bool: + """Wait for the placement group to be ready within the specified time. + Args: + timeout_seconds(float|int): Timeout in seconds. + Return: + True if the placement group is created. False otherwise. 
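+ + Example (a minimal sketch; assumes a cluster with a free CPU): + + .. testcode:: + + import ray + + pg = ray.util.placement_group([{"CPU": 1}]) + assert pg.wait(timeout_seconds=60)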
+ """ + return _call_placement_group_ready(self.id, timeout_seconds) + + @property + def bundle_specs(self) -> List[Dict]: + """List[Dict]: Return bundles belonging to this placement group.""" + self._fill_bundle_cache_if_needed() + return self.bundle_cache + + @property + def bundle_count(self) -> int: + self._fill_bundle_cache_if_needed() + return len(self.bundle_cache) + + def _fill_bundle_cache_if_needed(self) -> None: + if not self.bundle_cache: + self.bundle_cache = _get_bundle_cache(self.id) + + def __eq__(self, other): + if not isinstance(other, PlacementGroup): + return False + return self.id == other.id + + def __hash__(self): + return hash(self.id) + + +@client_mode_wrap +def _call_placement_group_ready(pg_id: PlacementGroupID, timeout_seconds: int) -> bool: + worker = ray._private.worker.global_worker + worker.check_connected() + + return worker.core_worker.wait_placement_group_ready(pg_id, timeout_seconds) + + +@client_mode_wrap +def _get_bundle_cache(pg_id: PlacementGroupID) -> List[Dict]: + worker = ray._private.worker.global_worker + worker.check_connected() + + return list( + ray._private.state.state.placement_group_table(pg_id)["bundles"].values() + ) + + +@PublicAPI +@client_mode_wrap +def placement_group( + bundles: List[Dict[str, float]], + strategy: str = "PACK", + name: str = "", + lifetime: Optional[str] = None, + _max_cpu_fraction_per_node: float = 1.0, + _soft_target_node_id: Optional[str] = None, +) -> PlacementGroup: + """Asynchronously creates a PlacementGroup. + + Args: + bundles: A list of bundles which + represent the resources requirements. + strategy: The strategy to create the placement group. + + - "PACK": Packs Bundles into as few nodes as possible. + - "SPREAD": Places Bundles across distinct nodes as even as possible. + - "STRICT_PACK": Packs Bundles into one node. The group is + not allowed to span multiple nodes. + - "STRICT_SPREAD": Packs Bundles across distinct nodes. + + name: The name of the placement group. + lifetime: Either `None`, which defaults to the placement group + will fate share with its creator and will be deleted once its + creator is dead, or "detached", which means the placement group + will live as a global object independent of the creator. + _max_cpu_fraction_per_node: (Experimental) Disallow placing bundles on nodes + if it would cause the fraction of CPUs used by bundles from *any* placement + group on the node to exceed this fraction. This effectively sets aside + CPUs that placement groups cannot occupy on nodes. when + `max_cpu_fraction_per_node < 1.0`, at least 1 CPU will be excluded from + placement group scheduling. Note: This feature is experimental and is not + recommended for use with autoscaling clusters (scale-up will not trigger + properly). + _soft_target_node_id: (Private, Experimental) Soft hint where bundles of + this placement group should be placed. + The target node is specified by it's hex ID. + If the target node has no available resources or died, + bundles can be placed elsewhere. + This currently only works with STRICT_PACK pg. + + Raises: + ValueError: if bundle type is not a list. + ValueError: if empty bundle or empty resource bundles are given. + ValueError: if the wrong lifetime arguments are given. + + Return: + PlacementGroup: Placement group object. 
+ """ + + worker = ray._private.worker.global_worker + worker.check_connected() + + validate_placement_group( + bundles=bundles, + strategy=strategy, + lifetime=lifetime, + _max_cpu_fraction_per_node=_max_cpu_fraction_per_node, + _soft_target_node_id=_soft_target_node_id, + ) + + if lifetime == "detached": + detached = True + else: + detached = False + + placement_group_id = worker.core_worker.create_placement_group( + name, + bundles, + strategy, + detached, + _max_cpu_fraction_per_node, + _soft_target_node_id, + ) + + return PlacementGroup(placement_group_id) + + +@PublicAPI +@client_mode_wrap +def remove_placement_group(placement_group: PlacementGroup) -> None: + """Asynchronously remove placement group. + + Args: + placement_group: The placement group to delete. + """ + assert placement_group is not None + worker = ray._private.worker.global_worker + worker.check_connected() + + worker.core_worker.remove_placement_group(placement_group.id) + + +@PublicAPI +@client_mode_wrap +def get_placement_group(placement_group_name: str) -> PlacementGroup: + """Get a placement group object with a global name. + + Returns: + None if can't find a placement group with the given name. + The placement group object otherwise. + """ + if not placement_group_name: + raise ValueError("Please supply a non-empty value to get_placement_group") + worker = ray._private.worker.global_worker + worker.check_connected() + placement_group_info = ray._private.state.state.get_placement_group_by_name( + placement_group_name, worker.namespace + ) + if placement_group_info is None: + raise ValueError( + f"Failed to look up placement group with name: {placement_group_name}" + ) + else: + return PlacementGroup( + PlacementGroupID(hex_to_binary(placement_group_info["placement_group_id"])) + ) + + +@DeveloperAPI +@client_mode_wrap +def placement_group_table(placement_group: PlacementGroup = None) -> dict: + """Get the state of the placement group from GCS. + + Args: + placement_group: placement group to see + states. + """ + worker = ray._private.worker.global_worker + worker.check_connected() + placement_group_id = placement_group.id if (placement_group is not None) else None + return ray._private.state.state.placement_group_table(placement_group_id) + + +@PublicAPI +def get_current_placement_group() -> Optional[PlacementGroup]: + """Get the current placement group which a task or actor is using. + + It returns None if there's no current placement group for the worker. + For example, if you call this method in your driver, it returns None + (because drivers never belong to any placement group). + + Examples: + .. testcode:: + + import ray + from ray.util.placement_group import get_current_placement_group + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + + @ray.remote + def f(): + # This returns the placement group the task f belongs to. + # It means this pg is identical to the pg created below. + return get_current_placement_group() + + pg = ray.util.placement_group([{"CPU": 2}]) + assert ray.get(f.options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg)).remote()) == pg + + # Driver doesn't belong to any placement group, + # so it returns None. + assert get_current_placement_group() is None + + Return: + PlacementGroup: Placement group object. + None if the current task or actor wasn't + created with any placement group. + """ + auto_init_ray() + if client_mode_should_convert(): + # Client mode is only a driver. 
+ return None + worker = ray._private.worker.global_worker + worker.check_connected() + pg_id = worker.placement_group_id + if pg_id.is_nil(): + return None + return PlacementGroup(pg_id) + + +def check_placement_group_index( + placement_group: PlacementGroup, bundle_index: int +) -> None: + assert placement_group is not None + if placement_group.id.is_nil(): + if bundle_index != -1: + raise ValueError( + "If placement group is not set, " + "the value of bundle index must be -1." + ) + elif bundle_index >= placement_group.bundle_count or bundle_index < -1: + raise ValueError( + f"placement group bundle index {bundle_index} " + f"is invalid. Valid placement group indexes: " + f"0-{placement_group.bundle_count}" + ) + + +def validate_placement_group( + bundles: List[Dict[str, float]], + strategy: str = "PACK", + lifetime: Optional[str] = None, + _max_cpu_fraction_per_node: float = 1.0, + _soft_target_node_id: Optional[str] = None, +) -> bool: + """Validates inputs for placement_group. + + Raises ValueError if inputs are invalid. + """ + + assert _max_cpu_fraction_per_node is not None + + if _max_cpu_fraction_per_node <= 0 or _max_cpu_fraction_per_node > 1: + raise ValueError( + "Invalid argument `_max_cpu_fraction_per_node`: " + f"{_max_cpu_fraction_per_node}. " + "_max_cpu_fraction_per_node must be a float between 0 and 1. " + ) + + if _soft_target_node_id and strategy != "STRICT_PACK": + raise ValueError( + "_soft_target_node_id currently only works " + f"with STRICT_PACK but got {strategy}" + ) + + if _soft_target_node_id and ray.NodeID.from_hex(_soft_target_node_id).is_nil(): + raise ValueError( + f"Invalid hex ID of _soft_target_node_id, got {_soft_target_node_id}" + ) + + _validate_bundles(bundles) + + if strategy not in VALID_PLACEMENT_GROUP_STRATEGIES: + raise ValueError( + f"Invalid placement group strategy {strategy}. " + f"Supported strategies are: {VALID_PLACEMENT_GROUP_STRATEGIES}." + ) + + if lifetime not in [None, "detached"]: + raise ValueError( + "Placement group `lifetime` argument must be either `None` or " + f"'detached'. Got {lifetime}." + ) + + +def _validate_bundles(bundles: List[Dict[str, float]]): + """Validates each bundle and raises a ValueError if any bundle is invalid.""" + + if not isinstance(bundles, list): + raise ValueError( + "Placement group bundles must be a list, " f"got {type(bundles)}." + ) + + if len(bundles) == 0: + raise ValueError( + "Bundles must be a non-empty list of resource " + 'dictionaries. For example: `[{"CPU": 1.0}, {"GPU": 1.0}]`. ' + "Got empty list instead." + ) + + for bundle in bundles: + if ( + not isinstance(bundle, dict) + or not all(isinstance(k, str) for k in bundle.keys()) + or not all(isinstance(v, (int, float)) for v in bundle.values()) + ): + raise ValueError( + "Bundles must be a non-empty list of " + "resource dictionaries. For example: " + '`[{"CPU": 1.0}, {"GPU": 1.0}]`.' + ) + + if len(bundle) == 0 or all( + resource_value == 0 for resource_value in bundle.values() + ): + raise ValueError( + "Bundles cannot be an empty dictionary or " + f"resources with only 0 values. Bundles: {bundles}" + ) + + if "object_store_memory" in bundle.keys(): + warnings.warn( + "Setting 'object_store_memory' for" + " bundles is deprecated since it doesn't actually" + " reserve the required object store memory." 
+ f" Use object spilling that's enabled by default (https://docs.ray.io/en/{get_ray_doc_version()}/ray-core/objects/object-spilling.html) " # noqa: E501 + "instead to bypass the object store memory size limitation.", + DeprecationWarning, + stacklevel=1, + ) + + +def _valid_resource_shape(resources, bundle_specs): + """ + If the resource shape cannot fit into every + bundle spec, return False + """ + for bundle in bundle_specs: + fit_in_bundle = True + for resource, requested_val in resources.items(): + # Skip "bundle" resource as it is automatically added + # to all nodes with bundles by the placement group. + if resource == ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME: + continue + if bundle.get(resource, 0) < requested_val: + fit_in_bundle = False + break + if fit_in_bundle: + # If resource request fits in any bundle, it is valid. + return True + return False + + +def _validate_resource_shape( + placement_group, resources, placement_resources, task_or_actor_repr +): + bundles = placement_group.bundle_specs + resources_valid = _valid_resource_shape(resources, bundles) + placement_resources_valid = _valid_resource_shape(placement_resources, bundles) + + if not resources_valid: + raise ValueError( + f"Cannot schedule {task_or_actor_repr} with " + "the placement group because the resource request " + f"{resources} cannot fit into any bundles for " + f"the placement group, {bundles}." + ) + if not placement_resources_valid: + # Happens for the default actor case. + # placement_resources is not an exposed concept to users, + # so we should write more specialized error messages. + raise ValueError( + f"Cannot schedule {task_or_actor_repr} with " + "the placement group because the actor requires " + f"{placement_resources.get('CPU', 0)} CPU for " + "creation, but it cannot " + f"fit into any bundles for the placement group, " + f"{bundles}. Consider " + "creating a placement group with CPU resources." + ) + + +def _configure_placement_group_based_on_context( + placement_group_capture_child_tasks: bool, + bundle_index: int, + resources: Dict, + placement_resources: Dict, + task_or_actor_repr: str, + placement_group: Union[PlacementGroup, str, None] = "default", +) -> PlacementGroup: + """Configure the placement group based on the given context. + + Based on the given context, this API returns the placement group instance + for task/actor scheduling. + + Params: + placement_group_capture_child_tasks: Whether or not the + placement group needs to be captured from the global + context. + bundle_index: The bundle index for tasks/actor scheduling. + resources: The scheduling resources. + placement_resources: The scheduling placement resources for + actors. + task_or_actor_repr: The repr of task or actor + function/class descriptor. + placement_group: The placement group instance. + - "default": Default placement group argument. Currently, + the default behavior is to capture the parent task' + placement group if placement_group_capture_child_tasks + is set. + - None: means placement group is explicitly not configured. + - Placement group instance: In this case, do nothing. + + Returns: + Placement group instance based on the given context. + + Raises: + ValueError: If the bundle index is invalid for the placement group + or the requested resources shape doesn't fit to any + bundles. + """ + # Validate inputs. + assert placement_group_capture_child_tasks is not None + assert resources is not None + + # Validate and get the PlacementGroup instance. 
+ # Placement group could be None, default, or placement group. + # Default behavior is "do not capture child tasks". + if placement_group != "default": + if not placement_group: + placement_group = PlacementGroup.empty() + elif placement_group == "default": + if placement_group_capture_child_tasks: + placement_group = get_current_placement_group() + else: + placement_group = PlacementGroup.empty() + + if not placement_group: + placement_group = PlacementGroup.empty() + assert isinstance(placement_group, PlacementGroup) + + # Validate the index. + check_placement_group_index(placement_group, bundle_index) + + # Validate the shape. + if not placement_group.is_empty: + _validate_resource_shape( + placement_group, resources, placement_resources, task_or_actor_repr + ) + return placement_group diff --git a/.venv/lib/python3.11/site-packages/ray/util/queue.py b/.venv/lib/python3.11/site-packages/ray/util/queue.py new file mode 100644 index 0000000000000000000000000000000000000000..b97bd80d137138115d474abe10b2316ddcd04c66 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/queue.py @@ -0,0 +1,305 @@ +import asyncio +from typing import Optional, Any, List, Dict +from collections.abc import Iterable + +import ray +from ray.util.annotations import PublicAPI + + +@PublicAPI(stability="beta") +class Empty(Exception): + pass + + +@PublicAPI(stability="beta") +class Full(Exception): + pass + + +@PublicAPI(stability="beta") +class Queue: + """A first-in, first-out queue implementation on Ray. + + The behavior and use cases are similar to those of the asyncio.Queue class. + + Features both sync and async put and get methods. Provides the option to + block until space is available when calling put on a full queue, + or to block until items are available when calling get on an empty queue. + + Optionally supports batched put and get operations to minimize + serialization overhead. + + Args: + maxsize (optional, int): maximum size of the queue. If zero, size is + unbounded. + actor_options (optional, Dict): Dictionary of options to pass into + the QueueActor during creation. These are directly passed into + QueueActor.options(...). This could be useful if you + need to pass in custom resource requirements, for example. + + Examples: + .. testcode:: + + from ray.util.queue import Queue + q = Queue() + items = list(range(10)) + for item in items: + q.put(item) + for item in items: + assert item == q.get() + # Create Queue with the underlying actor reserving 1 CPU. + q = Queue(actor_options={"num_cpus": 1}) + """ + + def __init__(self, maxsize: int = 0, actor_options: Optional[Dict] = None) -> None: + from ray._private.usage.usage_lib import record_library_usage + + record_library_usage("util.Queue") + + actor_options = actor_options or {} + self.maxsize = maxsize + self.actor = ( + ray.remote(_QueueActor).options(**actor_options).remote(self.maxsize) + ) + + def __len__(self) -> int: + return self.size() + + def size(self) -> int: + """The size of the queue.""" + return ray.get(self.actor.qsize.remote()) + + def qsize(self) -> int: + """The size of the queue.""" + return self.size() + + def empty(self) -> bool: + """Whether the queue is empty.""" + return ray.get(self.actor.empty.remote()) + + def full(self) -> bool: + """Whether the queue is full.""" + return ray.get(self.actor.full.remote()) + + def put( + self, item: Any, block: bool = True, timeout: Optional[float] = None + ) -> None: + """Adds an item to the queue. 
+ + If block is True and the queue is full, blocks until the queue is no + longer full or until timeout. + + There is no guarantee of order if multiple producers put to the same + full queue. + + Raises: + Full: if the queue is full and blocking is False. + Full: if the queue is full, blocking is True, and it timed out. + ValueError: if timeout is negative. + """ + if not block: + try: + ray.get(self.actor.put_nowait.remote(item)) + except asyncio.QueueFull: + raise Full + else: + if timeout is not None and timeout < 0: + raise ValueError("'timeout' must be a non-negative number") + else: + ray.get(self.actor.put.remote(item, timeout)) + + async def put_async( + self, item: Any, block: bool = True, timeout: Optional[float] = None + ) -> None: + """Adds an item to the queue. + + If block is True and the queue is full, + blocks until the queue is no longer full or until timeout. + + There is no guarantee of order if multiple producers put to the same + full queue. + + Raises: + Full: if the queue is full and blocking is False. + Full: if the queue is full, blocking is True, and it timed out. + ValueError: if timeout is negative. + """ + if not block: + try: + await self.actor.put_nowait.remote(item) + except asyncio.QueueFull: + raise Full + else: + if timeout is not None and timeout < 0: + raise ValueError("'timeout' must be a non-negative number") + else: + await self.actor.put.remote(item, timeout) + + def get(self, block: bool = True, timeout: Optional[float] = None) -> Any: + """Gets an item from the queue. + + If block is True and the queue is empty, blocks until the queue is no + longer empty or until timeout. + + There is no guarantee of order if multiple consumers get from the + same empty queue. + + Returns: + The next item in the queue. + + Raises: + Empty: if the queue is empty and blocking is False. + Empty: if the queue is empty, blocking is True, and it timed out. + ValueError: if timeout is negative. + """ + if not block: + try: + return ray.get(self.actor.get_nowait.remote()) + except asyncio.QueueEmpty: + raise Empty + else: + if timeout is not None and timeout < 0: + raise ValueError("'timeout' must be a non-negative number") + else: + return ray.get(self.actor.get.remote(timeout)) + + async def get_async( + self, block: bool = True, timeout: Optional[float] = None + ) -> Any: + """Gets an item from the queue. + + There is no guarantee of order if multiple consumers get from the + same empty queue. + + Returns: + The next item in the queue. + Raises: + Empty: if the queue is empty and blocking is False. + Empty: if the queue is empty, blocking is True, and it timed out. + ValueError: if timeout is negative. + """ + if not block: + try: + return await self.actor.get_nowait.remote() + except asyncio.QueueEmpty: + raise Empty + else: + if timeout is not None and timeout < 0: + raise ValueError("'timeout' must be a non-negative number") + else: + return await self.actor.get.remote(timeout) + + def put_nowait(self, item: Any) -> None: + """Equivalent to put(item, block=False). + + Raises: + Full: if the queue is full. + """ + return self.put(item, block=False) + + def put_nowait_batch(self, items: Iterable) -> None: + """Takes in a list of items and puts them into the queue in order. + + Raises: + Full: if the items will not fit in the queue + """ + if not isinstance(items, Iterable): + raise TypeError("Argument 'items' must be an Iterable") + + ray.get(self.actor.put_nowait_batch.remote(items)) + + def get_nowait(self) -> Any: + """Equivalent to get(block=False). 
+ + Raises: + Empty: if the queue is empty. + """ + return self.get(block=False) + + def get_nowait_batch(self, num_items: int) -> List[Any]: + """Gets items from the queue and returns them in a + list in order. + + Raises: + Empty: if the queue does not contain the desired number of items + """ + if not isinstance(num_items, int): + raise TypeError("Argument 'num_items' must be an int") + if num_items < 0: + raise ValueError("'num_items' must be nonnegative") + + return ray.get(self.actor.get_nowait_batch.remote(num_items)) + + def shutdown(self, force: bool = False, grace_period_s: int = 5) -> None: + """Terminates the underlying QueueActor. + + All of the resources reserved by the queue will be released. + + Args: + force: If True, forcefully kill the actor, causing an + immediate failure. If False, graceful + actor termination will be attempted first, before falling back + to a forceful kill. + grace_period_s: If force is False, how long in seconds to + wait for graceful termination before falling back to + forceful kill. + """ + if self.actor: + if force: + ray.kill(self.actor, no_restart=True) + else: + done_ref = self.actor.__ray_terminate__.remote() + done, not_done = ray.wait([done_ref], timeout=grace_period_s) + if not_done: + ray.kill(self.actor, no_restart=True) + self.actor = None + + +class _QueueActor: + def __init__(self, maxsize): + self.maxsize = maxsize + self.queue = asyncio.Queue(self.maxsize) + + def qsize(self): + return self.queue.qsize() + + def empty(self): + return self.queue.empty() + + def full(self): + return self.queue.full() + + async def put(self, item, timeout=None): + try: + await asyncio.wait_for(self.queue.put(item), timeout) + except asyncio.TimeoutError: + raise Full + + async def get(self, timeout=None): + try: + return await asyncio.wait_for(self.queue.get(), timeout) + except asyncio.TimeoutError: + raise Empty + + def put_nowait(self, item): + self.queue.put_nowait(item) + + def put_nowait_batch(self, items): + # If maxsize is 0, queue is unbounded, so no need to check size. + if self.maxsize > 0 and len(items) + self.qsize() > self.maxsize: + raise Full( + f"Cannot add {len(items)} items to queue of size " + f"{self.qsize()} and maxsize {self.maxsize}." + ) + for item in items: + self.queue.put_nowait(item) + + def get_nowait(self): + return self.queue.get_nowait() + + def get_nowait_batch(self, num_items): + if num_items > self.qsize(): + raise Empty( + f"Cannot get {num_items} items from queue of size " f"{self.qsize()}." 
+            )
+        return [self.queue.get_nowait() for _ in range(num_items)]
diff --git a/.venv/lib/python3.11/site-packages/ray/util/rpdb.py b/.venv/lib/python3.11/site-packages/ray/util/rpdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..89700466b0e3c3bae4efeed600ad3d8d1a2c6215
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/rpdb.py
@@ -0,0 +1,380 @@
+# Some code in this file is from
+# https://github.com/ionelmc/python-remote-pdb/blob/07d563331c4ab9eb45731bb272b158816d98236e/src/remote_pdb.py
+# (BSD 2-Clause "Simplified" License)
+
+import errno
+import inspect
+import json
+import logging
+import os
+import re
+import select
+import socket
+import sys
+import time
+import traceback
+import uuid
+from pdb import Pdb
+from typing import Callable
+
+import setproctitle
+
+import ray
+from ray._private import ray_constants
+from ray.experimental.internal_kv import _internal_kv_del, _internal_kv_put
+from ray.util.annotations import DeveloperAPI
+
+log = logging.getLogger(__name__)
+
+
+def _cry(message, stderr=sys.__stderr__):
+    print(message, file=stderr)
+    stderr.flush()
+
+
+class _LF2CRLF_FileWrapper(object):
+    def __init__(self, connection):
+        self.connection = connection
+        self.stream = fh = connection.makefile("rw")
+        self.read = fh.read
+        self.readline = fh.readline
+        self.readlines = fh.readlines
+        self.close = fh.close
+        self.flush = fh.flush
+        self.fileno = fh.fileno
+        if hasattr(fh, "encoding"):
+            self._send = lambda data: connection.sendall(
+                data.encode(fh.encoding, errors="replace")
+            )
+        else:
+            self._send = connection.sendall
+
+    @property
+    def encoding(self):
+        return self.stream.encoding
+
+    def __iter__(self):
+        return self.stream.__iter__()
+
+    def write(self, data, nl_rex=re.compile("\r?\n")):
+        data = nl_rex.sub("\r\n", data)
+        self._send(data)
+
+    def writelines(self, lines, nl_rex=re.compile("\r?\n")):
+        for line in lines:
+            self.write(line, nl_rex)
+
+
+class _PdbWrap(Pdb):
+    """Wrap PDB to run a custom exit hook on continue."""
+
+    def __init__(self, exit_hook: Callable[[], None]):
+        self._exit_hook = exit_hook
+        Pdb.__init__(self)
+
+    def do_continue(self, arg):
+        self._exit_hook()
+        return Pdb.do_continue(self, arg)
+
+    do_c = do_cont = do_continue
+
+
+class _RemotePdb(Pdb):
+    """
+    This will run pdb as an ephemeral telnet service. Once you connect, no one
+    else can connect. On construction this object will block execution until a
+    client has connected.
+    Based on https://github.com/tamentis/rpdb I think ...
+    To use this::
+        RemotePdb(host="0.0.0.0", port=4444).set_trace()
+    Then run: telnet 127.0.0.1 4444
+    """
+
+    active_instance = None
+
+    def __init__(
+        self,
+        breakpoint_uuid,
+        host,
+        port,
+        ip_address,
+        patch_stdstreams=False,
+        quiet=False,
+    ):
+        self._breakpoint_uuid = breakpoint_uuid
+        self._quiet = quiet
+        self._patch_stdstreams = patch_stdstreams
+        self._listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self._listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, True)
+        self._listen_socket.bind((host, port))
+        self._ip_address = ip_address
+
+    def listen(self):
+        if not self._quiet:
+            _cry(
+                "RemotePdb session open at %s:%s, "
+                "use 'ray debug' to connect..."
+ % (self._ip_address, self._listen_socket.getsockname()[1]) + ) + self._listen_socket.listen(1) + connection, address = self._listen_socket.accept() + if not self._quiet: + _cry(f"RemotePdb accepted connection from {address}") + self.handle = _LF2CRLF_FileWrapper(connection) + Pdb.__init__( + self, + completekey="tab", + stdin=self.handle, + stdout=self.handle, + skip=["ray.*"], + ) + self.backup = [] + if self._patch_stdstreams: + for name in ( + "stderr", + "stdout", + "__stderr__", + "__stdout__", + "stdin", + "__stdin__", + ): + self.backup.append((name, getattr(sys, name))) + setattr(sys, name, self.handle) + _RemotePdb.active_instance = self + + def __restore(self): + if self.backup and not self._quiet: + _cry("Restoring streams: %s ..." % self.backup) + for name, fh in self.backup: + setattr(sys, name, fh) + self.handle.close() + _RemotePdb.active_instance = None + + def do_quit(self, arg): + self.__restore() + return Pdb.do_quit(self, arg) + + do_q = do_exit = do_quit + + def do_continue(self, arg): + self.__restore() + self.handle.connection.close() + return Pdb.do_continue(self, arg) + + do_c = do_cont = do_continue + + def set_trace(self, frame=None): + if frame is None: + frame = sys._getframe().f_back + try: + Pdb.set_trace(self, frame) + except IOError as exc: + if exc.errno != errno.ECONNRESET: + raise + + def post_mortem(self, traceback=None): + # See https://github.com/python/cpython/blob/ + # 022bc7572f061e1d1132a4db9d085b29707701e7/Lib/pdb.py#L1617 + try: + t = sys.exc_info()[2] + self.reset() + Pdb.interaction(self, None, t) + except IOError as exc: + if exc.errno != errno.ECONNRESET: + raise + + def do_remote(self, arg): + """remote + Skip into the next remote call. + """ + # Tell the next task to drop into the debugger. + ray._private.worker.global_worker.debugger_breakpoint = self._breakpoint_uuid + # Tell the debug loop to connect to the next task. + data = json.dumps( + { + "job_id": ray.get_runtime_context().get_job_id(), + } + ) + _internal_kv_put( + "RAY_PDB_CONTINUE_{}".format(self._breakpoint_uuid), + data, + namespace=ray_constants.KV_NAMESPACE_PDB, + ) + self.__restore() + self.handle.connection.close() + return Pdb.do_continue(self, arg) + + def do_get(self, arg): + """get + Skip to where the current task returns to. + """ + ray._private.worker.global_worker.debugger_get_breakpoint = ( + self._breakpoint_uuid + ) + self.__restore() + self.handle.connection.close() + return Pdb.do_continue(self, arg) + + +def _connect_ray_pdb( + host=None, + port=None, + patch_stdstreams=False, + quiet=None, + breakpoint_uuid=None, + debugger_external=False, +): + """ + Opens a remote PDB on first available port. 
+ """ + if debugger_external: + assert not host, "Cannot specify both host and debugger_external" + host = "0.0.0.0" + elif host is None: + host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1") + if port is None: + port = int(os.environ.get("REMOTE_PDB_PORT", "0")) + if quiet is None: + quiet = bool(os.environ.get("REMOTE_PDB_QUIET", "")) + if not breakpoint_uuid: + breakpoint_uuid = uuid.uuid4().hex + if debugger_external: + ip_address = ray._private.worker.global_worker.node_ip_address + else: + ip_address = "localhost" + rdb = _RemotePdb( + breakpoint_uuid=breakpoint_uuid, + host=host, + port=port, + ip_address=ip_address, + patch_stdstreams=patch_stdstreams, + quiet=quiet, + ) + sockname = rdb._listen_socket.getsockname() + pdb_address = "{}:{}".format(ip_address, sockname[1]) + parentframeinfo = inspect.getouterframes(inspect.currentframe())[2] + data = { + "proctitle": setproctitle.getproctitle(), + "pdb_address": pdb_address, + "filename": parentframeinfo.filename, + "lineno": parentframeinfo.lineno, + "traceback": "\n".join(traceback.format_exception(*sys.exc_info())), + "timestamp": time.time(), + "job_id": ray.get_runtime_context().get_job_id(), + "node_id": ray.get_runtime_context().get_node_id(), + "worker_id": ray.get_runtime_context().get_worker_id(), + "actor_id": ray.get_runtime_context().get_actor_id(), + "task_id": ray.get_runtime_context().get_task_id(), + } + _internal_kv_put( + "RAY_PDB_{}".format(breakpoint_uuid), + json.dumps(data), + overwrite=True, + namespace=ray_constants.KV_NAMESPACE_PDB, + ) + rdb.listen() + _internal_kv_del( + "RAY_PDB_{}".format(breakpoint_uuid), namespace=ray_constants.KV_NAMESPACE_PDB + ) + + return rdb + + +@DeveloperAPI +def set_trace(breakpoint_uuid=None): + """Interrupt the flow of the program and drop into the Ray debugger. + + Can be used within a Ray task or actor. + """ + if os.environ.get("RAY_DEBUG", "1") == "1": + return ray.util.ray_debugpy.set_trace(breakpoint_uuid) + if os.environ.get("RAY_DEBUG", "1") == "legacy": + # If there is an active debugger already, we do not want to + # start another one, so "set_trace" is just a no-op in that case. + if ray._private.worker.global_worker.debugger_breakpoint == b"": + frame = sys._getframe().f_back + rdb = _connect_ray_pdb( + host=None, + port=None, + patch_stdstreams=False, + quiet=None, + breakpoint_uuid=breakpoint_uuid.decode() if breakpoint_uuid else None, + debugger_external=ray._private.worker.global_worker.ray_debugger_external, # noqa: E501 + ) + rdb.set_trace(frame=frame) + + +def _driver_set_trace(): + """The breakpoint hook to use for the driver. 
+ + This disables Ray driver logs temporarily so that the PDB console is not + spammed: https://github.com/ray-project/ray/issues/18172 + """ + if os.environ.get("RAY_DEBUG", "1") == "1": + return ray.util.ray_debugpy.set_trace() + if os.environ.get("RAY_DEBUG", "1") == "legacy": + print("*** Temporarily disabling Ray worker logs ***") + ray._private.worker._worker_logs_enabled = False + + def enable_logging(): + print("*** Re-enabling Ray worker logs ***") + ray._private.worker._worker_logs_enabled = True + + pdb = _PdbWrap(enable_logging) + frame = sys._getframe().f_back + pdb.set_trace(frame) + + +def _is_ray_debugger_post_mortem_enabled(): + return os.environ.get("RAY_DEBUG_POST_MORTEM", "0") == "1" + + +def _post_mortem(): + if os.environ.get("RAY_DEBUG", "1") == "1": + return ray.util.ray_debugpy._post_mortem() + + rdb = _connect_ray_pdb( + host=None, + port=None, + patch_stdstreams=False, + quiet=None, + debugger_external=ray._private.worker.global_worker.ray_debugger_external, + ) + rdb.post_mortem() + + +def _connect_pdb_client(host, port): + if sys.platform == "win32": + import msvcrt + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, port)) + + while True: + # Get the list of sockets which are readable. + if sys.platform == "win32": + ready_to_read = select.select([s], [], [], 1)[0] + if msvcrt.kbhit(): + ready_to_read.append(sys.stdin) + if not ready_to_read and not sys.stdin.isatty(): + # in tests, when using pexpect, the pipe makes + # the msvcrt.kbhit() trick fail. Assume we are waiting + # for stdin, since this will block waiting for input + ready_to_read.append(sys.stdin) + else: + ready_to_read, write_sockets, error_sockets = select.select( + [sys.stdin, s], [], [] + ) + + for sock in ready_to_read: + if sock == s: + # Incoming message from remote debugger. + data = sock.recv(4096) + if not data: + return + else: + sys.stdout.write(data.decode()) + sys.stdout.flush() + else: + # User entered a message. + msg = sys.stdin.readline() + s.send(msg.encode()) diff --git a/.venv/lib/python3.11/site-packages/ray/util/scheduling_strategies.py b/.venv/lib/python3.11/site-packages/ray/util/scheduling_strategies.py new file mode 100644 index 0000000000000000000000000000000000000000..b283aed5046541749487294df751da978faecc3f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/scheduling_strategies.py @@ -0,0 +1,197 @@ +from typing import Dict, Union, Optional, TYPE_CHECKING +from ray.util.annotations import PublicAPI + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +# "DEFAULT": The default hybrid scheduling strategy +# based on config scheduler_spread_threshold. +# This disables any potential placement group capture. + +# "SPREAD": Spread scheduling on a best effort basis. + + +@PublicAPI +class PlacementGroupSchedulingStrategy: + """Placement group based scheduling strategy. + + Attributes: + placement_group: the placement group this actor belongs to, + or None if it doesn't belong to any group. + placement_group_bundle_index: the index of the bundle + if the actor belongs to a placement group, which may be -1 to + specify any available bundle. + placement_group_capture_child_tasks: Whether or not children tasks + of this actor should implicitly use the same placement group + as its parent. It is False by default. 
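A short illustration of passing this strategy to a task via .options() (standard Ray APIs; the bundle spec is illustrative):

    import ray
    from ray.util.placement_group import placement_group
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

    ray.init()
    pg = placement_group([{"CPU": 1}])
    ray.get(pg.ready())

    @ray.remote(num_cpus=1)
    def pinned():
        return "scheduled inside the placement group"

    ref = pinned.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_bundle_index=0,
        )
    ).remote()
    print(ray.get(ref))
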
+ """ + + def __init__( + self, + placement_group: "PlacementGroup", + placement_group_bundle_index: int = -1, + placement_group_capture_child_tasks: Optional[bool] = None, + ): + self.placement_group = placement_group + self.placement_group_bundle_index = placement_group_bundle_index + self.placement_group_capture_child_tasks = placement_group_capture_child_tasks + + +@PublicAPI +class NodeAffinitySchedulingStrategy: + """Static scheduling strategy used to run a task or actor on a particular node. + + Attributes: + node_id: the hex id of the node where the task or actor should run. + soft: whether the scheduler should run the task or actor somewhere else + if the target node doesn't exist (e.g. the node dies) or is infeasible + during scheduling. + If the node exists and is feasible, the task or actor + will only be scheduled there. + This means if the node doesn't have the available resources, + the task or actor will wait indefinitely until resources become available. + If the node doesn't exist or is infeasible, the task or actor + will fail if soft is False + or be scheduled somewhere else if soft is True. + """ + + def __init__( + self, + node_id: str, + soft: bool, + _spill_on_unavailable: bool = False, + _fail_on_unavailable: bool = False, + ): + # This will be removed once we standardize on node id being hex string. + if not isinstance(node_id, str): + node_id = node_id.hex() + + self.node_id = node_id + self.soft = soft + self._spill_on_unavailable = _spill_on_unavailable + self._fail_on_unavailable = _fail_on_unavailable + + +def _validate_label_match_operator_values(values, operator): + if not values: + raise ValueError( + f"The variadic parameter of the {operator} operator" + f' must be a non-empty tuple: e.g. {operator}("value1", "value2").' + ) + + index = 0 + for value in values: + if not isinstance(value, str): + raise ValueError( + f"Type of value in position {index} for the {operator} operator " + f'must be str (e.g. {operator}("value1", "value2")) ' + f"but got {str(value)} of type {type(value)}." 
+            )
+        index = index + 1
+
+
+@PublicAPI(stability="alpha")
+class In:
+    def __init__(self, *values):
+        _validate_label_match_operator_values(values, "In")
+        self.values = list(values)
+
+
+@PublicAPI(stability="alpha")
+class NotIn:
+    def __init__(self, *values):
+        _validate_label_match_operator_values(values, "NotIn")
+        self.values = list(values)
+
+
+@PublicAPI(stability="alpha")
+class Exists:
+    def __init__(self):
+        pass
+
+
+@PublicAPI(stability="alpha")
+class DoesNotExist:
+    def __init__(self):
+        pass
+
+
+class _LabelMatchExpression:
+    """An expression used to select nodes by their labels.
+    Attributes:
+        key: the label key
+        operator: one of In, NotIn, Exists, or DoesNotExist
+    """
+
+    def __init__(self, key: str, operator: Union[In, NotIn, Exists, DoesNotExist]):
+        self.key = key
+        self.operator = operator
+
+
+LabelMatchExpressionsT = Dict[str, Union[In, NotIn, Exists, DoesNotExist]]
+
+
+@PublicAPI(stability="alpha")
+class NodeLabelSchedulingStrategy:
+    """
+    Label-based node affinity scheduling strategy, e.g.:
+
+    scheduling_strategy=NodeLabelSchedulingStrategy({
+        "region": In("us"),
+        "gpu_type": Exists(),
+    })
+    """
+
+    def __init__(
+        self, hard: LabelMatchExpressionsT, *, soft: LabelMatchExpressionsT = None
+    ):
+        self.hard = _convert_map_to_expressions(hard, "hard")
+        self.soft = _convert_map_to_expressions(soft, "soft")
+        self._check_usage()
+
+    def _check_usage(self):
+        if not (self.hard or self.soft):
+            raise ValueError(
+                "The `hard` and `soft` parameters "
+                "of NodeLabelSchedulingStrategy cannot both be empty."
+            )
+
+
+def _convert_map_to_expressions(map_expressions: LabelMatchExpressionsT, param: str):
+    expressions = []
+    if map_expressions is None:
+        return expressions
+
+    if not isinstance(map_expressions, Dict):
+        raise ValueError(
+            f'The {param} parameter must be a map (e.g. {{"key1": In("value1")}}) '
+            f"but got type {type(map_expressions)}."
+        )
+
+    for key, value in map_expressions.items():
+        if not isinstance(key, str):
+            raise ValueError(
+                f"The map key of the {param} parameter must "
+                f'be of type str (e.g. {{"key1": In("value1")}}) '
+                f"but got {str(key)} of type {type(key)}."
+            )
+
+        if not isinstance(value, (In, NotIn, Exists, DoesNotExist)):
+            raise ValueError(
+                f"The map value for key {key} of the {param} parameter "
+                f"must be one of the `In`, `NotIn`, `Exists` or `DoesNotExist` "
+                f'operators (e.g. {{"key1": In("value1")}}) '
+                f"but got {str(value)} of type {type(value)}."
+            )
+
+        expressions.append(_LabelMatchExpression(key, value))
+    return expressions
+
+
+SchedulingStrategyT = Union[
+    None,
+    str,  # Literal["DEFAULT", "SPREAD"]
+    PlacementGroupSchedulingStrategy,
+    NodeAffinitySchedulingStrategy,
+    NodeLabelSchedulingStrategy,
+]
diff --git a/.venv/lib/python3.11/site-packages/ray/util/serialization.py b/.venv/lib/python3.11/site-packages/ray/util/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..106e06c8681bc959db3f9ae7e0c64e5c0187bb38
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/serialization.py
@@ -0,0 +1,55 @@
+import ray
+import ray.cloudpickle as pickle
+from ray.util.annotations import DeveloperAPI, PublicAPI
+
+
+@PublicAPI
+def register_serializer(cls: type, *, serializer: callable, deserializer: callable):
+    """Use the given serializer to serialize instances of type ``cls``,
+    and use the deserializer to deserialize the serialized object.
+
+    Args:
+        cls: A Python class/type.
+        serializer: A function that converts an instance of
+            type ``cls`` into a serializable object (e.g.
python dict + of basic objects). + deserializer: A function that constructs the + instance of type ``cls`` from the serialized object. + This function itself must be serializable. + """ + context = ray._private.worker.global_worker.get_serialization_context() + context._register_cloudpickle_serializer(cls, serializer, deserializer) + + +@PublicAPI +def deregister_serializer(cls: type): + """Deregister the serializer associated with the type ``cls``. + There is no effect if the serializer is unavailable. + + Args: + cls: A Python class/type. + """ + context = ray._private.worker.global_worker.get_serialization_context() + context._unregister_cloudpickle_reducer(cls) + + +@DeveloperAPI +class StandaloneSerializationContext: + # NOTE(simon): Used for registering custom serializers. We cannot directly + # use the SerializationContext because it requires Ray workers. Please + # make sure to keep the API consistent. + + def _register_cloudpickle_reducer(self, cls, reducer): + pickle.CloudPickler.dispatch[cls] = reducer + + def _unregister_cloudpickle_reducer(self, cls): + pickle.CloudPickler.dispatch.pop(cls, None) + + def _register_cloudpickle_serializer( + self, cls, custom_serializer, custom_deserializer + ): + def _CloudPicklerReducer(obj): + return custom_deserializer, (custom_serializer(obj),) + + # construct a reducer + pickle.CloudPickler.dispatch[cls] = _CloudPicklerReducer diff --git a/.venv/lib/python3.11/site-packages/ray/util/serialization_addons.py b/.venv/lib/python3.11/site-packages/ray/util/serialization_addons.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7390a29b84b6faaa95f25ca377117d3ca944c7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/serialization_addons.py @@ -0,0 +1,39 @@ +""" +This module is intended for implementing internal serializers for some +site packages. 
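Returning to register_serializer above, a minimal sketch of registering a custom reducer for a class (the Money class is illustrative):

    import ray
    from ray.util.serialization import register_serializer, deregister_serializer

    class Money:
        def __init__(self, currency: str, amount: int):
            self.currency = currency
            self.amount = amount

    ray.init()
    register_serializer(
        Money,
        serializer=lambda m: (m.currency, m.amount),
        deserializer=lambda state: Money(*state),
    )
    ref = ray.put(Money("USD", 100))  # serialized via the registered reducer
    deregister_serializer(Money)  # restores the default behavior
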
+""" + +import sys + +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +def register_starlette_serializer(serialization_context): + try: + import starlette.datastructures + except ImportError: + return + + # Starlette's app.state object is not serializable + # because it overrides __getattr__ + serialization_context._register_cloudpickle_serializer( + starlette.datastructures.State, + custom_serializer=lambda s: s._state, + custom_deserializer=lambda s: starlette.datastructures.State(s), + ) + + +@DeveloperAPI +def apply(serialization_context): + from ray._private.pydantic_compat import register_pydantic_serializers + + register_pydantic_serializers(serialization_context) + register_starlette_serializer(serialization_context) + + if sys.platform != "win32": + from ray._private.arrow_serialization import ( + _register_custom_datasets_serializers, + ) + + _register_custom_datasets_serializers(serialization_context) diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__init__.py b/.venv/lib/python3.11/site-packages/ray/util/spark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..edded13240a1e177368e5410cf6f654ea980e028 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/spark/__init__.py @@ -0,0 +1,13 @@ +from ray.util.spark.cluster_init import ( + setup_ray_cluster, + shutdown_ray_cluster, + MAX_NUM_WORKER_NODES, + setup_global_ray_cluster, +) + +__all__ = [ + "setup_ray_cluster", + "shutdown_ray_cluster", + "MAX_NUM_WORKER_NODES", + "setup_global_ray_cluster", +] diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70bc94950f750c61135ef015f2f6d2bd163572f0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/cluster_init.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/cluster_init.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b73cf64b3acda37ba99b701666033aac482fabfb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/cluster_init.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/databricks_hook.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/databricks_hook.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d2db515eb37bce7ab7827bbcfc8e4c9eb454f4d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/databricks_hook.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_hook_base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_hook_base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..737a5bae9305cd5f2fa4c93eed0d8b356084bc36 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_hook_base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_ray_node.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_ray_node.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fb1c51f678ad4a4a461d05c63401fa195778d347
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/start_ray_node.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7c75463d97df782ef752560ade05c165b5283a2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/spark/__pycache__/utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/cluster_init.py b/.venv/lib/python3.11/site-packages/ray/util/spark/cluster_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..edf1610034d22c7c8dceee743a251edb3b6acf7a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/spark/cluster_init.py
@@ -0,0 +1,1919 @@
+import copy
+import signal
+
+import yaml
+import json
+import os
+import socket
+import sys
+import time
+import threading
+import logging
+import uuid
+import warnings
+import requests
+from packaging.version import Version
+from typing import Optional, Dict, Tuple, Type
+
+import ray
+import ray._private.services
+from ray.autoscaler._private.spark.node_provider import HEAD_NODE_ID
+from ray.util.annotations import DeveloperAPI, PublicAPI
+from ray._private.storage import _load_class
+
+from .utils import (
+    exec_cmd,
+    is_port_in_use,
+    get_random_unused_port,
+    get_spark_session,
+    get_spark_application_driver_host,
+    is_in_databricks_runtime,
+    get_spark_task_assigned_physical_gpus,
+    get_avail_mem_per_ray_worker_node,
+    get_max_num_concurrent_tasks,
+    gen_cmd_exec_failure_msg,
+    calc_mem_ray_head_node,
+    _wait_service_up,
+    _get_local_ray_node_slots,
+    get_configured_spark_executor_memory_bytes,
+    _get_cpu_cores,
+    _get_num_physical_gpus,
+)
+from .start_hook_base import RayOnSparkStartHook
+from .databricks_hook import DefaultDatabricksRayOnSparkStartHook
+from threading import Event
+
+
+_logger = logging.getLogger("ray.util.spark")
+_logger.setLevel(logging.INFO)
+
+
+RAY_ON_SPARK_START_HOOK = "RAY_ON_SPARK_START_HOOK"
+
+MAX_NUM_WORKER_NODES = -1
+
+RAY_ON_SPARK_COLLECT_LOG_TO_PATH = "RAY_ON_SPARK_COLLECT_LOG_TO_PATH"
+RAY_ON_SPARK_START_RAY_PARENT_PID = "RAY_ON_SPARK_START_RAY_PARENT_PID"
+
+
+def _check_system_environment():
+    if os.name != "posix":
+        raise RuntimeError("Ray on spark only supports running on POSIX systems.")
+
+    spark_dependency_error = "ray.util.spark module requires pyspark >= 3.3"
+    try:
+        import pyspark
+
+        if Version(pyspark.__version__).release < (3, 3, 0):
+            raise RuntimeError(spark_dependency_error)
+    except ImportError:
+        raise RuntimeError(spark_dependency_error)
+
+
+class RayClusterOnSpark:
+    """
+    This class is the type of instance returned by the `_setup_ray_cluster` interface.
+    Its main functionality is to:
+    Connect to, disconnect from, and shut down the Ray cluster running on Apache Spark.
+    Serve as a Python context manager for the `RayClusterOnSpark` instance.
+
+    Args:
+        address: The url for the ray head node (defined as the hostname and unused
+            port on Spark driver node)
+        head_proc: Ray head process
+        min_worker_nodes: The minimum number of Ray worker nodes in the cluster.
+        max_worker_nodes: The maximum number of Ray worker nodes in the cluster.
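For orientation, the public entry points exported from ray.util.spark above (setup_ray_cluster / shutdown_ray_cluster) drive this machinery roughly as follows; a minimal sketch, argument values illustrative:

    import ray
    from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

    # Start a Ray head node on the Spark driver plus up to 2 Ray worker nodes
    # hosted inside Spark tasks.
    setup_ray_cluster(max_worker_nodes=2)

    ray.init()  # resolves the cluster via the RAY_ADDRESS set during setup
    print(ray.cluster_resources())

    shutdown_ray_cluster()
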
+ """ + + def __init__( + self, + address, + head_proc, + min_worker_nodes, + max_worker_nodes, + temp_dir, + cluster_unique_id, + start_hook, + ray_dashboard_port, + spark_job_server, + global_cluster_lock_fd, + ray_client_server_port, + ): + self.address = address + self.head_proc = head_proc + self.min_worker_nodes = min_worker_nodes + self.max_worker_nodes = max_worker_nodes + self.temp_dir = temp_dir + self.cluster_unique_id = cluster_unique_id + self.start_hook = start_hook + self.ray_dashboard_port = ray_dashboard_port + self.spark_job_server = spark_job_server + self.global_cluster_lock_fd = global_cluster_lock_fd + self.ray_client_server_port = ray_client_server_port + + self.is_shutdown = False + self.spark_job_is_canceled = False + self.background_job_exception = None + + # Ray client context returns by `ray.init` + self.ray_ctx = None + + def wait_until_ready(self): + import ray + + if self.is_shutdown: + raise RuntimeError( + "The ray cluster has been shut down or it failed to start." + ) + + try: + ray.init(address=self.address) + + if self.ray_dashboard_port is not None and _wait_service_up( + self.address.split(":")[0], + self.ray_dashboard_port, + _RAY_DASHBOARD_STARTUP_TIMEOUT, + ): + self.start_hook.on_ray_dashboard_created(self.ray_dashboard_port) + else: + try: + __import__("ray.dashboard.optional_deps") + except ModuleNotFoundError as e: + _logger.warning( + "Dependencies to launch the optional dashboard API " + "server cannot be found. They can be installed with " + f"pip install ray[default], root cause: ({repr(e)})" + ) + + last_alive_worker_count = 0 + last_progress_move_time = time.time() + while True: + time.sleep(_RAY_CLUSTER_STARTUP_PROGRESS_CHECKING_INTERVAL) + + # Inside the waiting ready loop, + # checking `self.background_job_exception`, if it is not None, + # it means the background spark job has failed, + # in this case, raise error directly. + if self.background_job_exception is not None: + raise RuntimeError( + "Ray workers failed to start." + ) from self.background_job_exception + + cur_alive_worker_count = ( + len([node for node in ray.nodes() if node["Alive"]]) - 1 + ) # Minus 1 means excluding the head node. + + if cur_alive_worker_count >= self.min_worker_nodes: + _logger.info( + f"Started {cur_alive_worker_count} Ray worker nodes, " + f"meet the minimum number of Ray worker nodes required." + ) + return + + if cur_alive_worker_count > last_alive_worker_count: + last_alive_worker_count = cur_alive_worker_count + last_progress_move_time = time.time() + _logger.info( + "Ray worker nodes are starting. Progress: " + f"({cur_alive_worker_count} / {self.max_worker_nodes})" + ) + else: + if ( + time.time() - last_progress_move_time + > _RAY_CONNECT_CLUSTER_POLL_PROGRESS_TIMEOUT + ): + if cur_alive_worker_count == 0: + ( + job_server_host, + job_server_port, + ) = self.spark_job_server.server_address[:2] + response = requests.post( + url=( + f"http://{job_server_host}:{job_server_port}" + "/query_last_worker_err" + ), + json={"spark_job_group_id": None}, + ) + response.raise_for_status() + + decoded_resp = response.content.decode("utf-8") + json_res = json.loads(decoded_resp) + last_worker_err = json_res["last_worker_err"] + + if last_worker_err: + raise RuntimeError( + "Starting Ray worker node failed, error:\n" + f"{last_worker_err}" + ) + else: + raise RuntimeError( + "Current spark cluster has no resources to launch " + "Ray worker nodes." + ) + _logger.warning( + "Timeout in waiting for minimal ray workers to start. 
" + "Started / Total requested: " + f"({cur_alive_worker_count} / {self.min_worker_nodes}). " + "Current spark cluster does not have sufficient resources " + "to launch requested minimal number of Ray worker nodes." + ) + return + finally: + ray.shutdown() + + def connect(self): + if ray.is_initialized(): + raise RuntimeError("Already connected to Ray cluster.") + self.ray_ctx = ray.init(address=self.address) + + def disconnect(self): + ray.shutdown() + self.ray_ctx = None + + def shutdown(self): + """ + Shutdown the ray cluster created by the `setup_ray_cluster` API. + """ + import fcntl + + if not self.is_shutdown: + try: + self.disconnect() + except Exception: + pass + os.environ.pop("RAY_ADDRESS", None) + + if self.global_cluster_lock_fd is not None: + # release global mode cluster lock. + fcntl.flock(self.global_cluster_lock_fd, fcntl.LOCK_UN) + + self.spark_job_server.shutdown() + try: + self.head_proc.terminate() + except Exception as e: + # swallow exception. + _logger.warning( + "An Error occurred during shutdown of ray head node: " f"{repr(e)}" + ) + self.is_shutdown = True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.shutdown() + + +def _convert_ray_node_option(key, value): + converted_key = f"--{key.replace('_', '-')}" + if key in ["system_config", "resources", "labels"]: + return f"{converted_key}={json.dumps(value)}" + if value is None: + return converted_key + return f"{converted_key}={str(value)}" + + +def _convert_ray_node_options(options): + return [_convert_ray_node_option(k, v) for k, v in options.items()] + + +_RAY_HEAD_STARTUP_TIMEOUT = 20 +_RAY_DASHBOARD_STARTUP_TIMEOUT = 60 +_BACKGROUND_JOB_STARTUP_WAIT = int( + os.environ.get("RAY_ON_SPARK_BACKGROUND_JOB_STARTUP_WAIT", "30") +) +_RAY_CLUSTER_STARTUP_PROGRESS_CHECKING_INTERVAL = 3 +_RAY_WORKER_NODE_STARTUP_INTERVAL = int( + os.environ.get("RAY_ON_SPARK_RAY_WORKER_NODE_STARTUP_INTERVAL", "10") +) +_RAY_CONNECT_CLUSTER_POLL_PROGRESS_TIMEOUT = 120 + + +def _preallocate_ray_worker_port_range(): + """ + If we start multiple ray workers on a machine concurrently, some ray worker + processes might fail due to ray port conflicts, this is because race condition + on getting free port and opening the free port. + To address the issue, this function use an exclusive file lock to delay the + worker processes to ensure that port acquisition does not create a resource + contention issue due to a race condition. + + After acquiring lock, it will allocate port range for worker ports + (for ray node config --min-worker-port and --max-worker-port). 
+    Because multiple ray clusters might be created on a spark cluster, a single
+    spark worker machine might run multiple ray worker nodes belonging to
+    different ray clusters, and ray nodes on the same machine must use
+    non-overlapping worker port ranges. To achieve this, this function maintains
+    the file `/tmp/ray_on_spark_worker_port_allocation.txt`. The file is composed
+    of multiple lines, each containing 2 numbers: `pid` and
+    `port_range_slot_index`. Each port range slot allocates 1000 ports, and the
+    corresponding port range is:
+    - range_begin (inclusive): 20000 + port_range_slot_index * 1000
+    - range_end (exclusive): range_begin + 1000
+    This function first scans `/tmp/ray_on_spark_worker_port_allocation.txt`,
+    removing lines that contain a dead process pid, then finds the first unused
+    port_range_slot_index, regenerates the file, and returns the allocated port
+    range.
+
+    Returns: Allocated port range for current worker ports
+    """
+    import psutil
+    import fcntl
+
+    def acquire_lock(file_path):
+        mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
+        fd = os.open(file_path, mode)
+        try:
+            # The lock file must be readable / writable to all users.
+            os.chmod(file_path, 0o0777)
+            # Allow retrying to get the file lock for a bounded number of
+            # iterations (10 seconds apart).
+            max_lock_iter = 600
+            for _ in range(max_lock_iter):
+                try:
+                    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                except BlockingIOError:
+                    # Lock is held by another process; continue the loop to wait
+                    # for the lock to become available.
+                    pass
+                else:
+                    # Acquired lock successfully.
+                    return fd
+                time.sleep(10)
+            raise TimeoutError(f"Acquiring lock on file {file_path} timed out.")
+        except Exception:
+            os.close(fd)
+            raise
+
+    lock_file_path = "/tmp/ray_on_spark_worker_startup_barrier_lock.lock"
+    try:
+        lock_fd = acquire_lock(lock_file_path)
+    except TimeoutError:
+        # If a timeout happens, the file lock might be held by another process
+        # that did not release it in time for some unexpected reason.
+        # In this case, remove the existing lock file, create the file again, and
+        # then acquire a file lock on the new file.
+        try:
+            os.remove(lock_file_path)
+        except Exception:
+            pass
+        lock_fd = acquire_lock(lock_file_path)
+
+    def release_lock():
+        fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        os.close(lock_fd)
+
+    try:
+        port_alloc_file = "/tmp/ray_on_spark_worker_port_allocation.txt"
+
+        # NB: reading / writing `port_alloc_file` is protected by exclusive lock
+        # on file `lock_file_path`
+        if os.path.exists(port_alloc_file):
+            with open(port_alloc_file, mode="r") as fp:
+                port_alloc_data = fp.read()
+            port_alloc_table = [
+                line.split(" ") for line in port_alloc_data.strip().split("\n")
+            ]
+            port_alloc_table = [
+                (int(pid_str), int(slot_index_str))
+                for pid_str, slot_index_str in port_alloc_table
+            ]
+        else:
+            port_alloc_table = []
+            with open(port_alloc_file, mode="w"):
+                pass
+            # The port range allocation file must be readable / writable to all users.
+            os.chmod(port_alloc_file, 0o0777)
+
+        port_alloc_map = {
+            pid: slot_index
+            for pid, slot_index in port_alloc_table
+            if psutil.pid_exists(pid)  # remove slot used by dead process
+        }
+
+        allocated_slot_set = set(port_alloc_map.values())
+
+        if len(allocated_slot_set) == 0:
+            new_slot_index = 0
+        else:
+            new_slot_index = max(allocated_slot_set) + 1
+            for index in range(new_slot_index):
+                if index not in allocated_slot_set:
+                    new_slot_index = index
+                    break
+
+        port_alloc_map[os.getpid()] = new_slot_index
+
+        with open(port_alloc_file, mode="w") as fp:
+            for pid, slot_index in port_alloc_map.items():
+                fp.write(f"{pid} {slot_index}\n")
+
+        worker_port_range_begin = 20000 + new_slot_index * 1000
+        worker_port_range_end = worker_port_range_begin + 1000
+
+        if worker_port_range_end > 65536:
+            raise RuntimeError(
+                "Too many ray worker nodes are running on this machine, cannot "
+                "allocate worker port range for new ray worker node."
+            )
+    except Exception:
+        release_lock()
+        raise
+
+    def hold_lock():
+        time.sleep(_RAY_WORKER_NODE_STARTUP_INTERVAL)
+        release_lock()
+
+    threading.Thread(target=hold_lock, args=()).start()
+
+    return worker_port_range_begin, worker_port_range_end
+
+
+def _append_default_spilling_dir_config(head_node_options, object_spilling_dir):
+    if "system_config" not in head_node_options:
+        head_node_options["system_config"] = {}
+    sys_conf = head_node_options["system_config"]
+    if "object_spilling_config" not in sys_conf:
+        sys_conf["object_spilling_config"] = json.dumps(
+            {
+                "type": "filesystem",
+                "params": {
+                    "directory_path": object_spilling_dir,
+                },
+            }
+        )
+    return head_node_options
+
+
+def _append_resources_config(node_options, resources):
+    if "resources" not in node_options:
+        node_options["resources"] = {}
+
+    node_options["resources"].update(resources)
+    return node_options
+
+
+def _get_default_ray_tmp_dir():
+    return os.path.join(os.environ.get("RAY_TMPDIR", "/tmp"), "ray")
+
+
+def _create_hook_entry(is_global):
+    if RAY_ON_SPARK_START_HOOK in os.environ:
+        return _load_class(os.environ[RAY_ON_SPARK_START_HOOK])()
+    elif is_in_databricks_runtime():
+        return DefaultDatabricksRayOnSparkStartHook(is_global)
+    else:
+        return RayOnSparkStartHook(is_global)
+
+
+def _setup_ray_cluster(
+    *,
+    max_worker_nodes: int,
+    min_worker_nodes: int,
+    num_cpus_worker_node: int,
+    num_cpus_head_node: int,
+    num_gpus_worker_node: int,
+    num_gpus_head_node: int,
+    using_stage_scheduling: bool,
+    heap_memory_worker_node: int,
+    heap_memory_head_node: int,
+    object_store_memory_worker_node: int,
+    object_store_memory_head_node: int,
+    head_node_options: Dict,
+    worker_node_options: Dict,
+    ray_temp_root_dir: str,
+    collect_log_to_path: str,
+    autoscale_upscaling_speed: float,
+    autoscale_idle_timeout_minutes: float,
+    is_global: bool,
+) -> Type[RayClusterOnSpark]:
+    """
+    The public API `ray.util.spark.setup_ray_cluster` does some argument
+    validation and then passes the validated arguments to this interface,
+    which returns a `RayClusterOnSpark` instance.
+
+    The returned instance can be used to connect to, disconnect from and shut
+    down the ray cluster. This instance can also be used as a context manager
+    (used by encapsulating operations within `with _setup_ray_cluster(...):`).
+    Upon entering the managed scope, the ray cluster is initiated and connected
+    to. When exiting the scope, the ray cluster is disconnected and shut down.
+
+    Note: This function interface is stable and can be used for
+    instrumentation logging patching.
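For reference, the head_node_options / worker_node_options dicts flowing through here are rendered into `ray start`-style flags by _convert_ray_node_options defined earlier; a small illustration (option names and values are hypothetical):

    opts = {
        "object_store_memory": 1_000_000_000,
        "resources": {"custom_resource": 1},
        "disable_usage_stats": None,  # a valueless option becomes a bare flag
    }
    # _convert_ray_node_options(opts) yields:
    #   ["--object-store-memory=1000000000",
    #    '--resources={"custom_resource": 1}',
    #    "--disable-usage-stats"]
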
+ """ + import fcntl + + start_hook = _create_hook_entry(is_global) + spark = get_spark_session() + + ray_head_ip = socket.gethostbyname(get_spark_application_driver_host(spark)) + ray_head_port = get_random_unused_port(ray_head_ip, min_port=9000, max_port=10000) + port_exclude_list = [ray_head_port] + + # Make a copy for head_node_options to avoid changing original dict in user code. + head_node_options = head_node_options.copy() + include_dashboard = head_node_options.pop("include_dashboard", None) + ray_dashboard_port = head_node_options.pop("dashboard_port", None) + + if is_global: + ray_client_server_port = 10001 + else: + ray_client_server_port = get_random_unused_port( + ray_head_ip, + min_port=9000, + max_port=10000, + exclude_list=port_exclude_list, + ) + + port_exclude_list.append(ray_client_server_port) + + spark_job_server_port = get_random_unused_port( + ray_head_ip, + min_port=9000, + max_port=10000, + exclude_list=port_exclude_list, + ) + port_exclude_list.append(spark_job_server_port) + + if include_dashboard is None or include_dashboard is True: + if ray_dashboard_port is None: + ray_dashboard_port = get_random_unused_port( + ray_head_ip, + min_port=9000, + max_port=10000, + exclude_list=port_exclude_list, + ) + port_exclude_list.append(ray_dashboard_port) + ray_dashboard_agent_port = get_random_unused_port( + ray_head_ip, + min_port=9000, + max_port=10000, + exclude_list=port_exclude_list, + ) + port_exclude_list.append(ray_dashboard_agent_port) + + dashboard_options = [ + "--dashboard-host=0.0.0.0", + f"--dashboard-port={ray_dashboard_port}", + f"--dashboard-agent-listen-port={ray_dashboard_agent_port}", + ] + # If include_dashboard is None, we don't set `--include-dashboard` option, + # in this case Ray will decide whether dashboard can be started + # (e.g. checking any missing dependencies). + if include_dashboard is True: + dashboard_options += ["--include-dashboard=true"] + else: + dashboard_options = [ + "--include-dashboard=false", + ] + + _logger.info( + f"Ray head hostname: {ray_head_ip}, port: {ray_head_port}, " + f"ray client server port: {ray_client_server_port}." + ) + + cluster_unique_id = uuid.uuid4().hex[:8] + + if is_global: + # global mode enabled + # for global mode, Ray always uses default temp dir + # so that local Ray client can discover it without specifying + # head node address. + if ray_temp_root_dir is not None: + raise ValueError( + "Ray on spark global mode cluster does not allow you to set " + "'ray_temp_root_dir' argument." + ) + + # We only allow user to launch one active Ray on spark global cluster + # at a time. So acquiring a global file lock before setting up a new + # Ray on spark global cluster. + global_cluster_lock_fd = os.open( + "/tmp/ray_on_spark_global_cluster.lock", os.O_RDWR | os.O_CREAT | os.O_TRUNC + ) + + try: + # acquiring exclusive lock to ensure copy logs and removing dir safely. + fcntl.flock(global_cluster_lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + # acquiring global lock failed. + raise ValueError( + "Acquiring global lock failed for setting up new global mode Ray on " + "spark cluster. If there is an active global mode Ray on spark " + "cluster, please shut down it before you create a new one." 
+ ) + + ray_temp_dir = None + ray_default_tmp_dir = _get_default_ray_tmp_dir() + os.makedirs(ray_default_tmp_dir, exist_ok=True) + object_spilling_dir = os.path.join(ray_default_tmp_dir, "spill") + else: + global_cluster_lock_fd = None + if ray_temp_root_dir is None: + ray_temp_root_dir = start_hook.get_default_temp_root_dir() + ray_temp_dir = os.path.join( + ray_temp_root_dir, f"ray-{ray_head_port}-{cluster_unique_id}" + ) + os.makedirs(ray_temp_dir, exist_ok=True) + object_spilling_dir = os.path.join(ray_temp_dir, "spill") + + os.makedirs(object_spilling_dir, exist_ok=True) + + head_node_options = _append_default_spilling_dir_config( + head_node_options, object_spilling_dir + ) + + from ray.autoscaler._private.spark.spark_job_server import ( + _start_spark_job_server, + ) + + ray_node_custom_env = start_hook.custom_environment_variables() + spark_job_server = _start_spark_job_server( + ray_head_ip, spark_job_server_port, spark, ray_node_custom_env + ) + autoscaling_cluster = AutoscalingCluster( + head_resources={ + "CPU": num_cpus_head_node, + "GPU": num_gpus_head_node, + "memory": heap_memory_head_node, + "object_store_memory": object_store_memory_head_node, + }, + worker_node_types={ + "ray.worker": { + "resources": { + "CPU": num_cpus_worker_node, + "GPU": num_gpus_worker_node, + "memory": heap_memory_worker_node, + "object_store_memory": object_store_memory_worker_node, + }, + "node_config": {}, + "min_workers": min_worker_nodes, + "max_workers": max_worker_nodes, + }, + }, + extra_provider_config={ + "ray_head_ip": ray_head_ip, + "ray_head_port": ray_head_port, + "cluster_unique_id": cluster_unique_id, + "using_stage_scheduling": using_stage_scheduling, + "ray_temp_dir": ray_temp_dir, + "worker_node_options": worker_node_options, + "collect_log_to_path": collect_log_to_path, + "spark_job_server_port": spark_job_server_port, + }, + upscaling_speed=autoscale_upscaling_speed, + idle_timeout_minutes=autoscale_idle_timeout_minutes, + ) + ray_head_proc, tail_output_deque = autoscaling_cluster.start( + ray_head_ip, + ray_head_port, + ray_client_server_port, + ray_temp_dir, + dashboard_options, + head_node_options, + collect_log_to_path, + ray_node_custom_env, + ) + ray_head_node_cmd = autoscaling_cluster.ray_head_node_cmd + + # wait ray head node spin up. + time.sleep(_RAY_HEAD_STARTUP_TIMEOUT) + + if not is_port_in_use(ray_head_ip, ray_head_port): + if ray_head_proc.poll() is None: + # Ray head GCS service is down. Kill ray head node. + ray_head_proc.terminate() + # wait killing complete. + time.sleep(0.5) + + cmd_exec_failure_msg = gen_cmd_exec_failure_msg( + ray_head_node_cmd, ray_head_proc.returncode, tail_output_deque + ) + raise RuntimeError("Start Ray head node failed!\n" + cmd_exec_failure_msg) + + _logger.info("Ray head node started.") + + cluster_address = f"{ray_head_ip}:{ray_head_port}" + # Set RAY_ADDRESS environment variable to the cluster address. 
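Setting RAY_ADDRESS on the next line is what lets a later bare ray.init() in the same process find this cluster; a tiny sketch of the effect:

    import os
    import ray

    # After _setup_ray_cluster runs, RAY_ADDRESS holds "<head_ip>:<head_port>".
    assert "RAY_ADDRESS" in os.environ
    ray.init()  # no address argument; the cluster is resolved via RAY_ADDRESS
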
+ os.environ["RAY_ADDRESS"] = cluster_address + + ray_cluster_handler = RayClusterOnSpark( + address=cluster_address, + head_proc=ray_head_proc, + min_worker_nodes=min_worker_nodes, + max_worker_nodes=max_worker_nodes, + temp_dir=ray_temp_dir, + cluster_unique_id=cluster_unique_id, + start_hook=start_hook, + ray_dashboard_port=ray_dashboard_port, + spark_job_server=spark_job_server, + global_cluster_lock_fd=global_cluster_lock_fd, + ray_client_server_port=ray_client_server_port, + ) + + start_hook.on_cluster_created(ray_cluster_handler) + + return ray_cluster_handler + + +_active_ray_cluster = None +_active_ray_cluster_rwlock = threading.RLock() + + +def _create_resource_profile(num_cpus_per_node, num_gpus_per_node): + from pyspark.resource.profile import ResourceProfileBuilder + from pyspark.resource.requests import TaskResourceRequests + + task_res_req = TaskResourceRequests().cpus(num_cpus_per_node) + if num_gpus_per_node > 0: + task_res_req = task_res_req.resource("gpu", num_gpus_per_node) + return ResourceProfileBuilder().require(task_res_req).build + + +# A dict storing blocked key to replacement argument you should use. +_head_node_option_block_keys = { + "temp_dir": "ray_temp_root_dir", + "block": None, + "head": None, + "node_ip_address": None, + "port": None, + "num_cpus": None, + "num_gpus": None, + "dashboard_host": None, + "dashboard_agent_listen_port": None, +} + +_worker_node_option_block_keys = { + "temp_dir": "ray_temp_root_dir", + "block": None, + "head": None, + "address": None, + "num_cpus": "num_cpus_worker_node", + "num_gpus": "num_gpus_worker_node", + "memory": None, + "object_store_memory": "object_store_memory_worker_node", + "dashboard_agent_listen_port": None, + "min_worker_port": None, + "max_worker_port": None, +} + + +def _verify_node_options(node_options, block_keys, node_type): + for key in node_options: + if key.startswith("--") or "-" in key: + raise ValueError( + "For a ray node option like '--foo-bar', you should convert it to " + "following format 'foo_bar' in 'head_node_options' / " + "'worker_node_options' arguments." + ) + + if key in block_keys: + common_err_msg = ( + f"Setting the option '{key}' for {node_type} nodes is not allowed." + ) + replacement_arg = block_keys[key] + if replacement_arg: + raise ValueError( + f"{common_err_msg} You should set the '{replacement_arg}' option " + "instead." + ) + else: + raise ValueError( + f"{common_err_msg} This option is controlled by Ray on Spark." 
+                )
+
+
+def _setup_ray_cluster_internal(
+    max_worker_nodes: int,
+    min_worker_nodes: Optional[int],
+    num_cpus_worker_node: Optional[int],
+    num_cpus_head_node: Optional[int],
+    num_gpus_worker_node: Optional[int],
+    num_gpus_head_node: Optional[int],
+    heap_memory_worker_node: Optional[int],
+    heap_memory_head_node: Optional[int],
+    object_store_memory_worker_node: Optional[int],
+    object_store_memory_head_node: Optional[int],
+    head_node_options: Optional[Dict],
+    worker_node_options: Optional[Dict],
+    ray_temp_root_dir: Optional[str],
+    strict_mode: bool,
+    collect_log_to_path: Optional[str],
+    autoscale_upscaling_speed: Optional[float],
+    autoscale_idle_timeout_minutes: Optional[float],
+    is_global: bool,
+    **kwargs,
+) -> Tuple[str, str]:
+    global _active_ray_cluster
+
+    _check_system_environment()
+    _install_sigterm_signal()
+
+    head_node_options = head_node_options or {}
+    worker_node_options = worker_node_options or {}
+
+    _verify_node_options(
+        head_node_options,
+        _head_node_option_block_keys,
+        "Ray head node on spark",
+    )
+    _verify_node_options(
+        worker_node_options,
+        _worker_node_option_block_keys,
+        "Ray worker node on spark",
+    )
+
+    if _active_ray_cluster is not None:
+        raise RuntimeError(
+            "The currently active ray cluster on spark hasn't been shut down. "
+            "Please call `ray.util.spark.shutdown_ray_cluster()` before initiating "
+            "a new Ray cluster on spark."
+        )
+
+    if ray.is_initialized():
+        raise RuntimeError(
+            "The current python process has already initialized Ray. Please shut "
+            "it down by calling `ray.shutdown()` before initiating a Ray cluster "
+            "on spark."
+        )
+
+    spark = get_spark_session()
+
+    spark_master = spark.sparkContext.master
+
+    is_spark_local_mode = spark_master == "local" or spark_master.startswith("local[")
+
+    if not (
+        spark_master.startswith("spark://")
+        or spark_master.startswith("local-cluster[")
+        or is_spark_local_mode
+    ):
+        raise RuntimeError(
+            "Ray on Spark only supports spark cluster in standalone mode, "
+            "local-cluster mode or spark local mode."
+        )
+
+    if is_spark_local_mode:
+        support_stage_scheduling = False
+    elif (
+        is_in_databricks_runtime()
+        and Version(os.environ["DATABRICKS_RUNTIME_VERSION"]).major >= 12
+    ):
+        support_stage_scheduling = True
+    else:
+        import pyspark
+
+        if Version(pyspark.__version__).release >= (3, 4, 0):
+            support_stage_scheduling = True
+        else:
+            support_stage_scheduling = False
+
+    if "num_cpus_per_node" in kwargs:
+        if num_cpus_worker_node is not None:
+            raise ValueError(
+                "'num_cpus_per_node' and 'num_cpus_worker_node' arguments are "
+                "equivalent. Only set 'num_cpus_worker_node'."
+            )
+        num_cpus_worker_node = kwargs["num_cpus_per_node"]
+        warnings.warn(
+            "'num_cpus_per_node' argument is deprecated, please use "
+            "'num_cpus_worker_node' argument instead.",
+            DeprecationWarning,
+        )
+
+    if "num_gpus_per_node" in kwargs:
+        if num_gpus_worker_node is not None:
+            raise ValueError(
+                "'num_gpus_per_node' and 'num_gpus_worker_node' arguments are "
+                "equivalent. Only set 'num_gpus_worker_node'."
+            )
+        num_gpus_worker_node = kwargs["num_gpus_per_node"]
+        warnings.warn(
+            "'num_gpus_per_node' argument is deprecated, please use "
+            "'num_gpus_worker_node' argument instead.",
+            DeprecationWarning,
+        )
+
+    if "object_store_memory_per_node" in kwargs:
+        if object_store_memory_worker_node is not None:
+            raise ValueError(
+                "'object_store_memory_per_node' and 'object_store_memory_worker_node' "
+                "arguments are equivalent. Only set "
+                "'object_store_memory_worker_node'."
+            )
+        object_store_memory_worker_node = kwargs["object_store_memory_per_node"]
+        warnings.warn(
+            "'object_store_memory_per_node' argument is deprecated, please use "
+            "'object_store_memory_worker_node' argument instead.",
+            DeprecationWarning,
+        )
+
+    # Environment configurations within the Spark Session that dictate how many cpus
+    # and gpus to use for each submitted spark task.
+    num_spark_task_cpus = int(spark.sparkContext.getConf().get("spark.task.cpus", "1"))
+
+    if num_cpus_worker_node is not None and num_cpus_worker_node <= 0:
+        raise ValueError("Argument `num_cpus_worker_node` value must be > 0.")
+
+    # note: spark.task.resource.gpu.amount config might be fractional value like 0.5
+    default_num_spark_task_gpus = float(
+        spark.sparkContext.getConf().get("spark.task.resource.gpu.amount", "0")
+    )
+    rounded_num_spark_task_gpus = int(default_num_spark_task_gpus)
+    if default_num_spark_task_gpus > 0:
+        warn_msg = (
+            "You configured 'spark.task.resource.gpu.amount' to "
+            f"{default_num_spark_task_gpus}. "
+            "We recommend setting this value to 0 so that Spark jobs do not "
+            "reserve GPU resources, preventing Ray-on-Spark workloads from having the "
+            "maximum number of GPUs available."
+        )
+
+        if is_in_databricks_runtime():
+            from ray.util.spark.databricks_hook import (
+                get_databricks_display_html_function,
+            )
+
+            get_databricks_display_html_function()(
+                f"{warn_msg}"
+            )
+        else:
+            _logger.warning(warn_msg)
+
+    if num_gpus_worker_node is not None and num_gpus_worker_node < 0:
+        raise ValueError("Argument `num_gpus_worker_node` value must be >= 0.")
+
+    def _get_spark_worker_resources(_):
+        from ray.util.spark.utils import (
+            _get_cpu_cores,
+            _get_num_physical_gpus,
+            _get_spark_worker_total_physical_memory,
+        )
+
+        num_cpus_spark_worker = _get_cpu_cores()
+        num_gpus_spark_worker = _get_num_physical_gpus()
+        total_mem_bytes = _get_spark_worker_total_physical_memory()
+
+        return (
+            num_cpus_spark_worker,
+            num_gpus_spark_worker,
+            total_mem_bytes,
+        )
+
+    (num_cpus_spark_worker, num_gpus_spark_worker, spark_worker_mem_bytes,) = (
+        spark.sparkContext.parallelize([1], 1)
+        .map(_get_spark_worker_resources)
+        .collect()[0]
+    )
+
+    if num_cpus_worker_node is not None and num_gpus_worker_node is not None:
+        if support_stage_scheduling:
+            using_stage_scheduling = True
+            res_profile = _create_resource_profile(
+                num_cpus_worker_node, num_gpus_worker_node
+            )
+        else:
+            raise ValueError(
+                "The current spark version does not support stage scheduling, so "
+                "you cannot set the `num_cpus_worker_node` and "
+                "`num_gpus_worker_node` arguments. Without setting the 2 arguments, "
+                "each Ray worker node will be assigned the number of "
+                f"'spark.task.cpus' (equals to {num_spark_task_cpus}) cpu cores "
+                "and the rounded-down number of 'spark.task.resource.gpu.amount' "
+                f"(equals to {rounded_num_spark_task_gpus}) GPUs. To enable spark "
+                f"stage scheduling, you need to upgrade spark to version 3.4 or use "
+                "Databricks Runtime 12.x, and you cannot use spark local mode."
+            )
+    elif num_cpus_worker_node is None and num_gpus_worker_node is None:
+        if support_stage_scheduling:
+            # Make one Ray worker node use the maximum CPU / GPU resources
+            # of the whole spark worker node; this is the optimal
+            # configuration.
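An aside on _create_resource_profile used in this branch: the trailing `.build` is a pyspark property rather than a method call, so the apparent missing parentheses are intentional; a minimal equivalent sketch (values illustrative):

    from pyspark.resource.profile import ResourceProfileBuilder
    from pyspark.resource.requests import TaskResourceRequests

    reqs = TaskResourceRequests().cpus(8).resource("gpu", 1)
    profile = ResourceProfileBuilder().require(reqs).build  # `.build` is a property
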
+            num_cpus_worker_node = num_cpus_spark_worker
+            num_gpus_worker_node = num_gpus_spark_worker
+            using_stage_scheduling = True
+            res_profile = _create_resource_profile(
+                num_cpus_worker_node, num_gpus_worker_node
+            )
+        else:
+            using_stage_scheduling = False
+            res_profile = None
+
+            num_cpus_worker_node = num_spark_task_cpus
+            num_gpus_worker_node = rounded_num_spark_task_gpus
+    else:
+        raise ValueError(
+            "'num_cpus_worker_node' and 'num_gpus_worker_node' arguments must be "
+            "set together or unset together."
+        )
+
+    (
+        ray_worker_node_heap_mem_bytes,
+        ray_worker_node_object_store_mem_bytes,
+    ) = get_avail_mem_per_ray_worker_node(
+        spark,
+        heap_memory_worker_node,
+        object_store_memory_worker_node,
+        num_cpus_worker_node,
+        num_gpus_worker_node,
+    )
+
+    spark_worker_ray_node_slots = _get_local_ray_node_slots(
+        num_cpus_spark_worker,
+        num_gpus_spark_worker,
+        num_cpus_worker_node,
+        num_gpus_worker_node,
+    )
+
+    spark_executor_memory_bytes = get_configured_spark_executor_memory_bytes(spark)
+    spark_worker_required_memory_bytes = (
+        spark_executor_memory_bytes
+        + spark_worker_ray_node_slots
+        * (ray_worker_node_heap_mem_bytes + ray_worker_node_object_store_mem_bytes)
+    )
+    if spark_worker_required_memory_bytes > 0.8 * spark_worker_mem_bytes:
+        warn_msg = (
+            "In each spark worker node, we recommend keeping the sum of "
+            "'spark_executor_memory + num_Ray_worker_nodes_per_spark_worker * "
+            "(memory_worker_node + object_store_memory_worker_node)' less than "
+            "'spark_worker_physical_memory * 0.8', otherwise it might lead to "
+            "spark worker physical memory exhaustion and Ray task OOM errors."
+        )
+
+        if is_in_databricks_runtime():
+            from ray.util.spark.databricks_hook import (
+                get_databricks_display_html_function,
+            )
+
+            get_databricks_display_html_function()(
+                f"{warn_msg}"
" + ) + else: + _logger.warning(warn_msg) + + if "num_worker_nodes" in kwargs: + raise ValueError( + "'num_worker_nodes' argument is removed, please set " + "'max_worker_nodes' and 'min_worker_nodes' argument instead." + ) + + if max_worker_nodes == MAX_NUM_WORKER_NODES: + if min_worker_nodes is not None: + raise ValueError( + "If you set 'max_worker_nodes' to 'MAX_NUM_WORKER_NODES', autoscaling " + "is not supported, so that you cannot set 'min_worker_nodes' argument " + "and 'min_worker_nodes' is automatically set to be equal to " + "'max_worker_nodes'." + ) + + # max_worker_nodes=MAX_NUM_WORKER_NODES represents using all available + # spark task slots + max_worker_nodes = get_max_num_concurrent_tasks(spark.sparkContext, res_profile) + min_worker_nodes = max_worker_nodes + elif max_worker_nodes <= 0: + raise ValueError( + "The value of 'max_worker_nodes' argument must be either a positive " + "integer or 'ray.util.spark.MAX_NUM_WORKER_NODES'." + ) + + if "autoscale" in kwargs: + raise ValueError( + "'autoscale' argument is removed. You can set 'min_worker_nodes' argument " + "to be less than 'max_worker_nodes' to make autoscaling enabled." + ) + + if min_worker_nodes is None: + min_worker_nodes = max_worker_nodes + elif not (0 <= min_worker_nodes <= max_worker_nodes): + raise ValueError( + "The value of 'max_worker_nodes' argument must be an integer >= 0 " + "and <= 'max_worker_nodes'" + ) + + insufficient_resources = [] + + if num_cpus_worker_node < 4: + insufficient_resources.append( + "The provided CPU resources for each ray worker are inadequate to start " + "a ray cluster. Based on the total cpu resources available and the " + "configured task sizing, each ray worker node would start with " + f"{num_cpus_worker_node} CPU cores. This is less than the recommended " + "value of `4` CPUs per worker. On spark version >= 3.4 or Databricks " + "Runtime 12.x, you can set the argument `num_cpus_worker_node` to " + "a value >= 4 to address it, otherwise you need to increase the spark " + "application configuration 'spark.task.cpus' to a minimum of `4` to " + "address it." + ) + + if ray_worker_node_heap_mem_bytes < 10 * 1024 * 1024 * 1024: + insufficient_resources.append( + "The provided memory resources for each ray worker node are inadequate. " + "Based on the total memory available on the spark cluster and the " + "configured task sizing, each ray worker would start with " + f"{ray_worker_node_heap_mem_bytes} bytes heap memory. This is less than " + "the recommended value of 10GB. The ray worker node heap memory size is " + "calculated by " + "(SPARK_WORKER_PHYSICAL_MEMORY / num_local_spark_task_slots * 0.8) - " + "object_store_memory_worker_node. To increase the heap space available, " + "increase the memory in the spark cluster by using instance types with " + "larger memory, or increase number of CPU/GPU per Ray worker node " + "(so it leads to less Ray worker node slots per spark worker node), " + "or apply a lower `object_store_memory_worker_node`." 
+ ) + if insufficient_resources: + if strict_mode: + raise ValueError( + "You are creating a ray cluster on spark with strict mode (it can be " + "disabled by setting argument 'strict_mode=False' when calling API " + "'setup_ray_cluster'), strict mode requires the spark cluster config " + "to satisfy the following criteria:\n" + + "\n".join(insufficient_resources) + ) + else: + _logger.warning("\n".join(insufficient_resources)) + + if num_cpus_head_node is None: + if is_global: + num_cpus_head_node = _get_cpu_cores() + else: + num_cpus_head_node = 0 + else: + if num_cpus_head_node < 0: + raise ValueError( + "Argument `num_cpus_head_node` value must be >= 0. " + f"Current value is {num_cpus_head_node}." + ) + + if num_gpus_head_node is None: + if is_global: + try: + num_gpus_head_node = _get_num_physical_gpus() + except Exception: + num_gpus_head_node = 0 + else: + num_gpus_head_node = 0 + else: + if num_gpus_head_node < 0: + raise ValueError( + "Argument `num_gpus_head_node` value must be >= 0. " + f"Current value is {num_gpus_head_node}." + ) + + if ( + num_cpus_head_node == 0 + and num_gpus_head_node == 0 + and object_store_memory_head_node is None + ): + # Because tasks that require CPU or GPU resources are not scheduled to the + # Ray head node, and the user does not set `object_store_memory_head_node` + # explicitly, limit the heap memory and object store memory allocated to the + # head node, in order to save spark driver memory. + heap_memory_head_node = 1024 * 1024 * 1024 + object_store_memory_head_node = 1024 * 1024 * 1024 + else: + heap_memory_head_node, object_store_memory_head_node = calc_mem_ray_head_node( + heap_memory_head_node, object_store_memory_head_node + ) + + with _active_ray_cluster_rwlock: + cluster = _setup_ray_cluster( + max_worker_nodes=max_worker_nodes, + min_worker_nodes=min_worker_nodes, + num_cpus_worker_node=num_cpus_worker_node, + num_cpus_head_node=num_cpus_head_node, + num_gpus_worker_node=num_gpus_worker_node, + num_gpus_head_node=num_gpus_head_node, + using_stage_scheduling=using_stage_scheduling, + heap_memory_worker_node=ray_worker_node_heap_mem_bytes, + heap_memory_head_node=heap_memory_head_node, + object_store_memory_worker_node=ray_worker_node_object_store_mem_bytes, + object_store_memory_head_node=object_store_memory_head_node, + head_node_options=head_node_options, + worker_node_options=worker_node_options, + ray_temp_root_dir=ray_temp_root_dir, + collect_log_to_path=collect_log_to_path, + autoscale_upscaling_speed=autoscale_upscaling_speed, + autoscale_idle_timeout_minutes=autoscale_idle_timeout_minutes, + is_global=is_global, + ) + # Set the global _active_ray_cluster to the + # started cluster. + _active_ray_cluster = cluster + + try: + cluster.wait_until_ready() # NB: this line might raise an error.
+ except Exception as e: + try: + shutdown_ray_cluster() + except Exception: + pass + raise RuntimeError("Launch Ray-on-Spark cluster failed") from e + + head_ip = cluster.address.split(":")[0] + remote_connection_address = f"ray://{head_ip}:{cluster.ray_client_server_port}" + return cluster.address, remote_connection_address + + +@PublicAPI +def setup_ray_cluster( + *, + max_worker_nodes: int, + min_worker_nodes: Optional[int] = None, + num_cpus_worker_node: Optional[int] = None, + num_cpus_head_node: Optional[int] = None, + num_gpus_worker_node: Optional[int] = None, + num_gpus_head_node: Optional[int] = None, + memory_worker_node: Optional[int] = None, + memory_head_node: Optional[int] = None, + object_store_memory_worker_node: Optional[int] = None, + object_store_memory_head_node: Optional[int] = None, + head_node_options: Optional[Dict] = None, + worker_node_options: Optional[Dict] = None, + ray_temp_root_dir: Optional[str] = None, + strict_mode: bool = False, + collect_log_to_path: Optional[str] = None, + autoscale_upscaling_speed: Optional[float] = 1.0, + autoscale_idle_timeout_minutes: Optional[float] = 1.0, + **kwargs, +) -> Tuple[str, str]: + """ + Set up a ray cluster on the spark cluster by starting a ray head node in the + spark application's driver side node. + After creating the head node, a background spark job is created that + generates an instance of `RayClusterOnSpark` that contains configuration for the + ray cluster that will run on the Spark cluster's worker nodes. + After a ray cluster is set up, the "RAY_ADDRESS" environment variable is set to + the cluster address, so you can call `ray.init()` without specifying the ray + cluster address to connect to the cluster. To shut down the cluster you can call + `ray.util.spark.shutdown_ray_cluster()`. + Note: If the active ray cluster hasn't been shut down, you cannot create a new + ray cluster. + + Args: + max_worker_nodes: The maximum number of ray worker nodes to start + for the ray cluster. You can + specify `max_worker_nodes` as `ray.util.spark.MAX_NUM_WORKER_NODES`, + which represents a ray cluster + configuration that will use all available resources configured for the + spark application. + To create a spark application that is intended to exclusively run a + shared ray cluster of fixed size, it is recommended to set this argument + to `ray.util.spark.MAX_NUM_WORKER_NODES`. + min_worker_nodes: Minimum number of worker nodes (default `None`). + If the "min_worker_nodes" value equals the "max_worker_nodes" value, + or the "min_worker_nodes" value is None, then autoscaling is disabled + and the Ray cluster is launched with a fixed number ("max_worker_nodes") + of Ray worker nodes; otherwise autoscaling is enabled. + num_cpus_worker_node: Number of cpus available to each ray worker node. If not + provided and spark stage scheduling is supported, the 'num_cpus_worker_node' + value equals the number of cpu cores per spark worker node, otherwise + it uses the spark application configuration 'spark.task.cpus' instead. + **Limitation** Only spark version >= 3.4 or Databricks Runtime 12.x + supports setting this argument. + num_cpus_head_node: Number of cpus available to the Ray head node. If not + provided, for a global mode Ray cluster it uses the number of cpu cores of + the spark driver node, otherwise it uses 0 instead. + A value of 0 means tasks requiring CPU resources are not + scheduled to the Ray head node.
+ num_gpus_worker_node: Number of gpus available to each ray worker node. If not + provided and spark stage scheduling is supported, the 'num_gpus_worker_node' + value equals the number of GPUs per spark worker node, otherwise + it uses the rounded-down value of the spark application configuration + 'spark.task.resource.gpu.amount' instead. + This argument is only available on spark clusters that are configured with + 'gpu' resources. + **Limitation** Only spark version >= 3.4 or Databricks Runtime 12.x + supports setting this argument. + num_gpus_head_node: Number of gpus available to the Ray head node. If not + provided, for a global mode Ray cluster it uses the number of GPUs of the + spark driver node, otherwise it uses 0 instead. + This argument is only available on spark clusters whose spark driver node + has GPUs. + memory_worker_node: Optional[int]: + Heap memory configured for each Ray worker node. This is basically setting + the `--memory` option of the `ray start` command used to start the Ray node. + memory_head_node: Optional[int]: + Heap memory configured for the Ray head node. This is basically setting + the `--memory` option of the `ray start` command used to start the Ray node. + object_store_memory_worker_node: Object store memory available to each ray worker + node, capped by + "dev_shm_available_size * 0.8 / num_tasks_per_spark_worker". + The default value equals + "0.3 * spark_worker_physical_memory * 0.8 / num_tasks_per_spark_worker". + object_store_memory_head_node: Object store memory available to the Ray head + node, capped by "dev_shm_available_size * 0.8". + The default value equals + "0.3 * spark_driver_physical_memory * 0.8". + head_node_options: A dict representing Ray head node extra options, these + options will be passed to the `ray start` script. Note you need to convert + `ray start` option keys from `--foo-bar` format to `foo_bar` format. + For flag options (e.g. '--disable-usage-stats'), you should set the value + to None in the option dict, like `{"disable_usage_stats": None}`. + Note: Short name options (e.g. '-v') are not supported. + worker_node_options: A dict representing Ray worker node extra options, + these options will be passed to the `ray start` script. Note you need to + convert `ray start` option keys from `--foo-bar` format to `foo_bar` + format. + For flag options (e.g. '--disable-usage-stats'), you should set the value + to None in the option dict, like `{"disable_usage_stats": None}`. + Note: Short name options (e.g. '-v') are not supported. + ray_temp_root_dir: A local disk path to store the ray temporary data. The + created cluster will create a subdirectory + "ray-{head_port}-{random_suffix}" beneath this path. + strict_mode: Boolean flag to fast-fail initialization of the ray cluster if + the available spark cluster does not have sufficient resources to fulfill + the resource allocation for memory, cpu and gpu. When set to true, if the + requested resources are not available for the recommended minimum + functionality, an exception will be raised that details the inadequate + spark cluster configuration settings. If overridden as `False`, + a warning is raised. + collect_log_to_path: If specified, after ray head / worker nodes terminate, + collect their logs to the specified path. On Databricks Runtime, we + recommend specifying a local path that starts with '/dbfs/', because the + path mounts a centralized storage device, so the stored data persists + after the Databricks spark cluster terminates.
+ autoscale_upscaling_speed: If autoscaling is enabled, it represents the number of + nodes allowed to be pending as a multiple of the current number of nodes. + The higher the value, the more aggressive upscaling will be. For example, + if this is set to 1.0, the cluster can grow in size by at most 100% at any + time, so if the cluster currently has 20 nodes, at most 20 pending launches + are allowed. The minimum number of pending launches is 5 regardless of + this setting. + Default value is 1.0, minimum value is 1.0. + autoscale_idle_timeout_minutes: If autoscaling is enabled, it represents the number + of minutes that need to pass before an idle worker node is removed by the + autoscaler. The smaller the value, the more aggressive downscaling will be. + Worker nodes are considered idle when they hold no active tasks, actors, + or referenced objects (either in-memory or spilled to disk). This parameter + does not affect the head node. + Default value is 1.0, minimum value is 0. + Returns: + returns a tuple of (address, remote_connection_address) + "address" is in the format of "<ray_head_node_ip>:<port>", + "remote_connection_address" is in the format of + "ray://<ray_head_node_ip>:<ray_client_server_port>", + if your client runs on a machine that also hosts a Ray cluster node locally, + you can connect to the Ray cluster via ``ray.init(address)``, + otherwise you can connect to the Ray cluster via + ``ray.init(remote_connection_address)``. + """ + + return _setup_ray_cluster_internal( + max_worker_nodes=max_worker_nodes, + min_worker_nodes=min_worker_nodes, + num_cpus_worker_node=num_cpus_worker_node, + num_cpus_head_node=num_cpus_head_node, + num_gpus_worker_node=num_gpus_worker_node, + num_gpus_head_node=num_gpus_head_node, + heap_memory_worker_node=memory_worker_node, + heap_memory_head_node=memory_head_node, + object_store_memory_worker_node=object_store_memory_worker_node, + object_store_memory_head_node=object_store_memory_head_node, + head_node_options=head_node_options, + worker_node_options=worker_node_options, + ray_temp_root_dir=ray_temp_root_dir, + strict_mode=strict_mode, + collect_log_to_path=collect_log_to_path, + autoscale_upscaling_speed=autoscale_upscaling_speed, + autoscale_idle_timeout_minutes=autoscale_idle_timeout_minutes, + is_global=False, + **kwargs, + ) + + +@PublicAPI +def setup_global_ray_cluster( + *, + max_worker_nodes: int, + is_blocking: bool = True, + min_worker_nodes: Optional[int] = None, + num_cpus_worker_node: Optional[int] = None, + num_cpus_head_node: Optional[int] = None, + num_gpus_worker_node: Optional[int] = None, + num_gpus_head_node: Optional[int] = None, + memory_worker_node: Optional[int] = None, + memory_head_node: Optional[int] = None, + object_store_memory_worker_node: Optional[int] = None, + object_store_memory_head_node: Optional[int] = None, + head_node_options: Optional[Dict] = None, + worker_node_options: Optional[Dict] = None, + strict_mode: bool = False, + collect_log_to_path: Optional[str] = None, + autoscale_upscaling_speed: Optional[float] = 1.0, + autoscale_idle_timeout_minutes: Optional[float] = 1.0, +): + """ + Set up a global mode cluster. + A global Ray on spark cluster means: + - You can only create one active global Ray on spark cluster at a time. + On a databricks cluster, the global Ray cluster can be used by all users; + in contrast, a non-global Ray cluster can only be used by the current + notebook user. + - It stays up persistently without automatic shutdown.
+ - On a databricks notebook, you can connect to the global cluster by calling + ``ray.init()`` without specifying its address; it will discover the + global cluster automatically if it is up. + + All arguments are the same as for the ``setup_ray_cluster`` API, except that: + - The ``ray_temp_root_dir`` argument is not supported. + Global mode Ray clusters always use the default Ray temporary directory + path. + - A new argument "is_blocking" (default ``True``) is added. + If "is_blocking" is True, + the call keeps blocking until it is interrupted. + Once the call is interrupted, the global Ray on spark cluster is shut down and + the `setup_global_ray_cluster` call terminates. + If "is_blocking" is False, + the call returns immediately once the Ray cluster setup completes. + """ + + cluster_address = _setup_ray_cluster_internal( + max_worker_nodes=max_worker_nodes, + min_worker_nodes=min_worker_nodes, + num_cpus_worker_node=num_cpus_worker_node, + num_cpus_head_node=num_cpus_head_node, + num_gpus_worker_node=num_gpus_worker_node, + num_gpus_head_node=num_gpus_head_node, + heap_memory_worker_node=memory_worker_node, + heap_memory_head_node=memory_head_node, + object_store_memory_worker_node=object_store_memory_worker_node, + object_store_memory_head_node=object_store_memory_head_node, + head_node_options=head_node_options, + worker_node_options=worker_node_options, + ray_temp_root_dir=None, + strict_mode=strict_mode, + collect_log_to_path=collect_log_to_path, + autoscale_upscaling_speed=autoscale_upscaling_speed, + autoscale_idle_timeout_minutes=autoscale_idle_timeout_minutes, + is_global=True, + ) + + if not is_blocking: + return cluster_address + + global _global_ray_cluster_cancel_event + try: + _global_ray_cluster_cancel_event = Event() + # serve forever until the user cancels the command. + _global_ray_cluster_cancel_event.wait() + finally: + _global_ray_cluster_cancel_event = None + # Once the program is interrupted, + # or the corresponding databricks notebook command is interrupted, + # shut down the Ray cluster. + shutdown_ray_cluster() + + +def _start_ray_worker_nodes( + *, + spark_job_server, + spark_job_group_id, + spark_job_group_desc, + num_worker_nodes, + using_stage_scheduling, + ray_head_ip, + ray_head_port, + ray_temp_dir, + num_cpus_per_node, + num_gpus_per_node, + heap_memory_per_node, + object_store_memory_per_node, + worker_node_options, + collect_log_to_path, + node_id, +): + # NB: + # In order to start ray worker nodes on spark cluster worker machines, + # we launch a background spark job: + # 1. Each spark task launches one ray worker node. This design ensures all ray + # worker nodes have the same shape (same cpus / gpus / memory configuration). + # If ray worker nodes have a non-uniform shape, the Ray cluster setup will + # be non-deterministic and could create issues with node sizing. + # 2. A ray worker node is started via the `ray start` CLI. In each spark task, + # a child process is started and will execute a `ray start ...` command in + # blocking mode. + # 3. Each task will acquire a file lock for 10s to ensure that the ray worker + # init will acquire a port connection to the ray head node that does not + # contend with other worker processes on the same Spark worker node. + # 4.
When the ray cluster is shut down, killing ray worker nodes is implemented by + # `sparkContext.cancelJobGroup` to cancel the background spark job, sending a + # SIGKILL signal to all spark tasks. Once the spark tasks are killed, the + # `start_ray_node` process detects the parent-died event and then kills the + # ray worker node. + spark = spark_job_server.spark + spark_job_server_port = spark_job_server.server_address[1] + ray_node_custom_env = spark_job_server.ray_node_custom_env + + def ray_cluster_job_mapper(_): + from pyspark.taskcontext import TaskContext + + _worker_logger = logging.getLogger("ray.util.spark.worker") + + context = TaskContext.get() + + ( + worker_port_range_begin, + worker_port_range_end, + ) = _preallocate_ray_worker_port_range() + + # 10001 is used as the ray client server port of the global mode ray cluster. + ray_worker_node_dashboard_agent_port = get_random_unused_port( + ray_head_ip, min_port=10002, max_port=20000 + ) + ray_worker_node_cmd = [ + sys.executable, + "-m", + "ray.util.spark.start_ray_node", + f"--num-cpus={num_cpus_per_node}", + "--block", + f"--address={ray_head_ip}:{ray_head_port}", + f"--memory={heap_memory_per_node}", + f"--object-store-memory={object_store_memory_per_node}", + f"--min-worker-port={worker_port_range_begin}", + f"--max-worker-port={worker_port_range_end - 1}", + f"--dashboard-agent-listen-port={ray_worker_node_dashboard_agent_port}", + *_convert_ray_node_options(worker_node_options), + ] + if ray_temp_dir is not None: + ray_worker_node_cmd.append(f"--temp-dir={ray_temp_dir}") + + ray_worker_node_extra_envs = { + RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or "", + RAY_ON_SPARK_START_RAY_PARENT_PID: str(os.getpid()), + "RAY_ENABLE_WINDOWS_OR_OSX_CLUSTER": "1", + **ray_node_custom_env, + } + + if num_gpus_per_node > 0: + task_resources = context.resources() + + if "gpu" not in task_resources: + raise RuntimeError( + "Could not get the gpu id. Please check the GPU resource " + "configuration." + ) + gpu_addr_list = [ + int(addr.strip()) for addr in task_resources["gpu"].addresses + ] + + available_physical_gpus = get_spark_task_assigned_physical_gpus( + gpu_addr_list + ) + ray_worker_node_cmd.append( + f"--num-gpus={len(available_physical_gpus)}", + ) + ray_worker_node_extra_envs["CUDA_VISIBLE_DEVICES"] = ",".join( + [str(gpu_id) for gpu_id in available_physical_gpus] + ) + + _worker_logger.info( + f"Start Ray worker, command: {' '.join(ray_worker_node_cmd)}" + ) + + try: + is_task_reschedule_failure = False + # Check node id availability + response = requests.post( + url=( + f"http://{ray_head_ip}:{spark_job_server_port}" + "/check_node_id_availability" + ), + json={ + "node_id": node_id, + "spark_job_group_id": spark_job_group_id, + }, + ) + if not response.json()["available"]: + # This case happens when a Ray node goes down unexpectedly + # because its spark worker node went down, and spark tries to + # reschedule the spark task, which triggers node + # creation with a duplicated node id. + # In this case, finish the spark task immediately + # so spark won't try to reschedule this task; + # the Ray autoscaler will trigger a new node creation + # with a new node id, and a new spark job will be created + # to hold it. + is_task_reschedule_failure = True + raise RuntimeError( + "Starting Ray worker node twice with the same node id " + "is not allowed." + ) + + # Notify the job server that the task has been launched.
+ requests.post( + url=( + f"http://{ray_head_ip}:{spark_job_server_port}" + "/notify_task_launched" + ), + json={ + "spark_job_group_id": spark_job_group_id, + }, + ) + + # Note: + # When a pyspark job is cancelled, the UDF python worker processes are killed + # by the "SIGKILL" signal, then the `start_ray_node` process will detect the + # parent-died event (see `ray.util.spark.start_ray_node.check_parent_alive`) and + # then kill the ray worker node process and execute the cleanup routine. + exec_cmd( + ray_worker_node_cmd, + synchronous=True, + extra_env=ray_worker_node_extra_envs, + ) + except Exception as e: + # In the following 2 cases, an exception is raised: + # (1) + # Starting the Ray worker node fails; `e` will contain the detailed + # subprocess stdout/stderr output. + # (2) + # In autoscaling mode, when a Ray worker node is down, the autoscaler will + # try to start a new Ray worker node if necessary, + # and it creates a new spark job to launch the Ray worker node process; + # note the old spark job will reschedule the failed spark task + # and raise the error of "Starting Ray worker node twice with the same + # node id is not allowed". + # + # For either case (1) or case (2), + # to avoid Spark triggering more spark task retries, we swallow the + # exception here to make the spark task exit normally. + err_msg = f"Ray worker node process exited, reason: {e}." + _logger.warning(err_msg) + + yield err_msg, is_task_reschedule_failure + + spark.sparkContext.setJobGroup( + spark_job_group_id, + spark_job_group_desc, + ) + + # Starting a normal spark job (not a barrier spark job) to run ray worker + # nodes; the design rationale is: + # 1. Using a normal spark job, spark tasks can automatically retry + # individually, so we don't need to write additional retry logic. But in + # barrier mode, if one spark task fails, all other spark + # tasks are killed. + # 2. Using a normal spark job, we can support failover when a spark worker + # physical machine crashes. (spark will try to re-schedule the spark task + # to other spark worker nodes) + # 3. Using a barrier mode job, if the cluster resources do not satisfy + # "idle spark task slots >= argument num_spark_task", then the barrier + # job gets stuck and waits until enough idle task slots are available; this + # behavior is not user-friendly, since on a shared spark cluster it is hard + # for the user to estimate how many idle task slots are available at a time. + # With a normal spark job, the job can launch with fewer spark tasks (i.e. + # the user will initially see a ray cluster setup with a smaller worker + # number), and when more task slots become available, it continues to launch + # tasks on the newly available slots, and the user can see the ray cluster + # worker number increase as more slots become available. + job_rdd = spark.sparkContext.parallelize( + list(range(num_worker_nodes)), num_worker_nodes + ) + + if using_stage_scheduling: + resource_profile = _create_resource_profile( + num_cpus_per_node, + num_gpus_per_node, + ) + job_rdd = job_rdd.withResources(resource_profile) + + hook_entry = _create_hook_entry(is_global=(ray_temp_dir is None)) + hook_entry.on_spark_job_created(spark_job_group_id) + + err_msg, is_task_reschedule_failure = job_rdd.mapPartitions( + ray_cluster_job_mapper + ).collect()[0] + if not is_task_reschedule_failure: + spark_job_server.last_worker_error = err_msg + return err_msg + + return None + + +@PublicAPI +def shutdown_ray_cluster() -> None: + """ + Shut down the active ray cluster.
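+ + Example (illustrative usage; the `max_worker_nodes` value is an assumption): + + >>> from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster + >>> address, remote_address = setup_ray_cluster(max_worker_nodes=2) # doctest: +SKIP + >>> shutdown_ray_cluster() # doctest: +SKIP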
+ """ + global _active_ray_cluster + + with _active_ray_cluster_rwlock: + if _active_ray_cluster is None: + raise RuntimeError("No active ray cluster to shut down.") + + _active_ray_cluster.shutdown() + _active_ray_cluster = None + + +_global_ray_cluster_cancel_event = None + + +@DeveloperAPI +class AutoscalingCluster: + """Create a ray on spark autoscaling cluster.""" + + def __init__( + self, + head_resources: dict, + worker_node_types: dict, + extra_provider_config: dict, + upscaling_speed: float, + idle_timeout_minutes: float, + ): + """Create the cluster. + + Args: + head_resources: resources of the head node, including CPU. + worker_node_types: autoscaler node types config for worker nodes. + """ + self._head_resources = head_resources.copy() + self._head_resources["NODE_ID_AS_RESOURCE"] = HEAD_NODE_ID + self._config = self._generate_config( + head_resources, + worker_node_types, + extra_provider_config, + upscaling_speed, + idle_timeout_minutes, + ) + + def _generate_config( + self, + head_resources, + worker_node_types, + extra_provider_config, + upscaling_speed, + idle_timeout_minutes, + ): + base_config = yaml.safe_load( + open( + os.path.join( + os.path.dirname(ray.__file__), + "autoscaler/spark/defaults.yaml", + ) + ) + ) + custom_config = copy.deepcopy(base_config) + custom_config["available_node_types"] = worker_node_types + custom_config["available_node_types"]["ray.head.default"] = { + "resources": head_resources, + "node_config": {}, + "max_workers": 0, + } + + custom_config["max_workers"] = sum( + v["max_workers"] for _, v in worker_node_types.items() + ) + + custom_config["provider"].update(extra_provider_config) + + custom_config["upscaling_speed"] = upscaling_speed + custom_config["idle_timeout_minutes"] = idle_timeout_minutes + + return custom_config + + def start( + self, + ray_head_ip, + ray_head_port, + ray_client_server_port, + ray_temp_dir, + dashboard_options, + head_node_options, + collect_log_to_path, + ray_node_custom_env, + ): + """Start the cluster. + + After this call returns, you can connect to the cluster with + ray.init("auto"). 
+ """ + from ray.util.spark.cluster_init import ( + RAY_ON_SPARK_COLLECT_LOG_TO_PATH, + _append_resources_config, + _convert_ray_node_options, + ) + + if ray_temp_dir is not None: + autoscale_config = os.path.join(ray_temp_dir, "autoscaling_config.json") + else: + autoscale_config = os.path.join( + _get_default_ray_tmp_dir(), "autoscaling_config.json" + ) + with open(autoscale_config, "w") as f: + f.write(json.dumps(self._config)) + + ( + worker_port_range_begin, + worker_port_range_end, + ) = _preallocate_ray_worker_port_range() + + ray_head_node_cmd = [ + sys.executable, + "-m", + "ray.util.spark.start_ray_node", + "--block", + "--head", + f"--node-ip-address={ray_head_ip}", + f"--port={ray_head_port}", + f"--ray-client-server-port={ray_client_server_port}", + f"--autoscaling-config={autoscale_config}", + f"--min-worker-port={worker_port_range_begin}", + f"--max-worker-port={worker_port_range_end - 1}", + *dashboard_options, + ] + + if ray_temp_dir is not None: + ray_head_node_cmd.append(f"--temp-dir={ray_temp_dir}") + + if "CPU" in self._head_resources: + ray_head_node_cmd.append( + "--num-cpus={}".format(self._head_resources.pop("CPU")) + ) + if "GPU" in self._head_resources: + ray_head_node_cmd.append( + "--num-gpus={}".format(self._head_resources.pop("GPU")) + ) + if "memory" in self._head_resources: + ray_head_node_cmd.append( + "--memory={}".format(self._head_resources.pop("memory")) + ) + if "object_store_memory" in self._head_resources: + ray_head_node_cmd.append( + "--object-store-memory={}".format( + self._head_resources.pop("object_store_memory") + ) + ) + + head_node_options = _append_resources_config( + head_node_options, self._head_resources + ) + ray_head_node_cmd.extend(_convert_ray_node_options(head_node_options)) + + extra_env = { + "AUTOSCALER_UPDATE_INTERVAL_S": "1", + RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or "", + RAY_ON_SPARK_START_RAY_PARENT_PID: str(os.getpid()), + **ray_node_custom_env, + } + + self.ray_head_node_cmd = ray_head_node_cmd + + return _start_ray_head_node( + ray_head_node_cmd, synchronous=False, extra_env=extra_env + ) + + +def _start_ray_head_node(ray_head_node_cmd, synchronous, extra_env): + def preexec_function(): + # Make `start_ray_node` script and Ray node process run + # in a separate group, + # otherwise Ray node will be in the same group of parent process, + # if parent process is a Jupyter notebook kernel, when user + # clicks interrupt cell button, SIGINT signal is sent, then Ray node will + # receive SIGINT signal, and it causes Ray node process dies. + # `start_ray_node` script should also run in a separate group + # because on Databricks Runtime, because if Databricks notebook + # is detached, if the children processes don't exit within 1s, + # they will receive SIGKILL, this behavior makes start_ray_node + # doesn't have enough time to complete cleanup work like removing + # temp directory and collecting logs. 
+ os.setpgrp() + + return exec_cmd( + ray_head_node_cmd, + synchronous=synchronous, + extra_env=extra_env, + preexec_fn=preexec_function, + ) + + +_sigterm_signal_installed = False + + +def _install_sigterm_signal(): + global _sigterm_signal_installed + + if _sigterm_signal_installed: + return + + try: + _origin_sigterm_handler = signal.getsignal(signal.SIGTERM) + + def _sigterm_handler(signum, frame): + try: + shutdown_ray_cluster() + except Exception: + # swallow the exception to continue executing the following code in + # the handler + pass + signal.signal( + signal.SIGTERM, _origin_sigterm_handler + ) # Reset to the original signal handler + os.kill( + os.getpid(), signal.SIGTERM + ) # Re-raise the signal to trigger the original behavior + + signal.signal(signal.SIGTERM, _sigterm_handler) + _sigterm_signal_installed = True + except Exception: + _logger.warning("Failed to install the Ray-on-Spark SIGTERM handler.") diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/databricks_hook.py b/.venv/lib/python3.11/site-packages/ray/util/spark/databricks_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..8558c309f3989058ef39f6049d80d625f6bc4569 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/spark/databricks_hook.py @@ -0,0 +1,221 @@ +import os + +from .start_hook_base import RayOnSparkStartHook +from .utils import get_spark_session +import logging +import threading +import time + +_logger = logging.getLogger(__name__) + +DATABRICKS_HOST = "DATABRICKS_HOST" +DATABRICKS_TOKEN = "DATABRICKS_TOKEN" +DATABRICKS_CLIENT_ID = "DATABRICKS_CLIENT_ID" +DATABRICKS_CLIENT_SECRET = "DATABRICKS_CLIENT_SECRET" + + +def verify_databricks_auth_env(): + return (DATABRICKS_HOST in os.environ and DATABRICKS_TOKEN in os.environ) or ( + DATABRICKS_HOST in os.environ + and DATABRICKS_CLIENT_ID in os.environ + and DATABRICKS_CLIENT_SECRET in os.environ + ) + + +def get_databricks_function(func_name): + import IPython + + ip_shell = IPython.get_ipython() + if ip_shell is None: + raise RuntimeError("No IPython environment.") + return ip_shell.ns_table["user_global"][func_name] + + +def get_databricks_display_html_function(): + return get_databricks_function("displayHTML") + + +def get_db_entry_point(): + """ + Return the databricks entry_point instance, which is used for calling some + internal APIs in databricks runtime. + """ + from dbruntime import UserNamespaceInitializer + + user_namespace_initializer = UserNamespaceInitializer.getOrCreate() + return user_namespace_initializer.get_spark_entry_point() + + +def display_databricks_driver_proxy_url(spark_context, port, title): + """ + This helper function creates a proxy URL for databricks driver webapp forwarding. + In databricks runtime, users do not have permission to directly access web + services bound to driver machine ports, but they can visit them via a proxy URL + of the following format: "/driver-proxy/o/{orgId}/{clusterId}/{port}/".
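+ For example (illustrative values): orgId "123456", clusterId "0101-abcdef" + and port 8265 yield the proxy link "/driver-proxy/o/123456/0101-abcdef/8265/".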
+ """ + driverLocal = spark_context._jvm.com.databricks.backend.daemon.driver.DriverLocal + commandContextTags = driverLocal.commandContext().get().toStringMap().apply("tags") + orgId = commandContextTags.apply("orgId") + clusterId = commandContextTags.apply("clusterId") + + proxy_link = f"/driver-proxy/o/{orgId}/{clusterId}/{port}/" + proxy_url = f"https://dbc-dp-{orgId}.cloud.databricks.com{proxy_link}" + + print("To monitor and debug Ray from Databricks, view the dashboard at ") + print(f" {proxy_url}") + + get_databricks_display_html_function()( + f""" + + """ + ) + + +DATABRICKS_AUTO_SHUTDOWN_POLL_INTERVAL_SECONDS = 3 +DATABRICKS_RAY_ON_SPARK_AUTOSHUTDOWN_MINUTES = ( + "DATABRICKS_RAY_ON_SPARK_AUTOSHUTDOWN_MINUTES" +) + + +_DATABRICKS_DEFAULT_TMP_ROOT_DIR = "/local_disk0/tmp" + + +class DefaultDatabricksRayOnSparkStartHook(RayOnSparkStartHook): + def get_default_temp_root_dir(self): + return _DATABRICKS_DEFAULT_TMP_ROOT_DIR + + def on_ray_dashboard_created(self, port): + display_databricks_driver_proxy_url( + get_spark_session().sparkContext, port, "Ray Cluster Dashboard" + ) + + def on_cluster_created(self, ray_cluster_handler): + db_api_entry = get_db_entry_point() + + if self.is_global: + # Disable auto shutdown if + # 1) autoscaling enabled + # because in autoscaling mode, background spark job will be killed + # automatically when ray cluster is idle. + # 2) global mode cluster + # Because global mode cluster is designed to keep running until + # user request to shut down it, and global mode cluster is shared + # by other users, the code here cannot track usage from other users + # so that we don't know whether it is safe to shut down the global + # cluster automatically. + auto_shutdown_minutes = 0 + else: + auto_shutdown_minutes = float( + os.environ.get(DATABRICKS_RAY_ON_SPARK_AUTOSHUTDOWN_MINUTES, "30") + ) + if auto_shutdown_minutes == 0: + _logger.info( + "The Ray cluster will keep running until you manually detach the " + "Databricks notebook or call " + "`ray.util.spark.shutdown_ray_cluster()`." + ) + return + if auto_shutdown_minutes < 0: + raise ValueError( + "You must set " + f"'{DATABRICKS_RAY_ON_SPARK_AUTOSHUTDOWN_MINUTES}' " + "to a value >= 0." + ) + + try: + db_api_entry.getIdleTimeMillisSinceLastNotebookExecution() + except Exception: + _logger.warning( + "Failed to retrieve idle time since last notebook execution, " + "so that we cannot automatically shut down Ray cluster when " + "Databricks notebook is inactive for the specified minutes. " + "You need to manually detach Databricks notebook " + "or call `ray.util.spark.shutdown_ray_cluster()` to shut down " + "Ray cluster on spark." + ) + return + + _logger.info( + "The Ray cluster will be shut down automatically if you don't run " + "commands on the Databricks notebook for " + f"{auto_shutdown_minutes} minutes. You can change the " + "auto-shutdown minutes by setting " + f"'{DATABRICKS_RAY_ON_SPARK_AUTOSHUTDOWN_MINUTES}' environment " + "variable, setting it to 0 means that the Ray cluster keeps running " + "until you manually call `ray.util.spark.shutdown_ray_cluster()` or " + "detach Databricks notebook." + ) + + def auto_shutdown_watcher(): + auto_shutdown_millis = auto_shutdown_minutes * 60 * 1000 + while True: + if ray_cluster_handler.is_shutdown: + # The cluster is shut down. The watcher thread exits. 
+ return + + idle_time = db_api_entry.getIdleTimeMillisSinceLastNotebookExecution() + + if idle_time > auto_shutdown_millis: + from ray.util.spark import cluster_init + + with cluster_init._active_ray_cluster_rwlock: + if ray_cluster_handler is cluster_init._active_ray_cluster: + cluster_init.shutdown_ray_cluster() + return + + time.sleep(DATABRICKS_AUTO_SHUTDOWN_POLL_INTERVAL_SECONDS) + + threading.Thread(target=auto_shutdown_watcher, daemon=True).start() + + def on_spark_job_created(self, job_group_id): + db_api_entry = get_db_entry_point() + db_api_entry.registerBackgroundSparkJobGroup(job_group_id) + + def custom_environment_variables(self): + conf = { + **super().custom_environment_variables(), + # Hardcode `GLOO_SOCKET_IFNAME` to `eth0` for Databricks runtime. + # Torch on DBR does not reliably detect the correct interface to use, + # and ends up selecting the loopback interface, breaking cross-node + # communication. + "GLOO_SOCKET_IFNAME": "eth0", + # 'DISABLE_MLFLOW_INTEGRATION' is the environment variable that disables + # the huggingface transformers MLflow integration; + # it doesn't work well in Databricks runtime, + # so it is disabled by default. + "DISABLE_MLFLOW_INTEGRATION": "TRUE", + } + + if verify_databricks_auth_env(): + conf[DATABRICKS_HOST] = os.environ[DATABRICKS_HOST] + if DATABRICKS_TOKEN in os.environ: + # PAT auth + conf[DATABRICKS_TOKEN] = os.environ[DATABRICKS_TOKEN] + else: + # OAuth + conf[DATABRICKS_CLIENT_ID] = os.environ[DATABRICKS_CLIENT_ID] + conf[DATABRICKS_CLIENT_SECRET] = os.environ[DATABRICKS_CLIENT_SECRET] + else: + warn_msg = ( + "MLflow support is not correctly configured within Ray tasks. " + "To enable MLflow integration, " + "you need to set the environment variables DATABRICKS_HOST + " + "DATABRICKS_TOKEN, or set the environment variables " + "DATABRICKS_HOST + DATABRICKS_CLIENT_ID + DATABRICKS_CLIENT_SECRET " + "before calling `ray.util.spark.setup_ray_cluster`; these variables " + "are used to set up authentication with the Databricks MLflow " + "service. For details, refer to the Databricks documentation on " + "Databricks PAT auth and " + "Databricks OAuth." + ) + get_databricks_display_html_function()( + f"{warn_msg}"
" + ) + + return conf diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/start_hook_base.py b/.venv/lib/python3.11/site-packages/ray/util/spark/start_hook_base.py new file mode 100644 index 0000000000000000000000000000000000000000..d51dbec3a02bb5616b3b77646793dbcbf333c385 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/spark/start_hook_base.py @@ -0,0 +1,18 @@ +class RayOnSparkStartHook: + def __init__(self, is_global): + self.is_global = is_global + + def get_default_temp_root_dir(self): + return "/tmp" + + def on_ray_dashboard_created(self, port): + pass + + def on_cluster_created(self, ray_cluster_handler): + pass + + def on_spark_job_created(self, job_group_id): + pass + + def custom_environment_variables(self): + return {} diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/start_ray_node.py b/.venv/lib/python3.11/site-packages/ray/util/spark/start_ray_node.py new file mode 100644 index 0000000000000000000000000000000000000000..76489b15b9e5eb9230921334194bc4c0677f60d7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/spark/start_ray_node.py @@ -0,0 +1,212 @@ +import os.path +import subprocess +import sys +import time +import shutil +import fcntl +import signal +import socket +import logging +import threading + +from ray.util.spark.cluster_init import ( + RAY_ON_SPARK_COLLECT_LOG_TO_PATH, + RAY_ON_SPARK_START_RAY_PARENT_PID, +) +from ray._private.ray_process_reaper import SIGTERM_GRACE_PERIOD_SECONDS + + +# Spark on ray implementation does not directly invoke `ray start ...` script to create +# ray node subprocess, instead, it creates a subprocess to run this +# `ray.util.spark.start_ray_node` module, and in this module it invokes `ray start ...` +# script to start ray node, the purpose of `start_ray_node` module is to set up a +# exit handler for cleaning ray temp directory when ray node exits. +# When spark driver python process dies, or spark python worker dies, because +# `start_ray_node` starts a daemon thread of `check_parent_alive`, it will detect +# parent process died event and then trigger cleanup work. + + +_logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + arg_list = sys.argv[1:] + + collect_log_to_path = os.environ[RAY_ON_SPARK_COLLECT_LOG_TO_PATH] + + temp_dir_arg_prefix = "--temp-dir=" + temp_dir = None + + for arg in arg_list: + if arg.startswith(temp_dir_arg_prefix): + temp_dir = arg[len(temp_dir_arg_prefix) :] + + if temp_dir is not None: + temp_dir = os.path.normpath(temp_dir) + else: + # This case is for global mode Ray on spark cluster + from ray.util.spark.cluster_init import _get_default_ray_tmp_dir + + temp_dir = _get_default_ray_tmp_dir() + + # Multiple Ray nodes might be launched in the same machine, + # so set `exist_ok` to True + os.makedirs(temp_dir, exist_ok=True) + + ray_cli_cmd = "ray" + lock_file = temp_dir + ".lock" + + lock_fd = os.open(lock_file, os.O_RDWR | os.O_CREAT | os.O_TRUNC) + + # Mutilple ray nodes might start on the same machine, and they are using the + # same temp directory, adding a shared lock representing current ray node is + # using the temp directory. + fcntl.flock(lock_fd, fcntl.LOCK_SH) + + process = subprocess.Popen( + # 'ray start ...' command uses python that is set by + # Shebang #! ..., the Shebang line is hardcoded in ray script, + # it can't be changed to other python executable path. + # to enforce using current python executable, + # turn the subprocess command to + # '`sys.executable` `which ray` start ...' 
+ [sys.executable, shutil.which(ray_cli_cmd), "start", *arg_list], + text=True, + ) + + exit_handler_executed = False + sigterm_handler_executed = False + ON_EXIT_HANDLER_WAIT_TIME = 3 + + def on_exit_handler(): + global exit_handler_executed + + if exit_handler_executed: + # wait for the exit_handler execution to complete in other threads. + time.sleep(ON_EXIT_HANDLER_WAIT_TIME) + return + + exit_handler_executed = True + + try: + # Wait for a while to ensure the children processes of the ray node have + # all exited. + time.sleep(SIGTERM_GRACE_PERIOD_SECONDS + 0.5) + + if process.poll() is None: + # The "ray start ..." command process is still alive. Force-kill it. + process.kill() + + # Release the shared lock, representing that the current ray node no + # longer uses the temp dir. + fcntl.flock(lock_fd, fcntl.LOCK_UN) + + try: + # Acquire an exclusive lock to ensure logs are copied and the dir is + # removed safely. + fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + lock_acquired = True + except BlockingIOError: + # The file has an active shared lock or exclusive lock, meaning there + # are other ray nodes running, or another node is running the temp-dir + # cleanup routine. Skip cleaning the temp dir, and skip copying logs + # to the destination directory as well. + lock_acquired = False + + if lock_acquired: + # This is the final terminated ray node on the current spark worker, so + # start copying logs (including all local ray node logs) to the destination. + if collect_log_to_path: + try: + log_dir_prefix = os.path.basename(temp_dir) + if log_dir_prefix == "ray": + # global mode cluster case, append a timestamp to it to + # avoid name conflicts with the last Ray global cluster log dir. + log_dir_prefix = ( + log_dir_prefix + f"-global-{int(time.time())}" + ) + base_dir = os.path.join( + collect_log_to_path, log_dir_prefix + "-logs" + ) + # Note: multiple Ray node launcher processes might + # execute this line of code, so we set exist_ok=True here. + os.makedirs(base_dir, exist_ok=True) + copy_log_dest_path = os.path.join( + base_dir, + socket.gethostname(), + ) + ray_session_dir = os.readlink( + os.path.join(temp_dir, "session_latest") + ) + shutil.copytree( + os.path.join(ray_session_dir, "logs"), + copy_log_dest_path, + ) + except Exception as e: + _logger.warning( + "Collecting logs to the destination directory failed, " + f"error: {repr(e)}." + ) + + # Start cleaning the temp dir. + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + # swallow any exception. + pass + finally: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + os.close(lock_fd) + + def check_parent_alive() -> None: + orig_parent_pid = int(os.environ[RAY_ON_SPARK_START_RAY_PARENT_PID]) + while True: + time.sleep(0.5) + if os.getppid() != orig_parent_pid: + # Note: raising the SIGTERM signal in a background thread + # doesn't work + sigterm_handler() + break + + threading.Thread(target=check_parent_alive, daemon=True).start() + + try: + + def sighup_handler(*args): + pass + + # When the spark application is terminated, this process will receive + # SIGHUP (coming from the pyspark application termination). + # Ignore the SIGHUP signal, because in this case + # `check_parent_alive` will capture the parent-process-died event, + # kill the node, and execute the cleanup routine; + # but if we enable the default SIGHUP handler, it will kill + # the process immediately, leaving `check_parent_alive` + # no time to execute the cleanup routine.
+ signal.signal(signal.SIGHUP, sighup_handler) + + def sigterm_handler(*args): + global sigterm_handler_executed + if not sigterm_handler_executed: + sigterm_handler_executed = True + process.terminate() + on_exit_handler() + else: + # wait for the exit_handler execution to complete in other threads. + time.sleep(ON_EXIT_HANDLER_WAIT_TIME) + # The SIGTERM exit code is 143. + os._exit(143) + + signal.signal(signal.SIGTERM, sigterm_handler) + while True: + try: + ret_code = process.wait() + break + except KeyboardInterrupt: + # The Jupyter notebook interrupt button triggers a SIGINT signal; + # `start_ray_node` (the subprocess) will receive the SIGINT signal, + # which causes a KeyboardInterrupt exception to be raised. + pass + on_exit_handler() + sys.exit(ret_code) + except Exception: + on_exit_handler() + raise diff --git a/.venv/lib/python3.11/site-packages/ray/util/spark/utils.py b/.venv/lib/python3.11/site-packages/ray/util/spark/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..65bfa4a52f2b60256c2d0a5744de5c8d2eb8f0e3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/util/spark/utils.py @@ -0,0 +1,522 @@ +import subprocess +import os +import sys +import random +import threading +import collections +import logging +import shutil +import time + + +_logger = logging.getLogger("ray.util.spark.utils") + + +def is_in_databricks_runtime(): + return "DATABRICKS_RUNTIME_VERSION" in os.environ + + +def gen_cmd_exec_failure_msg(cmd, return_code, tail_output_deque): + cmd_str = " ".join(cmd) + tail_output = "".join(tail_output_deque) + return ( + f"Command {cmd_str} failed with return code {return_code}, the tail output is " + f"included below.\n{tail_output}\n" + ) + + +def get_configured_spark_executor_memory_bytes(spark): + value_str = spark.conf.get("spark.executor.memory", "1g").lower() + value_num = int(value_str[:-1]) + value_unit = value_str[-1] + unit_map = { + "k": 1024, + "m": 1024 * 1024, + "g": 1024 * 1024 * 1024, + "t": 1024 * 1024 * 1024 * 1024, + } + return value_num * unit_map[value_unit] + + +def exec_cmd( + cmd, + *, + extra_env=None, + synchronous=True, + **kwargs, +): + """ + A convenience wrapper of `subprocess.Popen` for running a command from a Python + script. + If `synchronous` is True, wait until the process terminates, and if the subprocess + return code is not 0, raise an error containing the last 100 lines of output. + If `synchronous` is False, return a `Popen` instance and a deque instance holding + the tail output. + The subprocess stdout / stderr output is streamed to the current + process stdout. + """ + illegal_kwargs = set(kwargs.keys()).intersection({"text", "stdout", "stderr"}) + if illegal_kwargs: + raise ValueError(f"`kwargs` cannot contain {list(illegal_kwargs)}") + + env = kwargs.pop("env", None) + if extra_env is not None and env is not None: + raise ValueError("`extra_env` and `env` cannot be used at the same time") + + env = env if extra_env is None else {**os.environ, **extra_env} + + process = subprocess.Popen( + cmd, + env=env, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + **kwargs, + ) + + tail_output_deque = collections.deque(maxlen=100) + + def redirect_log_thread_fn(): + for line in process.stdout: + # collect the tail logs in `tail_output_deque` + tail_output_deque.append(line) + + # redirect to stdout.
+ sys.stdout.write(line) + + threading.Thread(target=redirect_log_thread_fn, args=()).start() + + if not synchronous: + return process, tail_output_deque + + return_code = process.wait() + if return_code != 0: + raise RuntimeError( + gen_cmd_exec_failure_msg(cmd, return_code, tail_output_deque) + ) + + +def is_port_in_use(host, port): + import socket + from contextlib import closing + + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + return sock.connect_ex((host, port)) == 0 + + +def _wait_service_up(host, port, timeout): + beg_time = time.time() + + while time.time() - beg_time < timeout: + if is_port_in_use(host, port): + return True + time.sleep(1) + + return False + + +def get_random_unused_port( + host, min_port=1024, max_port=65535, max_retries=100, exclude_list=None +): + """ + Get a random unused port. + """ + # Use a true random generator + rng = random.SystemRandom() + + exclude_list = exclude_list or [] + for _ in range(max_retries): + port = rng.randint(min_port, max_port) + if port in exclude_list: + continue + if not is_port_in_use(host, port): + return port + raise RuntimeError( + f"Getting an available port between {min_port} and {max_port} failed." + ) + + +def get_spark_session(): + from pyspark.sql import SparkSession + + spark_session = SparkSession.getActiveSession() + if spark_session is None: + raise RuntimeError( + "The Spark session hasn't been initiated yet. Please use " + "`SparkSession.builder` to create a spark session and connect to a spark " + "cluster." + ) + return spark_session + + +def get_spark_application_driver_host(spark): + return spark.conf.get("spark.driver.host") + + +def get_max_num_concurrent_tasks(spark_context, resource_profile): + """Gets the current max number of concurrent tasks.""" + # pylint: disable=protected-access + ssc = spark_context._jsc.sc() + if resource_profile is not None: + + def dummy_mapper(_): + pass + + # Runs a dummy spark job to register the `resource_profile` + spark_context.parallelize([1], 1).withResources(resource_profile).map( + dummy_mapper + ).collect() + + return ssc.maxNumConcurrentTasks(resource_profile._java_resource_profile) + else: + return ssc.maxNumConcurrentTasks( + ssc.resourceProfileManager().defaultResourceProfile() + ) + + +def _get_spark_worker_total_physical_memory(): + import psutil + + if RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES in os.environ: + return int(os.environ[RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES]) + return psutil.virtual_memory().total + + +def _get_spark_worker_total_shared_memory(): + import shutil + + if RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES in os.environ: + return int(os.environ[RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES]) + + return shutil.disk_usage("/dev/shm").total + + +# The maximum proportion of Ray worker node object store memory size +_RAY_ON_SPARK_MAX_OBJECT_STORE_MEMORY_PROPORTION = 0.8 + +# The buffer offset for calculating Ray node memory.
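+# Illustrative example (the numbers here are assumptions): a spark worker with +# 64 GiB physical memory and 4 Ray node slots gives each Ray node +# 64 GiB / 4 * 0.8 = 12.8 GiB, which is then split between heap memory and the +# object store (see _calc_mem_per_ray_worker_node below).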
+_RAY_ON_SPARK_NODE_MEMORY_BUFFER_OFFSET = 0.8 + + +def calc_mem_ray_head_node(configured_heap_memory_bytes, configured_object_store_bytes): + import psutil + import shutil + + if RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES in os.environ: + available_physical_mem = int( + os.environ[RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES] + ) + else: + available_physical_mem = psutil.virtual_memory().total + + available_physical_mem = ( + available_physical_mem * _RAY_ON_SPARK_NODE_MEMORY_BUFFER_OFFSET + ) + + if RAY_ON_SPARK_DRIVER_SHARED_MEMORY_BYTES in os.environ: + available_shared_mem = int(os.environ[RAY_ON_SPARK_DRIVER_SHARED_MEMORY_BYTES]) + else: + available_shared_mem = shutil.disk_usage("/dev/shm").total + + available_shared_mem = ( + available_shared_mem * _RAY_ON_SPARK_NODE_MEMORY_BUFFER_OFFSET + ) + + heap_mem_bytes, object_store_bytes, warning_msg = _calc_mem_per_ray_node( + available_physical_mem, + available_shared_mem, + configured_heap_memory_bytes, + configured_object_store_bytes, + ) + + if warning_msg is not None: + _logger.warning(warning_msg) + + return heap_mem_bytes, object_store_bytes + + +def _calc_mem_per_ray_worker_node( + num_task_slots, + physical_mem_bytes, + shared_mem_bytes, + configured_heap_memory_bytes, + configured_object_store_bytes, +): + available_physical_mem_per_node = int( + physical_mem_bytes / num_task_slots * _RAY_ON_SPARK_NODE_MEMORY_BUFFER_OFFSET + ) + available_shared_mem_per_node = int( + shared_mem_bytes / num_task_slots * _RAY_ON_SPARK_NODE_MEMORY_BUFFER_OFFSET + ) + return _calc_mem_per_ray_node( + available_physical_mem_per_node, + available_shared_mem_per_node, + configured_heap_memory_bytes, + configured_object_store_bytes, + ) + + +def _calc_mem_per_ray_node( + available_physical_mem_per_node, + available_shared_mem_per_node, + configured_heap_memory_bytes, + configured_object_store_bytes, +): + from ray._private.ray_constants import ( + DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, + OBJECT_STORE_MINIMUM_MEMORY_BYTES, + ) + + warning_msg = None + + object_store_bytes = configured_object_store_bytes or ( + available_physical_mem_per_node * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION + ) + + # If Ray is allowed to use slow storage as the object store, + # we don't need to cap the object store size by the /dev/shm capacity + if not os.environ.get("RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"): + if object_store_bytes > available_shared_mem_per_node: + object_store_bytes = available_shared_mem_per_node + + object_store_bytes_upper_bound = ( + available_physical_mem_per_node + * _RAY_ON_SPARK_MAX_OBJECT_STORE_MEMORY_PROPORTION + ) + + if object_store_bytes > object_store_bytes_upper_bound: + object_store_bytes = object_store_bytes_upper_bound + warning_msg = ( + "Your configured `object_store_memory_worker_node` value " + "is too high and is capped at 80% of the per-Ray-node " + "allocated memory." + ) + + if object_store_bytes < OBJECT_STORE_MINIMUM_MEMORY_BYTES: + if object_store_bytes == available_shared_mem_per_node: + warning_msg = ( + "Your operating system is configured with a too-small /dev/shm " + "size, so the `object_store_memory_worker_node` value is configured " + f"to the minimal size ({OBJECT_STORE_MINIMUM_MEMORY_BYTES} bytes). " + "Please increase the system /dev/shm size." + ) + else: + warning_msg = ( + "You configured a too-small Ray node object store memory size, " + "so the `object_store_memory_worker_node` value is configured " + f"to the minimal size ({OBJECT_STORE_MINIMUM_MEMORY_BYTES} bytes). " + "Please increase the 'object_store_memory_worker_node' argument value."
+ ) + + object_store_bytes = OBJECT_STORE_MINIMUM_MEMORY_BYTES + + object_store_bytes = int(object_store_bytes) + + if configured_heap_memory_bytes is None: + heap_mem_bytes = int(available_physical_mem_per_node - object_store_bytes) + else: + heap_mem_bytes = int(configured_heap_memory_bytes) + + return heap_mem_bytes, object_store_bytes, warning_msg + + +# Users can manually set these environment variables +# if the ray on spark code fails to access the corresponding information. +# Note these environment variables must be set on the spark executor side, +# so you should set them via the spark config +# `spark.executorEnv.[EnvironmentVariableName]` +RAY_ON_SPARK_WORKER_CPU_CORES = "RAY_ON_SPARK_WORKER_CPU_CORES" +RAY_ON_SPARK_WORKER_GPU_NUM = "RAY_ON_SPARK_WORKER_GPU_NUM" +RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES = "RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES" +RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES = "RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES" + +# Users can manually set these environment variables on the spark driver node +# if the ray on spark code fails to access the corresponding information. +RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES = "RAY_ON_SPARK_DRIVER_PHYSICAL_MEMORY_BYTES" +RAY_ON_SPARK_DRIVER_SHARED_MEMORY_BYTES = "RAY_ON_SPARK_DRIVER_SHARED_MEMORY_BYTES" + + +def _get_cpu_cores(): + import multiprocessing + + if RAY_ON_SPARK_WORKER_CPU_CORES in os.environ: + # In some cases, a spark standalone cluster might configure virtual cpu cores + # for the spark worker that differ from the number of physical cpu cores, + # but we cannot easily get the virtual cpu cores configured for the spark + # worker; as a workaround, we provide the environment variable config + # `RAY_ON_SPARK_WORKER_CPU_CORES` for the user. + return int(os.environ[RAY_ON_SPARK_WORKER_CPU_CORES]) + + return multiprocessing.cpu_count() + + +def _get_num_physical_gpus(): + if RAY_ON_SPARK_WORKER_GPU_NUM in os.environ: + # In some cases, a spark standalone cluster might configure only part of the + # physical GPUs for the spark worker, + # but we cannot easily get the related configuration; + # as a workaround, we provide the environment variable config + # `RAY_ON_SPARK_WORKER_GPU_NUM` for the user. + return int(os.environ[RAY_ON_SPARK_WORKER_GPU_NUM]) + + if shutil.which("nvidia-smi") is None: + # The GPU driver is not installed. + return 0 + try: + completed_proc = subprocess.run( + "nvidia-smi --query-gpu=name --format=csv,noheader", + shell=True, + check=True, + text=True, + capture_output=True, + ) + return len(completed_proc.stdout.strip().split("\n")) + except Exception as e: + _logger.info( + "'nvidia-smi --query-gpu=name --format=csv,noheader' command execution " + f"failed, error: {repr(e)}" + ) + return 0 + + +def _get_local_ray_node_slots( + num_cpus, + num_gpus, + num_cpus_per_node, + num_gpus_per_node, +): + if num_cpus_per_node > num_cpus: + raise ValueError( + "The cpu number per Ray worker node should be <= the spark worker node " + f"CPU cores; you set the cpu number per Ray worker node to " + f"{num_cpus_per_node} but " + f"the spark worker node CPU core number is {num_cpus}." + ) + num_ray_node_slots = num_cpus // num_cpus_per_node + + if num_gpus_per_node > 0: + if num_gpus_per_node > num_gpus: + raise ValueError( + "The gpu number per Ray worker node should be <= the spark worker " + "node GPU number; you set the GPU devices number per Ray worker node " + f"to {num_gpus_per_node} but the spark worker node GPU devices number " + f"is {num_gpus}."
+def _get_avail_mem_per_ray_worker_node(
+    num_cpus_per_node,
+    num_gpus_per_node,
+    heap_memory_per_node,
+    object_store_memory_per_node,
+):
+    """
+    Returns a tuple of (
+        ray_worker_node_heap_mem_bytes,
+        ray_worker_node_object_store_bytes,
+        error_message,  # always None
+        warning_message,
+    )
+    """
+    num_cpus = _get_cpu_cores()
+    if num_gpus_per_node > 0:
+        num_gpus = _get_num_physical_gpus()
+    else:
+        num_gpus = 0
+
+    num_ray_node_slots = _get_local_ray_node_slots(
+        num_cpus, num_gpus, num_cpus_per_node, num_gpus_per_node
+    )
+
+    physical_mem_bytes = _get_spark_worker_total_physical_memory()
+    shared_mem_bytes = _get_spark_worker_total_shared_memory()
+
+    (
+        ray_worker_node_heap_mem_bytes,
+        ray_worker_node_object_store_bytes,
+        warning_msg,
+    ) = _calc_mem_per_ray_worker_node(
+        num_ray_node_slots,
+        physical_mem_bytes,
+        shared_mem_bytes,
+        heap_memory_per_node,
+        object_store_memory_per_node,
+    )
+    return (
+        ray_worker_node_heap_mem_bytes,
+        ray_worker_node_object_store_bytes,
+        None,
+        warning_msg,
+    )
+
+
+def get_avail_mem_per_ray_worker_node(
+    spark,
+    heap_memory_per_node,
+    object_store_memory_per_node,
+    num_cpus_per_node,
+    num_gpus_per_node,
+):
+    """
+    Return the available heap memory and object store memory for each Ray
+    worker node, plus an error / warning message if there is one.
+    The return value is a tuple of
+    (ray_worker_node_heap_mem_bytes, ray_worker_node_object_store_bytes,
+     error_message, warning_message).
+    NB: we have one Ray node per spark task.
+    """
+
+    def mapper(_):
+        try:
+            return _get_avail_mem_per_ray_worker_node(
+                num_cpus_per_node,
+                num_gpus_per_node,
+                heap_memory_per_node,
+                object_store_memory_per_node,
+            )
+        except Exception as e:
+            import traceback
+
+            trace_msg = "\n".join(traceback.format_tb(e.__traceback__))
+            return -1, -1, repr(e) + trace_msg, None
+
+    # Run the memory inference routine on the spark executor side, since the
+    # spark worker nodes may have a different machine configuration than the
+    # spark driver node.
+    (
+        inferred_ray_worker_node_heap_mem_bytes,
+        inferred_ray_worker_node_object_store_bytes,
+        err,
+        warning_msg,
+    ) = (
+        spark.sparkContext.parallelize([1], 1).map(mapper).collect()[0]
+    )
+
+    if err is not None:
+        raise RuntimeError(
+            f"Inferring ray worker node available memory failed, error: {err}. "
+            "You can bypass this error by setting the following spark configs: "
+            "spark.executorEnv.RAY_ON_SPARK_WORKER_CPU_CORES, "
+            "spark.executorEnv.RAY_ON_SPARK_WORKER_GPU_NUM, "
+            "spark.executorEnv.RAY_ON_SPARK_WORKER_PHYSICAL_MEMORY_BYTES, "
+            "spark.executorEnv.RAY_ON_SPARK_WORKER_SHARED_MEMORY_BYTES."
+        )
+    if warning_msg is not None:
+        _logger.warning(warning_msg)
+    return (
+        inferred_ray_worker_node_heap_mem_bytes,
+        inferred_ray_worker_node_object_store_bytes,
+    )
+
+
+def get_spark_task_assigned_physical_gpus(gpu_addr_list):
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        visible_cuda_dev_list = [
+            int(dev.strip()) for dev in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        ]
+        return [visible_cuda_dev_list[addr] for addr in gpu_addr_list]
+    else:
+        return gpu_addr_list
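A quick illustration (device lists hypothetical) of the CUDA_VISIBLE_DEVICES remapping performed by `get_spark_task_assigned_physical_gpus`:

    import os
    # The spark task can see physical GPUs 2, 3, 6, 7 and is assigned the
    # logical GPU addresses 1 and 3; these map to physical GPUs 3 and 7.
    os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,6,7"
    assert get_spark_task_assigned_physical_gpus([1, 3]) == [3, 7]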
diff --git a/.venv/lib/python3.11/site-packages/ray/util/timer.py b/.venv/lib/python3.11/site-packages/ray/util/timer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f36aef155ea4dfaf4c693642b4bcf4e1054d046
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/timer.py
@@ -0,0 +1,65 @@
+import time
+
+
+class _Timer:
+    """A running stat for conveniently logging the duration of a code block.
+
+    Example:
+        wait_timer = _Timer()
+        with wait_timer:
+            ray.wait(...)
+
+    Note that this class is *not* thread-safe.
+    """
+
+    def __init__(self, window_size=10):
+        self._window_size = window_size
+        self._samples = []
+        self._units_processed = []
+        self._start_time = None
+        self._total_time = 0.0
+        self.count = 0
+
+    def __enter__(self):
+        assert self._start_time is None, "concurrent updates not supported"
+        self._start_time = time.time()
+
+    def __exit__(self, exc_type, exc_value, tb):
+        assert self._start_time is not None
+        time_delta = time.time() - self._start_time
+        self.push(time_delta)
+        self._start_time = None
+
+    def push(self, time_delta):
+        self._samples.append(time_delta)
+        if len(self._samples) > self._window_size:
+            self._samples.pop(0)
+        self.count += 1
+        self._total_time += time_delta
+
+    def push_units_processed(self, n):
+        self._units_processed.append(n)
+        if len(self._units_processed) > self._window_size:
+            self._units_processed.pop(0)
+
+    def has_units_processed(self):
+        return len(self._units_processed) > 0
+
+    @property
+    def mean(self):
+        if len(self._samples) == 0:
+            return 0.0
+        return float(sum(self._samples)) / len(self._samples)
+
+    @property
+    def mean_units_processed(self):
+        if len(self._units_processed) == 0:
+            return 0.0
+        return float(sum(self._units_processed)) / len(self._units_processed)
+
+    @property
+    def mean_throughput(self):
+        time_total = float(sum(self._samples))
+        if not time_total:
+            return 0.0
+        return float(sum(self._units_processed)) / time_total
diff --git a/.venv/lib/python3.11/site-packages/ray/util/xgboost/__init__.py b/.venv/lib/python3.11/site-packages/ray/util/xgboost/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce2f204c260fcef980e6505804e72977d5a5a1cf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/util/xgboost/__init__.py
@@ -0,0 +1,4 @@
+raise DeprecationWarning(
+    "ray.util.xgboost has been removed as of Ray 2.0. Instead, use the `xgboost-ray` "
+    "library directly or the `XGBoostTrainer` in Ray Train."
+)
diff --git a/.venv/lib/python3.11/site-packages/ray/util/xgboost/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/util/xgboost/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..933da75d95dbae4793f5e4e4603471b55cf2a17f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/util/xgboost/__pycache__/__init__.cpython-311.pyc differ
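A usage sketch for the `_Timer` class added above (illustrative; the sleep stands in for real work and the unit counts are made up):

    import time
    from ray.util.timer import _Timer

    batch_timer = _Timer(window_size=10)
    for _ in range(3):
        with batch_timer:
            time.sleep(0.01)                   # the timed code block
        batch_timer.push_units_processed(128)  # e.g. records per batch

    print(batch_timer.mean)             # mean duration over the window
    print(batch_timer.mean_throughput)  # units per second over the window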