koichi12 commited on
Commit
ed5a2c3
·
verified ·
1 Parent(s): 8479d0d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py +0 -0
  2. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py +116 -0
  7. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py +324 -0
  8. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py +461 -0
  9. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py +0 -0
  11. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py +536 -0
  17. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py +119 -0
  18. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py +111 -0
  19. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py +0 -0
  20. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py +80 -0
  23. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py +0 -0
  24. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml +15 -0
  26. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh +23 -0
  27. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml +144 -0
  28. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py +0 -0
  29. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml +152 -0
  31. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py +29 -0
  32. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py +343 -0
  35. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py +0 -0
  36. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py +201 -0
  37. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py +157 -0
  38. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py +472 -0
  40. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py +541 -0
  41. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py +270 -0
  42. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py +151 -0
  43. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py +522 -0
  44. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py +99 -0
  45. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py +1565 -0
  46. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py +180 -0
  47. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py +100 -0
  48. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py +302 -0
  49. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py +1642 -0
  50. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py +351 -0
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc ADDED
Binary file (6.06 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (18.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc ADDED
Binary file (24.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import stat
4
+
5
+ from ray.autoscaler._private.aliyun.utils import AcsClient
6
+
7
+ # instance status
8
+ PENDING = "Pending"
9
+ RUNNING = "Running"
10
+ STARTING = "Starting"
11
+ STOPPING = "Stopping"
12
+ STOPPED = "Stopped"
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def bootstrap_aliyun(config):
    """Prepare Aliyun networking/SSH resources before launching a cluster.

    Ensures a VPC, a security group, a vSwitch, and an SSH key pair exist,
    recording their IDs in ``config["provider"]`` (and the key path in
    ``config["auth"]``).

    :param config: The cluster configuration dict; mutated in place.
    :return: The same (mutated) config dict.
    """
    # Order matters: the security group and vSwitch both need the VPC id
    # that _get_or_create_vpc stores into config["provider"].
    _get_or_create_vpc(config)
    _get_or_create_security_group(config)
    _get_or_create_vswitch(config)
    _get_or_import_key_pair(config)
    return config
29
+
30
+
31
def _client(config):
    """Build an ``AcsClient`` from the provider section of the cluster config.

    Uses a single retry because bootstrap operations are interactive and
    should fail fast.
    """
    provider = config["provider"]
    return AcsClient(
        access_key=provider.get("access_key"),
        access_key_secret=provider.get("access_key_secret"),
        region_id=provider["region"],
        max_retries=1,
    )
38
+
39
+
40
def _get_or_create_security_group(config):
    """Reuse an existing security group in the VPC, or create a new one.

    Stores the resulting ID in ``config["provider"]["security_group_id"]``.
    Inbound rules from ``config["provider"]["security_group_rule"]`` are
    applied only when a new group is created; an existing group is reused
    as-is.
    """
    cli = _client(config)
    vpc_id = config["provider"]["vpc_id"]

    security_groups = cli.describe_security_groups(vpc_id=vpc_id)
    if security_groups is not None and len(security_groups) > 0:
        config["provider"]["security_group_id"] = security_groups[0][
            "SecurityGroupId"
        ]
        return config

    security_group_id = cli.create_security_group(vpc_id=vpc_id)

    # Fix: the rules are a list of dicts, so the fallback default must be a
    # list. The original used {}, which only worked because an empty dict
    # iterates to nothing.
    for rule in config["provider"].get("security_group_rule", []):
        cli.authorize_security_group(
            security_group_id=security_group_id,
            port_range=rule["port_range"],
            source_cidr_ip=rule["source_cidr_ip"],
            ip_protocol=rule["ip_protocol"],
        )
    config["provider"]["security_group_id"] = security_group_id
58
+
59
+
60
def _get_or_create_vpc(config):
    """Ensure ``config["provider"]["vpc_id"]`` points at an existing VPC.

    Reuses the first VPC reported for the region; otherwise creates one.
    """
    cli = _client(config)
    existing = cli.describe_vpcs()
    if existing is not None and len(existing) > 0:
        config["provider"]["vpc_id"] = existing[0].get("VpcId")
        return

    new_vpc_id = cli.create_vpc()
    if new_vpc_id is not None:
        config["provider"]["vpc_id"] = new_vpc_id
70
+
71
+
72
def _get_or_create_vswitch(config):
    """Ensure ``config["provider"]["v_switch_id"]`` points at a vSwitch.

    Reuses the first vSwitch found in the configured VPC; otherwise
    creates one in the configured zone/CIDR block.
    """
    provider = config["provider"]
    cli = _client(config)

    existing = cli.describe_v_switches(vpc_id=provider["vpc_id"])
    if existing is not None and len(existing) > 0:
        provider["v_switch_id"] = existing[0].get("VSwitchId")
        return

    created_id = cli.create_v_switch(
        vpc_id=provider["vpc_id"],
        zone_id=provider["zone_id"],
        cidr_block=provider["cidr_block"],
    )
    if created_id is not None:
        provider["v_switch_id"] = created_id
87
+
88
+
89
def _get_or_import_key_pair(config):
    """Ensure an SSH key pair exists and ``config["auth"]["ssh_private_key"]`` is set.

    Three cases:
      * a key pair with the configured name already exists on Aliyun ->
        point ``ssh_private_key`` at the expected local copy;
      * no remote key pair and no local key configured -> create one on
        Aliyun and save the returned private key locally;
      * a local private key is configured -> import its public half
        (``<key>.pub``) to Aliyun.
    """
    cli = _client(config)
    key_name = config["provider"].get("key_name", "ray")
    key_path = os.path.expanduser("~/.ssh/{}".format(key_name))
    keypairs = cli.describe_key_pairs(key_pair_name=key_name)

    if keypairs is not None and len(keypairs) > 0:
        if "ssh_private_key" not in config["auth"]:
            logger.info(
                "{} keypair exists, use {} as local ssh key".format(key_name, key_path)
            )
            config["auth"]["ssh_private_key"] = key_path
    else:
        if "ssh_private_key" not in config["auth"]:
            # Create a new keypair on Aliyun and persist the private key.
            resp = cli.create_key_pair(key_pair_name=key_name)
            if resp is not None:
                # Fix: create the file with owner-only permissions from the
                # start. The original opened it with "w+" under the default
                # umask and only chmod'ed afterwards, briefly exposing the
                # private key to other local users.
                fd = os.open(
                    key_path,
                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
                    stat.S_IRUSR | stat.S_IWUSR,
                )
                with os.fdopen(fd, "w") as f:
                    f.write(resp.get("PrivateKeyBody"))
                # Drop to read-only, matching the original final permissions.
                os.chmod(key_path, stat.S_IRUSR)
                config["auth"]["ssh_private_key"] = key_path
        else:
            # Import the public half of the locally configured key.
            public_key_file = config["auth"]["ssh_private_key"] + ".pub"
            with open(public_key_file) as f:
                public_key = f.readline().strip("\n")
            cli.import_key_pair(key_pair_name=key_name, public_key_body=public_key)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import random
3
+ import threading
4
+ import time
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from ray.autoscaler._private.aliyun.config import (
9
+ PENDING,
10
+ RUNNING,
11
+ STOPPED,
12
+ STOPPING,
13
+ bootstrap_aliyun,
14
+ )
15
+ from ray.autoscaler._private.aliyun.utils import AcsClient
16
+ from ray.autoscaler._private.cli_logger import cli_logger
17
+ from ray.autoscaler._private.constants import BOTO_MAX_RETRIES
18
+ from ray.autoscaler._private.log_timer import LogTimer
19
+ from ray.autoscaler.node_provider import NodeProvider
20
+ from ray.autoscaler.tags import (
21
+ TAG_RAY_CLUSTER_NAME,
22
+ TAG_RAY_LAUNCH_CONFIG,
23
+ TAG_RAY_NODE_KIND,
24
+ TAG_RAY_NODE_NAME,
25
+ TAG_RAY_NODE_STATUS,
26
+ TAG_RAY_USER_NODE_TYPE,
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ TAG_BATCH_DELAY = 1
32
+ STOPPING_NODE_DELAY = 1
33
+
34
+
35
class AliyunNodeProvider(NodeProvider):
    """Ray autoscaler ``NodeProvider`` backed by Aliyun ECS.

    Instances are created/queried through an :class:`AcsClient` wrapper
    around the Aliyun SDK.  Tag writes are batched: concurrent
    ``set_node_tags`` callers accumulate pending tags and a single
    "batching thread" flushes them all after ``TAG_BATCH_DELAY`` seconds.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # When True, "terminated" nodes are merely stopped so a later
        # create_node() can restart them instead of launching new ones.
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        self.acs = AcsClient(
            access_key=provider_config["access_key"],
            access_key_secret=provider_config["access_key_secret"],
            region_id=provider_config["region"],
            max_retries=BOTO_MAX_RETRIES,
        )

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Tags that we believe to actually be on the node.
        self.tag_cache = {}
        # Tags that we will soon upload.
        self.tag_cache_pending = defaultdict(dict)
        # Number of threads waiting for a batched tag update.
        self.batch_thread_count = 0
        self.batch_update_done = threading.Event()
        self.batch_update_done.set()
        self.ready_for_new_batch = threading.Event()
        self.ready_for_new_batch.set()
        self.tag_cache_lock = threading.Lock()
        self.count_lock = threading.Lock()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
        """Return IDs of cluster instances in Running or Pending state.

        Side effect: refreshes ``self.cached_nodes`` for the matched nodes.
        """
        tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
        ]
        for k, v in tag_filters.items():
            tags.append(
                {
                    "Key": k,
                    "Value": v,
                }
            )

        instances = self.acs.describe_instances(tags=tags)
        non_terminated_instance = []
        for instance in instances:
            if instance.get("Status") == RUNNING or instance.get("Status") == PENDING:
                non_terminated_instance.append(instance.get("InstanceId"))
                self.cached_nodes[instance.get("InstanceId")] = instance
        return non_terminated_instance

    def is_running(self, node_id: str) -> bool:
        """Return True if the instance is currently in the Running state."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            instance = instances[0]
            return instance.get("Status") == "Running"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def is_terminated(self, node_id: str) -> bool:
        """Return True if the instance is Stopped (treated as terminated)."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            return instance.get("Status") == "Stopped"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def node_tags(self, node_id: str) -> Dict[str, str]:
        """Return the node's tags as a flat {key: value} dict (may be empty)."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            if instance.get("Tags") is not None:
                node_tags = dict()
                for tag in instance.get("Tags").get("Tag"):
                    node_tags[tag.get("TagKey")] = tag.get("TagValue")
                return node_tags
        return dict()

    def external_ip(self, node_id: str) -> str:
        """Return the node's first public IP, polling until one is assigned.

        NOTE(review): loops forever if the instance never gets a public IP.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances)
                instance = instances[0]
                if (
                    instance.get("PublicIpAddress") is not None
                    and instance.get("PublicIpAddress").get("IpAddress") is not None
                ):
                    if len(instance.get("PublicIpAddress").get("IpAddress")) > 0:
                        return instance.get("PublicIpAddress").get("IpAddress")[0]
            cli_logger.error("PublicIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def internal_ip(self, node_id: str) -> str:
        """Return the node's first VPC-private IP, polling until available.

        NOTE(review): loops forever if the instance never reports one.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances) == 1
                instance = instances[0]
                if (
                    instance.get("VpcAttributes") is not None
                    and instance.get("VpcAttributes").get("PrivateIpAddress")
                    is not None
                    and len(
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")
                    )
                    > 0
                ):
                    return (
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")[0]
                    )
            cli_logger.error("InnerIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
        """Queue tag updates for the node; flushed in a shared batch.

        The first caller of a batch becomes the "batching thread": it waits
        TAG_BATCH_DELAY so concurrent callers can pile on, then uploads all
        pending tags at once. Every caller blocks until the batch completes.
        """
        is_batching_thread = False
        with self.tag_cache_lock:
            if not self.tag_cache_pending:
                is_batching_thread = True
                # Wait for threads in the last batch to exit
                self.ready_for_new_batch.wait()
                self.ready_for_new_batch.clear()
                self.batch_update_done.clear()
            self.tag_cache_pending[node_id].update(tags)

        if is_batching_thread:
            time.sleep(TAG_BATCH_DELAY)
            with self.tag_cache_lock:
                self._update_node_tags()
                self.batch_update_done.set()

        with self.count_lock:
            self.batch_thread_count += 1
        self.batch_update_done.wait()

        with self.count_lock:
            self.batch_thread_count -= 1
            if self.batch_thread_count == 0:
                self.ready_for_new_batch.set()

    def _update_node_tags(self):
        """Flush all pending tags, grouping nodes by identical (k, v) pairs."""
        batch_updates = defaultdict(list)

        for node_id, tags in self.tag_cache_pending.items():
            for x in tags.items():
                batch_updates[x].append(node_id)
            self.tag_cache[node_id] = tags

        self.tag_cache_pending = defaultdict(dict)

        self._create_tags(batch_updates)

    def _create_tags(self, batch_updates):
        """Apply each grouped (key, value) tag to its list of node ids."""
        for (k, v), node_ids in batch_updates.items():
            m = "Set tag {}={} on {}".format(k, v, node_ids)
            with LogTimer("AliyunNodeProvider: {}".format(m)):
                # The ray node-name tag maps onto the cloud "Name" tag.
                if k == TAG_RAY_NODE_NAME:
                    k = "Name"

                self.acs.tag_resource(node_ids, [{"Key": k, "Value": v}])

    def create_node(
        self, node_config: Dict[str, Any], tags: Dict[str, str], count: int
    ) -> Optional[Dict[str, Any]]:
        """Create ``count`` nodes, reusing stopped instances when enabled.

        Returns a dict mapping instance id -> instance description for all
        (reused + newly launched) nodes.
        """
        filter_tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
            {"Key": TAG_RAY_NODE_KIND, "Value": tags[TAG_RAY_NODE_KIND]},
            {"Key": TAG_RAY_USER_NODE_TYPE, "Value": tags[TAG_RAY_USER_NODE_TYPE]},
            {"Key": TAG_RAY_LAUNCH_CONFIG, "Value": tags[TAG_RAY_LAUNCH_CONFIG]},
            {"Key": TAG_RAY_NODE_NAME, "Value": tags[TAG_RAY_NODE_NAME]},
        ]

        reused_nodes_dict = {}
        if self.cache_stopped_nodes:
            reuse_nodes_candidate = self.acs.describe_instances(tags=filter_tags)
            if reuse_nodes_candidate:
                with cli_logger.group("Stopping instances to reuse"):
                    reuse_node_ids = []
                    for node in reuse_nodes_candidate:
                        node_id = node.get("InstanceId")
                        status = node.get("Status")
                        if status != STOPPING and status != STOPPED:
                            continue
                        if status == STOPPING:
                            # wait for node stopped
                            while (
                                self.acs.describe_instances(instance_ids=[node_id])[
                                    0
                                ].get("Status")
                                == STOPPING
                            ):
                                # Fix: use the module logger, not the root
                                # logging module, for consistent output.
                                logger.info("wait for %s stop" % node_id)
                                time.sleep(STOPPING_NODE_DELAY)
                        reuse_node_ids.append(node_id)
                        reused_nodes_dict[node.get("InstanceId")] = node
                        self.acs.start_instance(node_id)
                        self.tag_cache[node_id] = node.get("Tags")
                        self.set_node_tags(node_id, tags)
                        if len(reuse_node_ids) == count:
                            break
                count -= len(reuse_node_ids)

        created_nodes_dict = {}
        if count > 0:
            filter_tags.append(
                {"Key": TAG_RAY_NODE_STATUS, "Value": tags[TAG_RAY_NODE_STATUS]}
            )
            instance_id_sets = self.acs.run_instances(
                instance_type=node_config["InstanceType"],
                image_id=node_config["ImageId"],
                tags=filter_tags,
                amount=count,
                vswitch_id=self.provider_config["v_switch_id"],
                security_group_id=self.provider_config["security_group_id"],
                key_pair_name=self.provider_config["key_name"],
            )
            instances = self.acs.describe_instances(instance_ids=instance_id_sets)

            if instances is not None:
                for instance in instances:
                    created_nodes_dict[instance.get("InstanceId")] = instance

        all_created_nodes = reused_nodes_dict
        all_created_nodes.update(created_nodes_dict)
        return all_created_nodes

    def terminate_node(self, node_id: str) -> None:
        """Terminate (or stop, when caching is enabled) a single node."""
        logger.info("terminate node: %s" % node_id)
        if self.cache_stopped_nodes:
            # Fix: the original called .format(node_id) on the RETURN VALUE
            # of logger.info (None), raising AttributeError at runtime.
            logger.info(
                "Stopping instance {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_id)
            )
            self.acs.stop_instance(node_id)
        else:
            self.acs.delete_instance(node_id)

    def terminate_nodes(self, node_ids: List[str]) -> None:
        """Terminate (or stop, when caching is enabled) a batch of nodes."""
        if not node_ids:
            return
        if self.cache_stopped_nodes:
            logger.info(
                "Stopping instances {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_ids)
            )

            self.acs.stop_instances(node_ids)
        else:
            self.acs.delete_instances(node_ids)

    def _get_node(self, node_id):
        """Refresh and get info for this node, updating the cache."""
        self.non_terminated_nodes({})  # Side effect: updates cache

        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        # Node not in {pending, running} -- retry with a point query. This
        # usually means the node was recently preempted or terminated.
        matches = self.acs.describe_instances(instance_ids=[node_id])

        assert len(matches) == 1, "Invalid instance id {}".format(node_id)
        return matches[0]

    def _get_cached_node(self, node_id):
        """Return node info from cache if possible, otherwise fetches it."""
        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        return self._get_node(node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Delegate cluster-config bootstrapping to bootstrap_aliyun."""
        return bootstrap_aliyun(cluster_config)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+
4
+ from aliyunsdkcore import client
5
+ from aliyunsdkcore.acs_exception.exceptions import ClientException, ServerException
6
+ from aliyunsdkecs.request.v20140526.AllocatePublicIpAddressRequest import (
7
+ AllocatePublicIpAddressRequest,
8
+ )
9
+ from aliyunsdkecs.request.v20140526.AuthorizeSecurityGroupRequest import (
10
+ AuthorizeSecurityGroupRequest,
11
+ )
12
+ from aliyunsdkecs.request.v20140526.CreateInstanceRequest import CreateInstanceRequest
13
+ from aliyunsdkecs.request.v20140526.CreateKeyPairRequest import CreateKeyPairRequest
14
+ from aliyunsdkecs.request.v20140526.CreateSecurityGroupRequest import (
15
+ CreateSecurityGroupRequest,
16
+ )
17
+ from aliyunsdkecs.request.v20140526.CreateVpcRequest import CreateVpcRequest
18
+ from aliyunsdkecs.request.v20140526.CreateVSwitchRequest import CreateVSwitchRequest
19
+ from aliyunsdkecs.request.v20140526.DeleteInstanceRequest import DeleteInstanceRequest
20
+ from aliyunsdkecs.request.v20140526.DeleteInstancesRequest import DeleteInstancesRequest
21
+ from aliyunsdkecs.request.v20140526.DeleteKeyPairsRequest import DeleteKeyPairsRequest
22
+ from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import (
23
+ DescribeInstancesRequest,
24
+ )
25
+ from aliyunsdkecs.request.v20140526.DescribeKeyPairsRequest import (
26
+ DescribeKeyPairsRequest,
27
+ )
28
+ from aliyunsdkecs.request.v20140526.DescribeSecurityGroupsRequest import (
29
+ DescribeSecurityGroupsRequest,
30
+ )
31
+ from aliyunsdkecs.request.v20140526.DescribeVpcsRequest import DescribeVpcsRequest
32
+ from aliyunsdkecs.request.v20140526.DescribeVSwitchesRequest import (
33
+ DescribeVSwitchesRequest,
34
+ )
35
+ from aliyunsdkecs.request.v20140526.ImportKeyPairRequest import ImportKeyPairRequest
36
+ from aliyunsdkecs.request.v20140526.RunInstancesRequest import RunInstancesRequest
37
+ from aliyunsdkecs.request.v20140526.StartInstanceRequest import StartInstanceRequest
38
+ from aliyunsdkecs.request.v20140526.StopInstanceRequest import StopInstanceRequest
39
+ from aliyunsdkecs.request.v20140526.StopInstancesRequest import StopInstancesRequest
40
+ from aliyunsdkecs.request.v20140526.TagResourcesRequest import TagResourcesRequest
41
+
42
+
43
+ class AcsClient:
44
+ """
45
+ A wrapper around Aliyun SDK. We use this wrapper in aliyun node provider.
46
+
47
+ Parameters:
48
+ access_key: The AccessKey ID of your aliyun account.
49
+ access_key_secret: The AccessKey secret of your aliyun account.
50
+ region_id: A region is a geographic area where a data center resides.
51
+ Region_id is the ID of region (e.g., cn-hangzhou,
52
+ us-west-1, etc.)
53
+ max_retries: The maximum number of retries each connection.
54
+ """
55
+
56
+ def __init__(self, access_key, access_key_secret, region_id, max_retries):
57
+ self.cli = client.AcsClient(
58
+ ak=access_key,
59
+ secret=access_key_secret,
60
+ max_retry_time=max_retries,
61
+ region_id=region_id,
62
+ )
63
+
64
+ def describe_instances(self, tags=None, instance_ids=None):
65
+ """Query the details of one or more Elastic Compute Service (ECS) instances.
66
+
67
+ :param tags: The tags of the instance.
68
+ :param instance_ids: The IDs of ECS instances
69
+ :return: ECS instance list
70
+ """
71
+ request = DescribeInstancesRequest()
72
+ if tags is not None:
73
+ request.set_Tags(tags)
74
+ if instance_ids is not None:
75
+ request.set_InstanceIds(instance_ids)
76
+ response = self._send_request(request)
77
+ if response is not None:
78
+ instance_list = response.get("Instances").get("Instance")
79
+ return instance_list
80
+ return None
81
+
82
+ def create_instance(
83
+ self,
84
+ instance_type,
85
+ image_id,
86
+ tags,
87
+ key_pair_name,
88
+ optimized="optimized",
89
+ instance_charge_type="PostPaid",
90
+ spot_strategy="SpotWithPriceLimit",
91
+ internet_charge_type="PayByTraffic",
92
+ internet_max_bandwidth_out=5,
93
+ ):
94
+ """Create a subscription or pay-as-you-go ECS instance.
95
+
96
+ :param instance_type: The instance type of the ECS.
97
+ :param image_id: The ID of the image used to create the instance.
98
+ :param tags: The tags of the instance.
99
+ :param key_pair_name: The name of the key pair to be bound to
100
+ the instance.
101
+ :param optimized: Specifies whether the instance is I/O optimized
102
+ :param instance_charge_type: The billing method of the instance.
103
+ Default value: PostPaid.
104
+ :param spot_strategy: The preemption policy for the pay-as-you-go
105
+ instance.
106
+ :param internet_charge_type: The billing method for network usage.
107
+ Default value: PayByTraffic.
108
+ :param internet_max_bandwidth_out: The maximum inbound public
109
+ bandwidth. Unit: Mbit/s.
110
+ :return: The created instance ID.
111
+ """
112
+ request = CreateInstanceRequest()
113
+ request.set_InstanceType(instance_type)
114
+ request.set_ImageId(image_id)
115
+ request.set_IoOptimized(optimized)
116
+ request.set_InstanceChargeType(instance_charge_type)
117
+ request.set_SpotStrategy(spot_strategy)
118
+ request.set_InternetChargeType(internet_charge_type)
119
+ request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
120
+ request.set_KeyPairName(key_pair_name)
121
+ request.set_Tags(tags)
122
+
123
+ response = self._send_request(request)
124
+ if response is not None:
125
+ instance_id = response.get("InstanceId")
126
+ logging.info("instance %s created task submit successfully.", instance_id)
127
+ return instance_id
128
+ logging.error("instance created failed.")
129
+ return None
130
+
131
+ def run_instances(
132
+ self,
133
+ instance_type,
134
+ image_id,
135
+ tags,
136
+ security_group_id,
137
+ vswitch_id,
138
+ key_pair_name,
139
+ amount=1,
140
+ optimized="optimized",
141
+ instance_charge_type="PostPaid",
142
+ spot_strategy="SpotWithPriceLimit",
143
+ internet_charge_type="PayByTraffic",
144
+ internet_max_bandwidth_out=1,
145
+ ):
146
+ """Create one or more pay-as-you-go or subscription
147
+ Elastic Compute Service (ECS) instances
148
+
149
+ :param instance_type: The instance type of the ECS.
150
+ :param image_id: The ID of the image used to create the instance.
151
+ :param tags: The tags of the instance.
152
+ :param security_group_id: The ID of the security group to which to
153
+ assign the instance. Instances in the same
154
+ security group can communicate with
155
+ each other.
156
+ :param vswitch_id: The ID of the vSwitch to which to connect
157
+ the instance.
158
+ :param key_pair_name: The name of the key pair to be bound to
159
+ the instance.
160
+ :param amount: The number of instances that you want to create.
161
+ :param optimized: Specifies whether the instance is I/O optimized
162
+ :param instance_charge_type: The billing method of the instance.
163
+ Default value: PostPaid.
164
+ :param spot_strategy: The preemption policy for the pay-as-you-go
165
+ instance.
166
+ :param internet_charge_type: The billing method for network usage.
167
+ Default value: PayByTraffic.
168
+ :param internet_max_bandwidth_out: The maximum inbound public
169
+ bandwidth. Unit: Mbit/s.
170
+ :return: The created instance IDs.
171
+ """
172
+ request = RunInstancesRequest()
173
+ request.set_InstanceType(instance_type)
174
+ request.set_ImageId(image_id)
175
+ request.set_IoOptimized(optimized)
176
+ request.set_InstanceChargeType(instance_charge_type)
177
+ request.set_SpotStrategy(spot_strategy)
178
+ request.set_InternetChargeType(internet_charge_type)
179
+ request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
180
+ request.set_Tags(tags)
181
+ request.set_Amount(amount)
182
+ request.set_SecurityGroupId(security_group_id)
183
+ request.set_VSwitchId(vswitch_id)
184
+ request.set_KeyPairName(key_pair_name)
185
+
186
+ response = self._send_request(request)
187
+ if response is not None:
188
+ instance_ids = response.get("InstanceIdSets").get("InstanceIdSet")
189
+ return instance_ids
190
+ logging.error("instance created failed.")
191
+ return None
192
+
193
+ def create_security_group(self, vpc_id):
194
+ """Create a security group
195
+
196
+ :param vpc_id: The ID of the VPC in which to create
197
+ the security group.
198
+ :return: The created security group ID.
199
+ """
200
+ request = CreateSecurityGroupRequest()
201
+ request.set_VpcId(vpc_id)
202
+ response = self._send_request(request)
203
+ if response is not None:
204
+ security_group_id = response.get("SecurityGroupId")
205
+ return security_group_id
206
+ return None
207
+
208
+ def describe_security_groups(self, vpc_id=None, tags=None):
209
+ """Query basic information of security groups.
210
+
211
+ :param vpc_id: The ID of the VPC to which the security group belongs.
212
+ :param tags: The tags of the security group.
213
+ :return: Security group list.
214
+ """
215
+ request = DescribeSecurityGroupsRequest()
216
+ if vpc_id is not None:
217
+ request.set_VpcId(vpc_id)
218
+ if tags is not None:
219
+ request.set_Tags(tags)
220
+ response = self._send_request(request)
221
+ if response is not None:
222
+ security_groups = response.get("SecurityGroups").get("SecurityGroup")
223
+ return security_groups
224
+ logging.error("describe security group failed.")
225
+ return None
226
+
227
+ def authorize_security_group(
228
+ self, ip_protocol, port_range, security_group_id, source_cidr_ip
229
+ ):
230
+ """Create an inbound security group rule.
231
+
232
+ :param ip_protocol: The transport layer protocol.
233
+ :param port_range: The range of destination ports relevant to
234
+ the transport layer protocol.
235
+ :param security_group_id: The ID of the destination security group.
236
+ :param source_cidr_ip: The range of source IPv4 addresses.
237
+ CIDR blocks and IPv4 addresses are supported.
238
+ """
239
+ request = AuthorizeSecurityGroupRequest()
240
+ request.set_IpProtocol(ip_protocol)
241
+ request.set_PortRange(port_range)
242
+ request.set_SecurityGroupId(security_group_id)
243
+ request.set_SourceCidrIp(source_cidr_ip)
244
+ self._send_request(request)
245
+
246
+ def create_v_switch(self, vpc_id, zone_id, cidr_block):
247
+ """Create vSwitches to divide the VPC into one or more subnets
248
+
249
+ :param vpc_id: The ID of the VPC to which the VSwitch belongs.
250
+ :param zone_id: The ID of the zone to which
251
+ the target VSwitch belongs.
252
+ :param cidr_block: The CIDR block of the VSwitch.
253
+ :return:
254
+ """
255
+ request = CreateVSwitchRequest()
256
+ request.set_ZoneId(zone_id)
257
+ request.set_VpcId(vpc_id)
258
+ request.set_CidrBlock(cidr_block)
259
+ response = self._send_request(request)
260
+ if response is not None:
261
+ return response.get("VSwitchId")
262
+ else:
263
+ logging.error("create_v_switch vpc_id %s failed.", vpc_id)
264
+ return None
265
+
266
+ def create_vpc(self):
267
+ """Creates a virtual private cloud (VPC).
268
+
269
+ :return: The created VPC ID.
270
+ """
271
+ request = CreateVpcRequest()
272
+ response = self._send_request(request)
273
+ if response is not None:
274
+ return response.get("VpcId")
275
+ return None
276
+
277
+ def describe_vpcs(self):
278
+ """Queries one or more VPCs in a region.
279
+
280
+ :return: VPC list.
281
+ """
282
+ request = DescribeVpcsRequest()
283
+ response = self._send_request(request)
284
+ if response is not None:
285
+ return response.get("Vpcs").get("Vpc")
286
+ return None
287
+
288
+ def tag_resource(self, resource_ids, tags, resource_type="instance"):
289
+ """Create and bind tags to specified ECS resources.
290
+
291
+ :param resource_ids: The IDs of N resources.
292
+ :param tags: The tags of the resource.
293
+ :param resource_type: The type of the resource.
294
+ """
295
+ request = TagResourcesRequest()
296
+ request.set_Tags(tags)
297
+ request.set_ResourceType(resource_type)
298
+ request.set_ResourceIds(resource_ids)
299
+ response = self._send_request(request)
300
+ if response is not None:
301
+ logging.info("instance %s create tag successfully.", resource_ids)
302
+ else:
303
+ logging.error("instance %s create tag failed.", resource_ids)
304
+
305
+ def start_instance(self, instance_id):
306
+ """Start an ECS instance.
307
+
308
+ :param instance_id: The Ecs instance ID.
309
+ """
310
+ request = StartInstanceRequest()
311
+ request.set_InstanceId(instance_id)
312
+ response = self._send_request(request)
313
+
314
+ if response is not None:
315
+ logging.info("instance %s start successfully.", instance_id)
316
+ else:
317
+ logging.error("instance %s start failed.", instance_id)
318
+
319
+ def stop_instance(self, instance_id, force_stop=False):
320
+ """Stop an ECS instance that is in the Running state.
321
+
322
+ :param instance_id: The Ecs instance ID.
323
+ :param force_stop: Specifies whether to forcibly stop the instance.
324
+ :return:
325
+ """
326
+ request = StopInstanceRequest()
327
+ request.set_InstanceId(instance_id)
328
+ request.set_ForceStop(force_stop)
329
+ logging.info("Stop %s command submit successfully.", instance_id)
330
+ self._send_request(request)
331
+
332
+ def stop_instances(self, instance_ids, stopped_mode="StopCharging"):
333
+ """Stop one or more ECS instances that are in the Running state.
334
+
335
+ :param instance_ids: The IDs of instances.
336
+ :param stopped_mode: Specifies whether billing for the instance
337
+ continues after the instance is stopped.
338
+ """
339
+ request = StopInstancesRequest()
340
+ request.set_InstanceIds(instance_ids)
341
+ request.set_StoppedMode(stopped_mode)
342
+ response = self._send_request(request)
343
+ if response is None:
344
+ logging.error("stop_instances failed")
345
+
346
+ def delete_instance(self, instance_id):
347
+ """Release a pay-as-you-go instance or
348
+ an expired subscription instance.
349
+
350
+ :param instance_id: The ID of the instance that you want to release.
351
+ """
352
+ request = DeleteInstanceRequest()
353
+ request.set_InstanceId(instance_id)
354
+ request.set_Force(True)
355
+ logging.info("Delete %s command submit successfully", instance_id)
356
+ self._send_request(request)
357
+
358
+ def delete_instances(self, instance_ids):
359
+ """Release one or more pay-as-you-go instances or
360
+ expired subscription instances.
361
+
362
+ :param instance_ids: The IDs of instances that you want to release.
363
+ """
364
+ request = DeleteInstancesRequest()
365
+ request.set_Force(True)
366
+ request.set_InstanceIds(instance_ids)
367
+ self._send_request(request)
368
+
369
+ def allocate_public_address(self, instance_id):
370
+ """Assign a public IP address to an ECS instance.
371
+
372
+ :param instance_id: The ID of the instance to which you want to
373
+ assign a public IP address.
374
+ :return: The assigned ip.
375
+ """
376
+ request = AllocatePublicIpAddressRequest()
377
+ request.set_InstanceId(instance_id)
378
+ response = self._send_request(request)
379
+ if response is not None:
380
+ return response.get("IpAddress")
381
+
382
+ def create_key_pair(self, key_pair_name):
383
+ """Create an SSH key pair.
384
+
385
+ :param key_pair_name: The name of the key pair.
386
+ :return: The created keypair data.
387
+ """
388
+ request = CreateKeyPairRequest()
389
+ request.set_KeyPairName(key_pair_name)
390
+ response = self._send_request(request)
391
+ if response is not None:
392
+ logging.info("Create Key Pair %s Successfully", response.get("KeyPairId"))
393
+ return response
394
+ else:
395
+ logging.error("Create Key Pair Failed")
396
+ return None
397
+
398
+ def import_key_pair(self, key_pair_name, public_key_body):
399
+ """Import the public key of an RSA-encrypted key pair
400
+ that is generated by a third-party tool.
401
+
402
+ :param key_pair_name: The name of the key pair.
403
+ :param public_key_body: The public key of the key pair.
404
+ """
405
+ request = ImportKeyPairRequest()
406
+ request.set_KeyPairName(key_pair_name)
407
+ request.set_PublicKeyBody(public_key_body)
408
+ self._send_request(request)
409
+
410
+ def delete_key_pairs(self, key_pair_names):
411
+ """Delete one or more SSH key pairs.
412
+
413
+ :param key_pair_names: The name of the key pair.
414
+ :return:
415
+ """
416
+ request = DeleteKeyPairsRequest()
417
+ request.set_KeyPairNames(key_pair_names)
418
+ self._send_request(request)
419
+
420
+ def describe_key_pairs(self, key_pair_name=None):
421
+ """Query one or more key pairs.
422
+
423
+ :param key_pair_name: The name of the key pair.
424
+ :return:
425
+ """
426
+ request = DescribeKeyPairsRequest()
427
+ if key_pair_name is not None:
428
+ request.set_KeyPairName(key_pair_name)
429
+ response = self._send_request(request)
430
+ if response is not None:
431
+ return response.get("KeyPairs").get("KeyPair")
432
+ else:
433
+ return None
434
+
435
+ def describe_v_switches(self, vpc_id=None):
436
+ """Queries one or more VSwitches.
437
+
438
+ :param vpc_id: The ID of the VPC to which the VSwitch belongs.
439
+ :return: VSwitch list.
440
+ """
441
+ request = DescribeVSwitchesRequest()
442
+ if vpc_id is not None:
443
+ request.set_VpcId(vpc_id)
444
+ response = self._send_request(request)
445
+ if response is not None:
446
+ return response.get("VSwitches").get("VSwitch")
447
+ else:
448
+ logging.error("Describe VSwitches Failed.")
449
+ return None
450
+
451
+ def _send_request(self, request):
452
+ """send open api request"""
453
+ request.set_accept_format("json")
454
+ try:
455
+ response_str = self.cli.do_action_with_exception(request)
456
+ response_detail = json.loads(response_str)
457
+ return response_detail
458
+ except (ClientException, ServerException) as e:
459
+ logging.error(request.get_action_name())
460
+ logging.error(e)
461
+ return None
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc ADDED
Binary file (8.36 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc ADDED
Binary file (16.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (24.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc ADDED
Binary file (5.01 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc ADDED
Binary file (4.45 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import logging
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+ from collections import defaultdict
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ import requests
10
+
11
+ from ray.autoscaler._private.constants import WORKER_LIVENESS_CHECK_KEY
12
+ from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType
13
+ from ray.autoscaler.batching_node_provider import (
14
+ BatchingNodeProvider,
15
+ NodeData,
16
+ ScaleRequest,
17
+ )
18
+ from ray.autoscaler.tags import (
19
+ NODE_KIND_HEAD,
20
+ NODE_KIND_WORKER,
21
+ STATUS_UP_TO_DATE,
22
+ STATUS_UPDATE_FAILED,
23
+ TAG_RAY_USER_NODE_TYPE,
24
+ )
25
+
26
# Key for KubeRay label that identifies a Ray pod as head or worker.
KUBERAY_LABEL_KEY_KIND = "ray.io/node-type"
# Key for KubeRay label that identifies the worker group (autoscaler node type) of a
# Ray pod.
KUBERAY_LABEL_KEY_TYPE = "ray.io/group"

# These should be synced with:
# https://github.com/ray-project/kuberay/blob/f2d94ffe213dd8f69481b09c474047cb899fa73b/ray-operator/apis/ray/v1/raycluster_types.go#L165-L171 # noqa
# Kind label value indicating the pod is the head.
KUBERAY_KIND_HEAD = "head"
# Kind label value indicating the pod is the worker.
KUBERAY_KIND_WORKER = "worker"

# KubeRay CRD version
KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1")

# Timeout, in seconds, applied to HTTP requests against the K8s API server.
KUBERAY_REQUEST_TIMEOUT_S = int(os.getenv("KUBERAY_REQUEST_TIMEOUT_S", 60))

# Name of the Ray head pod; set by the environment when running in-cluster.
RAY_HEAD_POD_NAME = os.getenv("RAY_HEAD_POD_NAME")

# https://kubernetes.io/docs/tasks/run-application/access-api-from-pod
# While running in a Pod, your container can create an HTTPS URL for the
# Kubernetes API server by fetching the KUBERNETES_SERVICE_HOST and
# KUBERNETES_SERVICE_PORT_HTTPS environment variables.
KUBERNETES_SERVICE_HOST = os.getenv(
    "KUBERNETES_SERVICE_HOST", "https://kubernetes.default"
)
KUBERNETES_SERVICE_PORT = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443")
KUBERNETES_HOST = f"{KUBERNETES_SERVICE_HOST}:{KUBERNETES_SERVICE_PORT}"
# Key for GKE label that identifies which multi-host replica a pod belongs to
REPLICA_INDEX_KEY = "replicaIndex"

# How often the in-pod service-account token and CA cert are re-read from disk.
TOKEN_REFRESH_PERIOD = datetime.timedelta(minutes=1)

# Design:

# Each modification the autoscaler wants to make is posted to the API server goal state
# (e.g. if the autoscaler wants to scale up, it increases the number of
# replicas of the worker group it wants to scale, if it wants to scale down
# it decreases the number of replicas and adds the exact pods that should be
# terminated to the scaleStrategy).

# KubeRayNodeProvider inherits from BatchingNodeProvider.
# Thus, the autoscaler's create and terminate requests are batched into a single
# Scale Request object which is submitted at the end of autoscaler update.
# KubeRay node provider converts the ScaleRequest into a RayCluster CR patch
# and applies the patch in the submit_scale_request method.

# To reduce potential for race conditions, KubeRayNodeProvider
# aborts the autoscaler update if the operator has not yet processed workersToDelete -
# see KubeRayNodeProvider.safe_to_scale().
# Once it is confirmed that workersToDelete have been cleaned up, KubeRayNodeProvider
# clears the workersToDelete list.


# Note: Log handlers set up in autoscaling monitor entrypoint.
logger = logging.getLogger(__name__)
83
+
84
+
85
def node_data_from_pod(pod: Dict[str, Any]) -> NodeData:
    """Convert a Ray pod dict (as returned by the K8s API) into the NodeData
    record consumed by BatchingNodeProvider.
    """
    node_kind, node_type = kind_and_type(pod)
    node_status = status_tag(pod)
    node_ip = pod_ip(pod)
    replica_idx = _replica_index_label(pod)
    return NodeData(
        kind=node_kind,
        type=node_type,
        replica_index=replica_idx,
        status=node_status,
        ip=node_ip,
    )
96
+
97
+
98
def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]:
    """Read the Ray node kind (head or worker) and the node type
    (worker group name) from a Ray pod's labels.
    """
    labels = pod["metadata"]["labels"]
    if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD:
        kind = NODE_KIND_HEAD
    else:
        kind = NODE_KIND_WORKER
    return kind, labels[KUBERAY_LABEL_KEY_TYPE]
110
+
111
+
112
def _replica_index_label(pod: Dict[str, Any]) -> Optional[str]:
    """Return the replicaIndex label of a Pod in a multi-host TPU worker
    group, or None when the label is absent.

    The label is set by the GKE TPU Ray webhook and has the form
    {$WORKER_GROUP_NAME-$REPLICA_INDEX}, with $REPLICA_INDEX an integer
    in [0, Replicas-1].
    """
    return pod["metadata"]["labels"].get(REPLICA_INDEX_KEY)
120
+
121
+
122
def pod_ip(pod: Dict[str, Any]) -> NodeIP:
    """Return the pod's IP, or a placeholder string when K8s has not
    assigned one yet."""
    status = pod["status"]
    return status.get("podIP", "IP not yet assigned")
124
+
125
+
126
def status_tag(pod: Dict[str, Any]) -> NodeStatus:
    """Map a pod's first container state onto a Ray autoscaler node status.

    See the docstring of batching_node_provider.NodeData for the semantics
    of node status.

    Raises:
        ValueError: If the container state matches none of the known keys.
    """
    # No container statuses yet means the pod is still being scheduled.
    container_statuses = pod["status"].get("containerStatuses")
    if not container_statuses:
        return "pending"

    state = container_statuses[0]["state"]
    if "pending" in state:
        return "pending"
    if "running" in state:
        return STATUS_UP_TO_DATE
    if "waiting" in state:
        return "waiting"
    if "terminated" in state:
        return STATUS_UPDATE_FAILED
    raise ValueError("Unexpected container state.")
149
+
150
+
151
def worker_delete_patch(group_index: str, workers_to_delete: List[NodeID]):
    """Build a JSON patch that replaces a worker group's scaleStrategy with
    the given workersToDelete list."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/scaleStrategy",
        {"workersToDelete": workers_to_delete},
    )
155
+
156
+
157
def worker_replica_patch(group_index: str, target_replicas: int):
    """Build a JSON patch that sets a worker group's replica count."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/replicas",
        target_replicas,
    )
161
+
162
+
163
def replace_patch(path: str, value: Any) -> Dict[str, Any]:
    """Build a single JSON-patch (RFC 6902) 'replace' operation."""
    return dict(op="replace", path=path, value=value)
165
+
166
+
167
def load_k8s_secrets() -> Tuple[Dict[str, str], str]:
    """Load the credentials needed to talk to the K8s API server
    from the in-pod service account mount.

    Returns:
        headers: Request headers carrying the bearer token.
        verify: Path to the cluster CA certificate.
    """
    token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    with open(token_path) as token_file:
        bearer_token = token_file.read()

    auth_headers = {
        "Authorization": "Bearer " + bearer_token,
    }
    ca_cert_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    return auth_headers, ca_cert_path
184
+
185
+
186
def url_from_resource(
    namespace: str,
    path: str,
    kuberay_crd_version: str = KUBERAY_CRD_VER,
    kubernetes_host: str = KUBERNETES_HOST,
) -> str:
    """Convert a resource path into a REST URL for the Kubernetes API server.

    Args:
        namespace: The K8s namespace of the resource.
        path: The part of the resource path that starts with the resource
            type. Supported resource types are "pods" and "rayclusters".
        kuberay_crd_version: The API version of the KubeRay CRD
            (e.g. "v1alpha1", "v1").
        kubernetes_host: The host of the Kubernetes API server. Defaults to
            a value built from $KUBERNETES_SERVICE_HOST and
            $KUBERNETES_SERVICE_PORT ("https://kubernetes.default:443" when
            unset). Since $KUBERNETES_SERVICE_HOST may be a bare IP address,
            the https scheme is prepended here when missing.

    Raises:
        ValueError: If the host explicitly uses plain HTTP.
        NotImplementedError: If the path names an unsupported resource type.
    """
    if kubernetes_host.startswith("http://"):
        raise ValueError("Kubernetes host must be accessed over HTTPS.")
    if not kubernetes_host.startswith("https://"):
        kubernetes_host = "https://" + kubernetes_host

    # Pods are core-API objects; rayclusters live in the ray.io API group.
    if path.startswith("pods"):
        api_group = "/api/v1"
    elif path.startswith("rayclusters"):
        api_group = "/apis/ray.io/" + kuberay_crd_version
    else:
        raise NotImplementedError("Tried to access unknown entity at {}".format(path))

    return f"{kubernetes_host}{api_group}/namespaces/{namespace}/{path}"
221
+
222
+
223
+ def _worker_group_index(raycluster: Dict[str, Any], group_name: str) -> int:
224
+ """Extract worker group index from RayCluster."""
225
+ group_names = [
226
+ spec["groupName"] for spec in raycluster["spec"].get("workerGroupSpecs", [])
227
+ ]
228
+ return group_names.index(group_name)
229
+
230
+
231
+ def _worker_group_max_replicas(
232
+ raycluster: Dict[str, Any], group_index: int
233
+ ) -> Optional[int]:
234
+ """Extract the maxReplicas of a worker group.
235
+
236
+ If maxReplicas is unset, return None, to be interpreted as "no constraint".
237
+ At time of writing, it should be impossible for maxReplicas to be unset, but it's
238
+ better to handle this anyway.
239
+ """
240
+ return raycluster["spec"]["workerGroupSpecs"][group_index].get("maxReplicas")
241
+
242
+
243
+ def _worker_group_replicas(raycluster: Dict[str, Any], group_index: int):
244
+ # 1 is the default replicas value used by the KubeRay operator
245
+ return raycluster["spec"]["workerGroupSpecs"][group_index].get("replicas", 1)
246
+
247
+
248
class IKubernetesHttpApiClient(ABC):
    """Interface for a Kubernetes HTTP API client.

    Exists so that the Kubernetes API client can be mocked in tests.
    """

    @abstractmethod
    def get(self, path: str) -> Dict[str, Any]:
        """Perform a REST GET of a resource with proper headers."""

    @abstractmethod
    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Perform a REST PATCH of a resource with proper headers."""
264
+
265
+
266
class KubernetesHttpApiClient(IKubernetesHttpApiClient):
    """HTTP client for the Kubernetes API server.

    Reads the in-pod service-account token and CA cert via
    load_k8s_secrets(), refreshing them at least every
    TOKEN_REFRESH_PERIOD so that rotated tokens are picked up.
    """

    def __init__(self, namespace: str, kuberay_crd_version: str = KUBERAY_CRD_VER):
        """
        Args:
            namespace: The K8s namespace of the resources to operate on.
            kuberay_crd_version: The API version of the KubeRay CRD.
        """
        self._kuberay_crd_version = kuberay_crd_version
        self._namespace = namespace
        self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        # Loaded lazily on first use (see _get_refreshed_headers_and_verify).
        self._headers, self._verify = None, None

    def _get_refreshed_headers_and_verify(self):
        """Return (headers, verify), re-reading the token from disk when the
        cached copy is missing or older than TOKEN_REFRESH_PERIOD."""
        if (datetime.datetime.now() >= self._token_expires_at) or (
            self._headers is None or self._verify is None
        ):
            logger.info("Refreshing K8s API client token and certs.")
            self._headers, self._verify = load_k8s_secrets()
            self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        return self._headers, self._verify

    def get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers.

        Args:
            path: The part of the resource path that starts with the resource type.

        Returns:
            The JSON response of the GET request.

        Raises:
            HTTPError: If the GET request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )

        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.get(
            url,
            headers=headers,
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if not result.status_code == 200:
            result.raise_for_status()
        return result.json()

    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers.

        Args:
            path: The part of the resource path that starts with the resource type.
            payload: The JSON patch payload.

        Returns:
            The JSON response of the PATCH request.

        Raises:
            HTTPError: If the PATCH request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )
        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.patch(
            url,
            json.dumps(payload),
            # application/json-patch+json marks the body as an RFC 6902 patch.
            headers={**headers, "Content-type": "application/json-patch+json"},
            # Fix: previously this call had no timeout (unlike get()), so a
            # stalled API server could hang the autoscaler indefinitely.
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if not result.status_code == 200:
            result.raise_for_status()
        return result.json()
341
+
342
+
343
class KubeRayNodeProvider(BatchingNodeProvider):  # type: ignore
    """Node provider backed by the KubeRay operator.

    Scale requests batched by BatchingNodeProvider are translated into JSON
    patches against the RayCluster custom resource; the KubeRay operator then
    reconciles the cluster to the patched goal state. See the module-level
    design notes for the full protocol.
    """

    def __init__(
        self,
        provider_config: Dict[str, Any],
        cluster_name: str,
    ):
        logger.info("Creating KubeRayNodeProvider.")
        self.namespace = provider_config["namespace"]
        self.cluster_name = cluster_name

        self.k8s_api_client = KubernetesHttpApiClient(self.namespace)

        # Pod liveness is the operator's job here, so the generic worker
        # liveness check must be explicitly disabled in the provider config.
        assert (
            provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
        ), f"To use KubeRayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
        BatchingNodeProvider.__init__(self, provider_config, cluster_name)

    def get_node_data(self) -> Dict[NodeID, NodeData]:
        """Queries K8s for pods in the RayCluster. Converts that pod data into a
        map of pod name to Ray NodeData, as required by BatchingNodeProvider.
        """
        # Store the raycluster CR
        self._raycluster = self._get(f"rayclusters/{self.cluster_name}")

        # Get the pods resource version.
        # Specifying a resource version in list requests is important for scalability:
        # https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list
        resource_version = self._get_pods_resource_version()
        if resource_version:
            logger.info(
                f"Listing pods for RayCluster {self.cluster_name}"
                f" in namespace {self.namespace}"
                f" at pods resource version >= {resource_version}."
            )

        # Filter pods by cluster_name.
        label_selector = requests.utils.quote(f"ray.io/cluster={self.cluster_name}")

        resource_path = f"pods?labelSelector={label_selector}"
        if resource_version:
            resource_path += (
                f"&resourceVersion={resource_version}"
                + "&resourceVersionMatch=NotOlderThan"
            )

        pod_list = self._get(resource_path)
        fetched_resource_version = pod_list["metadata"]["resourceVersion"]
        logger.info(
            f"Fetched pod data at resource version" f" {fetched_resource_version}."
        )

        # Extract node data from the pod list.
        node_data_dict = {}
        for pod in pod_list["items"]:
            # Kubernetes sets metadata.deletionTimestamp immediately after admitting a
            # request to delete an object. Full removal of the object may take some time
            # after the deletion timestamp is set. See link for details:
            # https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion
            if "deletionTimestamp" in pod["metadata"]:
                # Ignore pods marked for termination.
                continue
            pod_name = pod["metadata"]["name"]
            node_data_dict[pod_name] = node_data_from_pod(pod)
        return node_data_dict

    def submit_scale_request(self, scale_request: ScaleRequest):
        """Converts the scale request generated by BatchingNodeProvider into
        a patch that modifies the RayCluster CR's replicas and/or workersToDelete
        fields. Then submits the patch to the K8s API server.
        """
        # Transform the scale request into a patch payload.
        patch_payload = self._scale_request_to_patch_payload(
            scale_request, self._raycluster
        )

        # Submit the patch to K8s.
        logger.info(
            "Autoscaler is submitting the following patch to RayCluster "
            f"{self.cluster_name} in namespace {self.namespace}."
        )
        logger.info(patch_payload)
        self._submit_raycluster_patch(patch_payload)

    def safe_to_scale(self) -> bool:
        """Returns False iff non_terminated_nodes contains any pods in the RayCluster's
        workersToDelete lists.

        Explanation:
        If there are any workersToDelete which are non-terminated,
        we should wait for the operator to do its job and delete those
        pods. Therefore, we back off the autoscaler update.

        If, on the other hand, all of the workersToDelete have already been cleaned up,
        then we patch away the workersToDelete lists and return True.
        In the future, we may consider having the operator clean up workersToDelete
        on it own:
        https://github.com/ray-project/kuberay/issues/733

        Note (Dmitri):
        It is stylistically bad that this function has a side effect.
        """
        # Get the list of nodes.
        node_set = set(self.node_data_dict.keys())
        worker_groups = self._raycluster["spec"].get("workerGroupSpecs", [])

        # Accumulates the indices of worker groups with non-empty workersToDelete
        non_empty_worker_group_indices = []

        for group_index, worker_group in enumerate(worker_groups):
            workersToDelete = worker_group.get("scaleStrategy", {}).get(
                "workersToDelete", []
            )
            if workersToDelete:
                non_empty_worker_group_indices.append(group_index)
            for worker in workersToDelete:
                if worker in node_set:
                    # The operator hasn't removed this worker yet. Abort
                    # the autoscaler update.
                    logger.warning(f"Waiting for operator to remove worker {worker}.")
                    return False

        # All required workersToDelete have been removed.
        # Clean up the workersToDelete field.
        patch_payload = []
        for group_index in non_empty_worker_group_indices:
            patch = worker_delete_patch(group_index, workers_to_delete=[])
            patch_payload.append(patch)
        if patch_payload:
            logger.info("Cleaning up workers to delete.")
            logger.info(f"Submitting patch {patch_payload}.")
            self._submit_raycluster_patch(patch_payload)

        # It's safe to proceed with the autoscaler update.
        return True

    def _get_pods_resource_version(self) -> str:
        """
        Extract a recent pods resource version by reading the head pod's
        metadata.resourceVersion of the response.

        Returns None when RAY_HEAD_POD_NAME is not set in the environment.
        """
        if not RAY_HEAD_POD_NAME:
            return None
        pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}")
        return pod_resp["metadata"]["resourceVersion"]

    def _scale_request_to_patch_payload(
        self, scale_request: ScaleRequest, raycluster: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """Converts autoscaler scale request into a RayCluster CR patch payload."""
        patch_payload = []
        # Collect patches for replica counts.
        for node_type, target_replicas in scale_request.desired_num_workers.items():
            group_index = _worker_group_index(raycluster, node_type)
            group_max_replicas = _worker_group_max_replicas(raycluster, group_index)
            # Cap the replica count to maxReplicas.
            if group_max_replicas is not None and group_max_replicas < target_replicas:
                logger.warning(
                    "Autoscaler attempted to create "
                    + "more than maxReplicas pods of type {}.".format(node_type)
                )
                target_replicas = group_max_replicas
            # Check if we need to change the target count.
            if target_replicas == _worker_group_replicas(raycluster, group_index):
                # No patch required.
                continue
            # Need to patch replica count. Format the patch and add it to the payload.
            patch = worker_replica_patch(group_index, target_replicas)
            patch_payload.append(patch)

        # Maps node_type to nodes to delete for that group.
        deletion_groups = defaultdict(list)
        for worker in scale_request.workers_to_delete:
            node_type = self.node_tags(worker)[TAG_RAY_USER_NODE_TYPE]
            deletion_groups[node_type].append(worker)

        for node_type, workers_to_delete in deletion_groups.items():
            group_index = _worker_group_index(raycluster, node_type)
            patch = worker_delete_patch(group_index, workers_to_delete)
            patch_payload.append(patch)

        return patch_payload

    def _submit_raycluster_patch(self, patch_payload: List[Dict[str, Any]]):
        """Submits a patch to modify a RayCluster CR."""
        path = "rayclusters/{}".format(self.cluster_name)
        self._patch(path, patch_payload)

    def _get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers."""
        return self.k8s_api_client.get(path)

    def _patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers."""
        return self.k8s_api_client.patch(path, payload)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import subprocess
4
+ import time
5
+
6
+ import ray
7
+ from ray._private import ray_constants
8
+ from ray._private.ray_logging import setup_component_logger
9
+ from ray._private.services import get_node_ip_address
10
+ from ray._private.utils import try_to_create_directory
11
+ from ray._raylet import GcsClient
12
+ from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
13
+ from ray.autoscaler._private.monitor import Monitor
14
+ from ray.autoscaler.v2.instance_manager.config import KubeRayConfigReader
15
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ BACKOFF_S = 5
20
+
21
+
22
def _get_log_dir() -> str:
    """Return the autoscaler log directory, <ray-temp-dir>/session_latest/logs."""
    temp_dir = ray._private.utils.get_ray_temp_dir()
    session = ray._private.ray_constants.SESSION_LATEST
    return os.path.join(temp_dir, session, "logs")
28
+
29
+
30
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"

    # Block until the GCS on the head node answers health checks.
    while True:
        health_check_cmd = [
            "ray",
            "health-check",
            "--address",
            ray_address,
            # Autoscaler Ray version might not exactly match GCS version, so
            # skip the version check when checking GCS status.
            "--skip-version-check",
        ]
        try:
            subprocess.check_call(health_check_cmd)
        except subprocess.CalledProcessError:
            logger.warning(
                f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds."
            )
            time.sleep(BACKOFF_S)
            continue
        logger.info("The Ray head is ready. Starting the autoscaler.")
        break

    # The Ray head container sets up the log directory. Thus, we set up logging
    # only after the Ray head is ready.
    _setup_logging()

    # Reads the RayCluster CR from K8s and derives an autoscaling config from it.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    gcs_client = GcsClient(ray_address)
    if is_autoscaler_v2(fetch_from_server=True, gcs_client=gcs_client):
        from ray.autoscaler.v2.monitor import AutoscalerMonitor as MonitorV2

        monitor = MonitorV2(
            address=gcs_client.address,
            config_reader=KubeRayConfigReader(autoscaling_config_producer),
            log_dir=_get_log_dir(),
            monitor_ip=head_ip,
        )
    else:
        monitor = Monitor(
            address=gcs_client.address,
            # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
            # In this case, it's a callable.
            autoscaling_config=autoscaling_config_producer,
            monitor_ip=head_ip,
            # Let the autoscaler process exit after it hits 5 exceptions.
            # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
            # Kubernetes will then restart the autoscaler container.
            retry_on_failure=False,
        )
    monitor.run()
87
+
88
+
89
def _setup_logging() -> None:
    """Log to autoscaler log file
    (typically, /tmp/ray/session_latest/logs/monitor.*)

    Also log to pod stdout (logs viewable with `kubectl logs <head-pod> -c autoscaler`).
    """
    log_dir = _get_log_dir()
    # The directory should already exist, but try (safely) to create it just in case.
    try_to_create_directory(log_dir)

    # Write logs at info level to monitor.log.
    setup_component_logger(
        logging_level=ray_constants.LOGGER_LEVEL,
        logging_format=ray_constants.LOGGER_FORMAT,
        log_dir=log_dir,
        filename=ray_constants.MONITOR_LOG_FILE_NAME,  # monitor.log
        max_bytes=ray_constants.LOGGING_ROTATE_BYTES,
        backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
    )

    # For the autoscaler, the root logger _also_ needs to write to stderr, not just
    # ray_constants.MONITOR_LOG_FILE_NAME.
    # NOTE(review): logging._StderrHandler is a private stdlib class; unlike
    # StreamHandler it resolves sys.stderr at emit time rather than at creation.
    level = logging.getLevelName(ray_constants.LOGGER_LEVEL.upper())
    stderr_handler = logging._StderrHandler()
    stderr_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))
    stderr_handler.setLevel(level)
    logging.root.setLevel(level)
    logging.root.addHandler(stderr_handler)

    # The stdout handler was set up in the Ray CLI entry point.
    # See ray.scripts.scripts::cli().
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Source:
# https://github.com/kubernetes-client/python/blob/master/kubernetes/utils/quantity.py
from decimal import Decimal, InvalidOperation
from functools import reduce
from typing import Optional

# Maps a GKE TPU accelerator name (the value of the
# cloud.google.com/gke-tpu-accelerator nodeSelector) to its TPU generation.
# Used to get the generation for the TPU-{accelerator}-head resource.
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
gke_tpu_accelerator_to_generation = {
    "tpu-v4-podslice": "v4",
    "tpu-v5-lite-device": "v5e",
    "tpu-v5-lite-podslice": "v5e",
    "tpu-v5p-slice": "v5p",
    "tpu-v6e-slice": "v6e",
}
16
+
17
+
18
def parse_quantity(quantity):
    """
    Parse kubernetes canonical form quantity like 200Mi to a decimal number.
    Supported SI suffixes:
        base1024: Ki | Mi | Gi | Ti | Pi | Ei
        base1000: n | u | m | "" | k | M | G | T | P | E

    See
    https://github.com/kubernetes/apimachinery/blob/master/pkg/api/resource/quantity.go

    Input:
        quantity: string. kubernetes canonical form quantity

    Returns:
        Decimal

    Raises:
        ValueError on invalid or unknown input
    """
    # Numeric inputs need no suffix handling.
    if isinstance(quantity, (int, float, Decimal)):
        return Decimal(quantity)

    # Decimal exponent carried by each suffix letter (power of 1000 or 1024).
    exponents = {
        "n": -3,
        "u": -2,
        "m": -1,
        "K": 1,
        "k": 1,
        "M": 2,
        "G": 3,
        "T": 4,
        "P": 5,
        "E": 6,
    }

    text = str(quantity)
    numeric_part, si_suffix = text, None
    # Two-character binary suffix, e.g. "Mi"; the first char must be a known
    # exponent letter, otherwise the whole string is treated as the number.
    if text.endswith("i") and len(text) >= 2 and text[-2] in exponents:
        numeric_part, si_suffix = text[:-2], text[-2:]
    # One-character decimal suffix, e.g. "k".
    elif text and text[-1] in exponents:
        numeric_part, si_suffix = text[:-1], text[-1:]

    try:
        value = Decimal(numeric_part)
    except InvalidOperation:
        raise ValueError("Invalid number format: {}".format(numeric_part))

    if si_suffix is None:
        return value

    # handle SI inconsistency: lowercase "ki" is not a valid binary suffix.
    if si_suffix == "ki":
        raise ValueError("{} has unknown suffix".format(text))

    scale_base = 1024 if si_suffix.endswith("i") else 1000
    return value * (scale_base ** Decimal(exponents[si_suffix[0]]))
88
+
89
+
90
def tpu_node_selectors_to_type(topology: str, accelerator: str) -> Optional[str]:
    """Convert Kubernetes gke-tpu nodeSelectors to TPU accelerator_type
    for a kuberay TPU worker group.

    Args:
        topology: value of the cloud.google.com/gke-tpu-topology Kubernetes
            nodeSelector, describes the physical topology of the TPU podslice.
        accelerator: value of the cloud.google.com/gke-tpu-accelerator
            nodeSelector, the name of the TPU accelerator, e.g. tpu-v4-podslice

    Returns:
        A string, accelerator_type, e.g. "v4-8", or None if either selector
        is missing/empty.
    """
    if not (topology and accelerator):
        return None

    generation = gke_tpu_accelerator_to_generation[accelerator]
    # A topology such as "2x2x2" describes 2*2*2 = 8 chips in total.
    dimensions = [int(dim) for dim in topology.split("x")]
    num_chips = reduce(lambda acc, dim: acc * dim, dimensions)
    # v4 and v5p count 2 cores per chip here; other generations count 1.
    cores_per_chip = 2 if generation in ("v4", "v5p") else 1
    return f"{generation}-{num_chips * cores_per_chip}"
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (205 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (4.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ from ray.autoscaler._private.util import format_readonly_node_type
4
+ from ray.autoscaler.node_provider import NodeProvider
5
+ from ray.autoscaler.tags import (
6
+ NODE_KIND_HEAD,
7
+ STATUS_UP_TO_DATE,
8
+ TAG_RAY_NODE_KIND,
9
+ TAG_RAY_NODE_NAME,
10
+ TAG_RAY_NODE_STATUS,
11
+ TAG_RAY_USER_NODE_TYPE,
12
+ )
13
+
14
+
15
class ReadOnlyNodeProvider(NodeProvider):
    """A node provider that merely reports the current cluster state.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users."""

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # Maps node_id -> {"node_type": ..., "ip": ...}.
        self.nodes = {}

    def is_readonly(self):
        return True

    def _set_nodes(self, nodes: List[Tuple[str, str]]):
        """Update the set of nodes in the cluster.

        Args:
            nodes: List of (node_id, node_manager_address) tuples.
        """
        # We make up a fake node type for each node (since each node
        # could have its own unique configuration).
        self.nodes = {
            node_id: {
                # Keep prefix in sync with node config gen in monitor.py
                "node_type": format_readonly_node_type(node_id),
                "ip": address,
            }
            for node_id, address in nodes
        }

    def non_terminated_nodes(self, tag_filters):
        return list(self.nodes)

    def is_running(self, node_id):
        return node_id in self.nodes

    def is_terminated(self, node_id):
        return node_id not in self.nodes

    def node_tags(self, node_id):
        # Every node is reported as an up-to-date head node of its own type.
        return {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.nodes[node_id]["node_type"],
            TAG_RAY_NODE_NAME: node_id,
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        }

    def external_ip(self, node_id):
        # Node ids double as addresses in read-only mode.
        return node_id

    def internal_ip(self, node_id):
        return node_id

    def set_node_tags(self, node_id, tags):
        raise AssertionError("Readonly node provider cannot be updated")

    def create_node(self, node_config, tags, count):
        raise AssertionError("Readonly node provider cannot be updated")

    def terminate_node(self, node_id):
        raise AssertionError("Readonly node provider cannot be updated")

    @staticmethod
    def bootstrap_config(cluster_config):
        return cluster_config
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prometheus config file
2
+
3
+ # my global config
4
+ global:
5
+ scrape_interval: 10s
6
+ evaluation_interval: 10s
7
+ scrape_timeout: 10s
8
+
9
+ # use ray file-based service discovery file as scrape target.
10
+ scrape_configs:
11
+ - job_name: 'ray'
12
+ file_sd_configs:
13
+ - files:
14
+ - '/tmp/ray/prom_metrics_service_discovery.json'
15
+ refresh_interval: 1m
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Wait for Ray to write its Prometheus service-discovery file, then restart the
# CloudWatch agent so it picks up the Ray metrics scrape configuration.
# Usage: ray_prometheus_waiter.sh <cluster-name>

MAX_ATTEMPTS=120
DELAY_SECONDS=10
RAY_PROM_METRICS_FILE_PATH="/tmp/ray/prom_metrics_service_discovery.json"
# Quote $1 so a cluster name containing spaces/globs cannot word-split.
CLUSTER_NAME="$1"
while [ "$MAX_ATTEMPTS" -gt 0 ]; do
    if [ -f "$RAY_PROM_METRICS_FILE_PATH" ]; then
        echo "Ray Prometheus metrics service discovery file found at: $RAY_PROM_METRICS_FILE_PATH."
        echo "Restarting cloudwatch agent.This may take a few minutes..."
        sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -m ec2 -a stop
        echo "Cloudwatch agent stopped, starting cloudwatch agent..."
        # Reload the per-cluster agent config stored in SSM Parameter Store.
        sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:AmazonCloudWatch-ray_agent_config_$CLUSTER_NAME"
        echo "Cloudwatch agent successfully restarted!"
        exit 0
    else
        echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Will check again in $DELAY_SECONDS seconds..."
        sleep "$DELAY_SECONDS"
        MAX_ATTEMPTS=$((MAX_ATTEMPTS-1))
    fi
done
echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Ray system metrics will not be available in CloudWatch."
exit 1
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unique identifier for the head node and workers of this cluster.
2
+ cluster_name: default
3
+
4
+ # The maximum number of workers nodes to launch in addition to the head
5
+ # node.
6
+ max_workers: 2
7
+
8
+ # The autoscaler will scale up the cluster faster with higher upscaling speed.
9
+ # E.g., if the task requires adding more nodes then autoscaler will gradually
10
+ # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
11
+ # This number should be > 0.
12
+ upscaling_speed: 1.0
13
+
14
+ # This executes all commands on all nodes in the docker container,
15
+ # and opens all the necessary ports to support the Ray cluster.
16
+ # Empty object means disabled.
17
+ docker: {}
18
+
19
+ # If a node is idle for this many minutes, it will be removed.
20
+ idle_timeout_minutes: 5
21
+
22
+ # Cloud-provider specific configuration.
23
+ provider:
24
+ type: aws
25
+ region: us-west-2
26
+ # Availability zone(s), comma-separated, that nodes may be launched in.
27
+ # Nodes will be launched in the first listed availability zone and will
28
+ # be tried in the subsequent availability zones if launching fails.
29
+ availability_zone: us-west-2a,us-west-2b
30
+ # Whether to allow node reuse. If set to False, nodes will be terminated
31
+ # instead of stopped.
32
+ cache_stopped_nodes: True # If not present, the default is True.
33
+
34
+ # How Ray will authenticate with newly launched nodes.
35
+ auth:
36
+ ssh_user: ubuntu
37
+ # By default Ray creates a new private keypair, but you can also use your own.
38
+ # If you do so, make sure to also set "KeyName" in the head and worker node
39
+ # configurations below.
40
+ # ssh_private_key: /path/to/your/key.pem
41
+
42
+ # Tell the autoscaler the allowed node types and the resources they provide.
43
+ # The key is the name of the node type, which is just for debugging purposes.
44
+ # The node config specifies the launch config and physical instance type.
45
+ available_node_types:
46
+ ray.head.default:
47
+ # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
48
+ # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
49
+ # You can also set custom resources.
50
+ # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
51
+ # resources: {"CPU": 1, "GPU": 1, "custom": 5}
52
+ resources: {}
53
+ # Provider-specific config for this node type, e.g. instance type. By default
54
+ # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
55
+ # For more documentation on available fields, see:
56
+ # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
57
+ node_config:
58
+ InstanceType: m5.large
59
+ # You can provision additional disk space with a conf as follows
60
+ BlockDeviceMappings:
61
+ - DeviceName: /dev/sda1
62
+ Ebs:
63
+ VolumeSize: 256
64
+ # Additional options in the boto docs.
65
+ ray.worker.default:
66
+ # The minimum number of nodes of this type to launch.
67
+ # This number should be >= 0.
68
+ min_workers: 0
69
+ # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
70
+ # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
71
+ # You can also set custom resources.
72
+ # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
73
+ # resources: {"CPU": 1, "GPU": 1, "custom": 5}
74
+ resources: {}
75
+ # Provider-specific config for this node type, e.g. instance type. By default
76
+ # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
77
+ # For more documentation on available fields, see:
78
+ # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
79
+ node_config:
80
+ InstanceType: m5.large
81
+ # Run workers on spot by default. Comment this out to use on-demand.
82
+ InstanceMarketOptions:
83
+ MarketType: spot
84
+ # Additional options can be found in the boto docs, e.g.
85
+ # SpotOptions:
86
+ # MaxPrice: MAX_HOURLY_PRICE
87
+ # Additional options in the boto docs.
88
+
89
+ # Specify the node type of the head node (as configured above).
90
+ head_node_type: ray.head.default
91
+
92
+ # Files or directories to copy to the head and worker nodes. The format is a
93
+ # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
94
+ file_mounts: {
95
+ # "/path1/on/remote/machine": "/path1/on/local/machine",
96
+ # "/path2/on/remote/machine": "/path2/on/local/machine",
97
+ }
98
+
99
+ # Files or directories to copy from the head node to the worker nodes. The format is a
100
+ # list of paths. The same path on the head node will be copied to the worker node.
101
+ # This behavior is a subset of the file_mounts behavior. In the vast majority of cases
102
+ # you should just use file_mounts. Only use this if you know what you're doing!
103
+ cluster_synced_files: []
104
+
105
+ # Whether changes to directories in file_mounts or cluster_synced_files in the head node
106
+ # should sync to the worker node continuously
107
+ file_mounts_sync_continuously: False
108
+
109
+ # Patterns for files to exclude when running rsync up or rsync down
110
+ rsync_exclude: []
111
+
112
+ # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
113
+ # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
114
+ # as a value, the behavior will match git's behavior for finding and using .gitignore files.
115
+ rsync_filter: []
116
+
117
+ # List of commands that will be run before `setup_commands`. If docker is
118
+ # enabled, these commands will run outside the container and before docker
119
+ # is setup.
120
+ initialization_commands: []
121
+
122
+ # List of shell commands to run to set up nodes.
123
+ setup_commands:
124
+ - >-
125
+ (stat $HOME/anaconda3/envs/tensorflow2_p38/ &> /dev/null &&
126
+ echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_p38/bin:$PATH"' >> ~/.bashrc) || true
127
+ - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
128
+
129
+ # Custom commands that will be run on the head node after common setup.
130
+ head_setup_commands:
131
+ - pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions
132
+
133
+ # Custom commands that will be run on worker nodes after common setup.
134
+ worker_setup_commands: []
135
+
136
+ # Command to start ray on the head node. You don't need to change this.
137
+ head_start_ray_commands:
138
+ - ray stop
139
+ - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
140
+
141
+ # Command to start ray on worker nodes. You don't need to change this.
142
+ worker_start_ray_commands:
143
+ - ray stop
144
+ - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unique identifier for the head node and workers of this cluster.
2
+ cluster_name: default
3
+
4
+ # The maximum number of workers nodes to launch in addition to the head
5
+ # node.
6
+ max_workers: 2
7
+
8
+ # The autoscaler will scale up the cluster faster with higher upscaling speed.
9
+ # E.g., if the task requires adding more nodes then autoscaler will gradually
10
+ # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
11
+ # This number should be > 0.
12
+ upscaling_speed: 1.0
13
+
14
+ # This executes all commands on all nodes in the docker container,
15
+ # and opens all the necessary ports to support the Ray cluster.
16
+ # Empty object means disabled.
17
+ docker: {}
18
+
19
+ # If a node is idle for this many minutes, it will be removed.
20
+ idle_timeout_minutes: 5
21
+
22
+ # Cloud-provider specific configuration.
23
+ provider:
24
+ type: azure
25
+ # https://azure.microsoft.com/en-us/global-infrastructure/locations
26
+ location: westus2
27
+ resource_group: ray-cluster
28
+ # set subscription id otherwise the default from az cli will be used
29
+ # subscription_id: 00000000-0000-0000-0000-000000000000
30
+ # set unique subnet mask or a random mask will be used
31
+ # subnet_mask: 10.0.0.0/16
32
+ # set unique id for resources in this cluster
33
+ # if not set a default id will be generated based on the resource group and cluster name
34
+ # unique_id: RAY1
35
+
36
+ # How Ray will authenticate with newly launched nodes.
37
+ auth:
38
+ ssh_user: ubuntu
39
+ # you must specify paths to matching private and public key pair files
40
+ # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
41
+ ssh_private_key: ~/.ssh/id_rsa
42
+ # changes to this should match what is specified in file_mounts
43
+ ssh_public_key: ~/.ssh/id_rsa.pub
44
+
45
+ # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
46
+ # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
47
+ # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
48
+ # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
49
+
50
+ # Tell the autoscaler the allowed node types and the resources they provide.
51
+ # The key is the name of the node type, which is just for debugging purposes.
52
+ # The node config specifies the launch config and physical instance type.
53
+ available_node_types:
54
+ ray.head.default:
55
+ resources: {"CPU": 2}
56
+ # Provider-specific config, e.g. instance type.
57
+ node_config:
58
+ azure_arm_parameters:
59
+ vmSize: Standard_D2s_v3
60
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
61
+ imagePublisher: microsoft-dsvm
62
+ imageOffer: ubuntu-1804
63
+ imageSku: 1804-gen2
64
+ imageVersion: latest
65
+
66
+ ray.worker.default:
67
+ # The minimum number of nodes of this type to launch.
68
+ # This number should be >= 0.
69
+ min_workers: 0
70
+ # The resources provided by this node type.
71
+ resources: {"CPU": 2}
72
+ # Provider-specific config, e.g. instance type.
73
+ node_config:
74
+ azure_arm_parameters:
75
+ vmSize: Standard_D2s_v3
76
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
77
+ imagePublisher: microsoft-dsvm
78
+ imageOffer: ubuntu-1804
79
+ imageSku: 1804-gen2
80
+ imageVersion: latest
81
+ # comment lines below to not use Spot instances
82
+ priority: Spot
83
+ # set a maximum price for spot instances if desired
84
+ # billingProfile:
85
+ # maxPrice: -1
86
+
87
+ # Specify the node type of the head node (as configured above).
88
+ head_node_type: ray.head.default
89
+
90
+ # Files or directories to copy to the head and worker nodes. The format is a
91
+ # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
92
+ file_mounts: {
93
+ # "/path1/on/remote/machine": "/path1/on/local/machine",
94
+ # "/path2/on/remote/machine": "/path2/on/local/machine",
95
+ "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
96
+ }
97
+
98
+ # Files or directories to copy from the head node to the worker nodes. The format is a
99
+ # list of paths. The same path on the head node will be copied to the worker node.
100
+ # This behavior is a subset of the file_mounts behavior. In the vast majority of cases
101
+ # you should just use file_mounts. Only use this if you know what you're doing!
102
+ cluster_synced_files: []
103
+
104
+ # Whether changes to directories in file_mounts or cluster_synced_files in the head node
105
+ # should sync to the worker node continuously
106
+ file_mounts_sync_continuously: False
107
+
108
+ # Patterns for files to exclude when running rsync up or rsync down
109
+ rsync_exclude: []
110
+
111
+ # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
112
+ # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
113
+ # as a value, the behavior will match git's behavior for finding and using .gitignore files.
114
+ rsync_filter: []
115
+
116
+ # List of commands that will be run before `setup_commands`. If docker is
117
+ # enabled, these commands will run outside the container and before docker
118
+ # is setup.
119
+ initialization_commands:
120
+ # get rid of annoying Ubuntu message
121
+ - touch ~/.sudo_as_admin_successful
122
+
123
+ # List of shell commands to run to set up nodes.
124
+ setup_commands:
125
+ # Note: if you're developing Ray, you probably want to create an AMI that
126
+ # has your Ray repo pre-cloned. Then, you can replace the pip installs
127
+ # below with a git checkout <your_sha> (and possibly a recompile).
128
+ - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
129
+ # - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true
130
+ - (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true
131
+ - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
132
+ # Consider uncommenting these if you also want to run apt-get commands during setup
133
+ # - sudo pkill -9 apt-get || true
134
+ # - sudo pkill -9 dpkg || true
135
+ # - sudo dpkg --configure -a
136
+
137
+ # Custom commands that will be run on the head node after common setup.
138
+ head_setup_commands:
139
+ - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4
140
+
141
+ # Custom commands that will be run on worker nodes after common setup.
142
+ worker_setup_commands: []
143
+
144
+ # Command to start ray on the head node. You don't need to change this.
145
+ head_start_ray_commands:
146
+ - ray stop
147
+ - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
148
+
149
+ # Command to start ray on worker nodes. You don't need to change this.
150
+ worker_start_ray_commands:
151
+ - ray stop
152
+ - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.autoscaler.sdk.sdk import (
2
+ bootstrap_config,
3
+ configure_logging,
4
+ create_or_update_cluster,
5
+ fillout_defaults,
6
+ get_docker_host_mount_location,
7
+ get_head_node_ip,
8
+ get_worker_node_ips,
9
+ register_callback_handler,
10
+ request_resources,
11
+ rsync,
12
+ run_on_cluster,
13
+ teardown_cluster,
14
+ )
15
+
16
+ __all__ = [
17
+ "create_or_update_cluster",
18
+ "teardown_cluster",
19
+ "run_on_cluster",
20
+ "rsync",
21
+ "get_head_node_ip",
22
+ "get_worker_node_ips",
23
+ "request_resources",
24
+ "configure_logging",
25
+ "bootstrap_config",
26
+ "fillout_defaults",
27
+ "register_callback_handler",
28
+ "get_docker_host_mount_location",
29
+ ]
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (804 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc ADDED
Binary file (15.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """IMPORTANT: this is an experimental interface and not currently stable."""
2
+
3
+ import json
4
+ import os
5
+ import tempfile
6
+ from contextlib import contextmanager
7
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Union
8
+
9
+ from ray.autoscaler._private import commands
10
+ from ray.autoscaler._private.cli_logger import cli_logger
11
+ from ray.autoscaler._private.event_system import CreateClusterEvent # noqa: F401
12
+ from ray.autoscaler._private.event_system import global_event_system # noqa: F401
13
+ from ray.util.annotations import DeveloperAPI
14
+
15
+
16
+ @DeveloperAPI
17
+ def create_or_update_cluster(
18
+ cluster_config: Union[dict, str],
19
+ *,
20
+ no_restart: bool = False,
21
+ restart_only: bool = False,
22
+ no_config_cache: bool = False
23
+ ) -> Dict[str, Any]:
24
+ """Create or updates an autoscaling Ray cluster from a config json.
25
+
26
+ Args:
27
+ cluster_config (Union[str, dict]): Either the config dict of the
28
+ cluster, or a path pointing to a file containing the config.
29
+ no_restart: Whether to skip restarting Ray services during the
30
+ update. This avoids interrupting running jobs and can be used to
31
+ dynamically adjust autoscaler configuration.
32
+ restart_only: Whether to skip running setup commands and only
33
+ restart Ray. This cannot be used with 'no-restart'.
34
+ no_config_cache: Whether to disable the config cache and fully
35
+ resolve all environment settings from the Cloud provider again.
36
+ """
37
+ with _as_config_file(cluster_config) as config_file:
38
+ return commands.create_or_update_cluster(
39
+ config_file=config_file,
40
+ override_min_workers=None,
41
+ override_max_workers=None,
42
+ no_restart=no_restart,
43
+ restart_only=restart_only,
44
+ yes=True,
45
+ override_cluster_name=None,
46
+ no_config_cache=no_config_cache,
47
+ redirect_command_output=None,
48
+ use_login_shells=True,
49
+ )
50
+
51
+
52
+ @DeveloperAPI
53
+ def teardown_cluster(
54
+ cluster_config: Union[dict, str],
55
+ workers_only: bool = False,
56
+ keep_min_workers: bool = False,
57
+ ) -> None:
58
+ """Destroys all nodes of a Ray cluster described by a config json.
59
+
60
+ Args:
61
+ cluster_config (Union[str, dict]): Either the config dict of the
62
+ cluster, or a path pointing to a file containing the config.
63
+ workers_only: Whether to keep the head node running and only
64
+ teardown worker nodes.
65
+ keep_min_workers: Whether to keep min_workers (as specified
66
+ in the YAML) still running.
67
+ """
68
+ with _as_config_file(cluster_config) as config_file:
69
+ return commands.teardown_cluster(
70
+ config_file=config_file,
71
+ yes=True,
72
+ workers_only=workers_only,
73
+ override_cluster_name=None,
74
+ keep_min_workers=keep_min_workers,
75
+ )
76
+
77
+
78
+ @DeveloperAPI
79
+ def run_on_cluster(
80
+ cluster_config: Union[dict, str],
81
+ *,
82
+ cmd: Optional[str] = None,
83
+ run_env: str = "auto",
84
+ tmux: bool = False,
85
+ stop: bool = False,
86
+ no_config_cache: bool = False,
87
+ port_forward: Optional[commands.Port_forward] = None,
88
+ with_output: bool = False
89
+ ) -> Optional[str]:
90
+ """Runs a command on the specified cluster.
91
+
92
+ Args:
93
+ cluster_config (Union[str, dict]): Either the config dict of the
94
+ cluster, or a path pointing to a file containing the config.
95
+ cmd: the command to run, or None for a no-op command.
96
+ run_env: whether to run the command on the host or in a
97
+ container. Select between "auto", "host" and "docker".
98
+ tmux: whether to run in a tmux session
99
+ stop: whether to stop the cluster after command run
100
+ no_config_cache: Whether to disable the config cache and fully
101
+ resolve all environment settings from the Cloud provider again.
102
+ port_forward ( (int,int) or list[(int,int)]): port(s) to forward.
103
+ with_output: Whether to capture command output.
104
+
105
+ Returns:
106
+ The output of the command as a string.
107
+ """
108
+ with _as_config_file(cluster_config) as config_file:
109
+ return commands.exec_cluster(
110
+ config_file,
111
+ cmd=cmd,
112
+ run_env=run_env,
113
+ screen=False,
114
+ tmux=tmux,
115
+ stop=stop,
116
+ start=False,
117
+ override_cluster_name=None,
118
+ no_config_cache=no_config_cache,
119
+ port_forward=port_forward,
120
+ with_output=with_output,
121
+ )
122
+
123
+
124
+ @DeveloperAPI
125
+ def rsync(
126
+ cluster_config: Union[dict, str],
127
+ *,
128
+ source: Optional[str],
129
+ target: Optional[str],
130
+ down: bool,
131
+ ip_address: Optional[str] = None,
132
+ use_internal_ip: bool = False,
133
+ no_config_cache: bool = False,
134
+ should_bootstrap: bool = True
135
+ ):
136
+ """Rsyncs files to or from the cluster.
137
+
138
+ Args:
139
+ cluster_config (Union[str, dict]): Either the config dict of the
140
+ cluster, or a path pointing to a file containing the config.
141
+ source: rsync source argument.
142
+ target: rsync target argument.
143
+ down: whether we're syncing remote -> local.
144
+ ip_address: Address of node.
145
+ use_internal_ip: Whether the provided ip_address is
146
+ public or private.
147
+ no_config_cache: Whether to disable the config cache and fully
148
+ resolve all environment settings from the Cloud provider again.
149
+ should_bootstrap: whether to bootstrap cluster config before syncing
150
+
151
+ Raises:
152
+ RuntimeError if the cluster head node is not found.
153
+ """
154
+ with _as_config_file(cluster_config) as config_file:
155
+ return commands.rsync(
156
+ config_file=config_file,
157
+ source=source,
158
+ target=target,
159
+ override_cluster_name=None,
160
+ down=down,
161
+ ip_address=ip_address,
162
+ use_internal_ip=use_internal_ip,
163
+ no_config_cache=no_config_cache,
164
+ all_nodes=False,
165
+ should_bootstrap=should_bootstrap,
166
+ )
167
+
168
+
169
+ @DeveloperAPI
170
+ def get_head_node_ip(cluster_config: Union[dict, str]) -> str:
171
+ """Returns head node IP for given configuration file if exists.
172
+
173
+ Args:
174
+ cluster_config (Union[str, dict]): Either the config dict of the
175
+ cluster, or a path pointing to a file containing the config.
176
+
177
+ Returns:
178
+ The ip address of the cluster head node.
179
+
180
+ Raises:
181
+ RuntimeError if the cluster is not found.
182
+ """
183
+ with _as_config_file(cluster_config) as config_file:
184
+ return commands.get_head_node_ip(config_file)
185
+
186
+
187
+ @DeveloperAPI
188
+ def get_worker_node_ips(cluster_config: Union[dict, str]) -> List[str]:
189
+ """Returns worker node IPs for given configuration file.
190
+
191
+ Args:
192
+ cluster_config (Union[str, dict]): Either the config dict of the
193
+ cluster, or a path pointing to a file containing the config.
194
+
195
+ Returns:
196
+ List of worker node ip addresses.
197
+
198
+ Raises:
199
+ RuntimeError if the cluster is not found.
200
+ """
201
+ with _as_config_file(cluster_config) as config_file:
202
+ return commands.get_worker_node_ips(config_file)
203
+
204
+
205
+ @DeveloperAPI
206
+ def request_resources(
207
+ num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
208
+ ) -> None:
209
+ """Command the autoscaler to scale to accommodate the specified requests.
210
+
211
+ The cluster will immediately attempt to scale to accommodate the requested
212
+ resources, bypassing normal upscaling speed constraints. This takes into
213
+ account existing resource usage.
214
+
215
+ For example, suppose you call ``request_resources(num_cpus=100)`` and
216
+ there are 45 currently running tasks, each requiring 1 CPU. Then, enough
217
+ nodes will be added so up to 100 tasks can run concurrently. It does
218
+ **not** add enough nodes so that 145 tasks can run.
219
+
220
+ This call is only a hint to the autoscaler. The actual resulting cluster
221
+ size may be slightly larger or smaller than expected depending on the
222
+ internal bin packing algorithm and max worker count restrictions.
223
+
224
+ Args:
225
+ num_cpus: Scale the cluster to ensure this number of CPUs are
226
+ available. This request is persistent until another call to
227
+ request_resources() is made to override.
228
+ bundles (List[ResourceDict]): Scale the cluster to ensure this set of
229
+ resource shapes can fit. This request is persistent until another
230
+ call to request_resources() is made to override.
231
+
232
+ Examples:
233
+ >>> from ray.autoscaler.sdk import request_resources
234
+ >>> # Request 1000 CPUs.
235
+ >>> request_resources(num_cpus=1000) # doctest: +SKIP
236
+ >>> # Request 64 CPUs and also fit a 1-GPU/4-CPU task.
237
+ >>> request_resources( # doctest: +SKIP
238
+ ... num_cpus=64, bundles=[{"GPU": 1, "CPU": 4}])
239
+ >>> # Same as requesting num_cpus=3.
240
+ >>> request_resources( # doctest: +SKIP
241
+ ... bundles=[{"CPU": 1}, {"CPU": 1}, {"CPU": 1}])
242
+ """
243
+ if num_cpus is not None and not isinstance(num_cpus, int):
244
+ raise TypeError("num_cpus should be of type int.")
245
+ if bundles is not None:
246
+ if isinstance(bundles, List):
247
+ for bundle in bundles:
248
+ if isinstance(bundle, Dict):
249
+ for key in bundle.keys():
250
+ if not (isinstance(key, str) and isinstance(bundle[key], int)):
251
+ raise TypeError(
252
+ "each bundle key should be str and value as int."
253
+ )
254
+ else:
255
+ raise TypeError("each bundle should be a Dict.")
256
+ else:
257
+ raise TypeError("bundles should be of type List")
258
+
259
+ return commands.request_resources(num_cpus, bundles)
260
+
261
+
262
+ @DeveloperAPI
263
+ def configure_logging(
264
+ log_style: Optional[str] = None,
265
+ color_mode: Optional[str] = None,
266
+ verbosity: Optional[int] = None,
267
+ ):
268
+ """Configures logging for cluster command calls.
269
+
270
+ Args:
271
+ log_style: If 'pretty', outputs with formatting and color.
272
+ If 'record', outputs record-style without formatting.
273
+ 'auto' defaults to 'pretty', and disables pretty logging
274
+ if stdin is *not* a TTY. Defaults to "auto".
275
+ color_mode (str):
276
+ Can be "true", "false", or "auto".
277
+
278
+ Enables or disables `colorful`.
279
+
280
+ If `color_mode` is "auto", is set to `not stdout.isatty()`
281
+ vebosity (int):
282
+ Output verbosity (0, 1, 2, 3).
283
+
284
+ Low verbosity will disable `verbose` and `very_verbose` messages.
285
+
286
+ """
287
+ cli_logger.configure(
288
+ log_style=log_style, color_mode=color_mode, verbosity=verbosity
289
+ )
290
+
291
+
292
+ @contextmanager
293
+ @DeveloperAPI
294
+ def _as_config_file(cluster_config: Union[dict, str]) -> Iterator[str]:
295
+ if isinstance(cluster_config, dict):
296
+ tmp = tempfile.NamedTemporaryFile("w", prefix="autoscaler-sdk-tmp-")
297
+ tmp.write(json.dumps(cluster_config))
298
+ tmp.flush()
299
+ cluster_config = tmp.name
300
+ if not os.path.exists(cluster_config):
301
+ raise ValueError("Cluster config not found {}".format(cluster_config))
302
+ yield cluster_config
303
+
304
+
305
+ @DeveloperAPI
306
+ def bootstrap_config(
307
+ cluster_config: Dict[str, Any], no_config_cache: bool = False
308
+ ) -> Dict[str, Any]:
309
+ """Validate and add provider-specific fields to the config. For example,
310
+ IAM/authentication may be added here."""
311
+ return commands._bootstrap_config(cluster_config, no_config_cache)
312
+
313
+
314
+ @DeveloperAPI
315
+ def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
316
+ """Fillout default values for a cluster_config based on the provider."""
317
+ from ray.autoscaler._private.util import fillout_defaults
318
+
319
+ return fillout_defaults(config)
320
+
321
+
322
+ @DeveloperAPI
323
+ def register_callback_handler(
324
+ event_name: str,
325
+ callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
326
+ ) -> None:
327
+ """Registers a callback handler for autoscaler events.
328
+
329
+ Args:
330
+ event_name: Event that callback should be called on. See
331
+ CreateClusterEvent for details on the events available to be
332
+ registered against.
333
+ callback: Callable object that is invoked
334
+ when specified event occurs.
335
+ """
336
+ global_event_system.add_callback_handler(event_name, callback)
337
+
338
+
339
+ @DeveloperAPI
340
+ def get_docker_host_mount_location(cluster_name: str) -> str:
341
+ """Return host path that Docker mounts attach to."""
342
+ docker_mount_prefix = "/tmp/ray_tmp_mount/{cluster_name}"
343
+ return docker_mount_prefix.format(cluster_name=cluster_name)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from queue import Queue
3
+ from typing import List, Optional
4
+
5
+ from ray._raylet import GcsClient
6
+ from ray.autoscaler._private.providers import _get_node_provider
7
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
8
+ from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import (
9
+ KubeRayProvider,
10
+ )
11
+ from ray.autoscaler.v2.instance_manager.cloud_providers.read_only.cloud_provider import ( # noqa
12
+ ReadOnlyProvider,
13
+ )
14
+ from ray.autoscaler.v2.instance_manager.config import (
15
+ AutoscalingConfig,
16
+ IConfigReader,
17
+ Provider,
18
+ )
19
+ from ray.autoscaler.v2.instance_manager.instance_manager import (
20
+ InstanceManager,
21
+ InstanceUpdatedSubscriber,
22
+ )
23
+ from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
24
+ from ray.autoscaler.v2.instance_manager.node_provider import (
25
+ ICloudInstanceProvider,
26
+ NodeProviderAdapter,
27
+ )
28
+ from ray.autoscaler.v2.instance_manager.reconciler import Reconciler
29
+ from ray.autoscaler.v2.instance_manager.storage import InMemoryStorage
30
+ from ray.autoscaler.v2.instance_manager.subscribers.cloud_instance_updater import (
31
+ CloudInstanceUpdater,
32
+ )
33
+ from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopper
34
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
35
+ from ray.autoscaler.v2.scheduler import ResourceDemandScheduler
36
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
37
+ from ray.core.generated.autoscaler_pb2 import AutoscalingState
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ class Autoscaler:
43
+ def __init__(
44
+ self,
45
+ session_name: str,
46
+ config_reader: IConfigReader,
47
+ gcs_client: GcsClient,
48
+ event_logger: Optional[AutoscalerEventLogger] = None,
49
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
50
+ ) -> None:
51
+ """
52
+ Args:
53
+ session_name: The name of the ray session.
54
+ config_reader: The config reader.
55
+ gcs_client: The GCS client.
56
+ event_logger: The event logger for emitting cluster events.
57
+ metrics_reporter: The metrics reporter for emitting cluster metrics.
58
+ """
59
+
60
+ self._config_reader = config_reader
61
+
62
+ config = config_reader.get_cached_autoscaling_config()
63
+ logger.info(f"Using Autoscaling Config: \n{config.dump()}")
64
+
65
+ self._gcs_client = gcs_client
66
+ self._cloud_instance_provider = None
67
+ self._instance_manager = None
68
+ self._ray_stop_errors_queue = Queue()
69
+ self._ray_install_errors_queue = Queue()
70
+ self._event_logger = event_logger
71
+ self._metrics_reporter = metrics_reporter
72
+
73
+ self._init_cloud_instance_provider(config, config_reader)
74
+ self._init_instance_manager(
75
+ session_name=session_name,
76
+ config=config,
77
+ cloud_provider=self._cloud_instance_provider,
78
+ gcs_client=self._gcs_client,
79
+ )
80
+ self._scheduler = ResourceDemandScheduler(self._event_logger)
81
+
82
+ def _init_cloud_instance_provider(
83
+ self, config: AutoscalingConfig, config_reader: IConfigReader
84
+ ):
85
+ """
86
+ Initialize the cloud provider, and its dependencies (the v1 node provider)
87
+
88
+ Args:
89
+ config: The autoscaling config.
90
+ config_reader: The config reader.
91
+
92
+ """
93
+ provider_config = config.get_provider_config()
94
+ if provider_config["type"] == "kuberay":
95
+ provider_config["head_node_type"] = config.get_head_node_type()
96
+ self._cloud_instance_provider = KubeRayProvider(
97
+ config.get_config("cluster_name"),
98
+ provider_config,
99
+ )
100
+ elif config.provider == Provider.READ_ONLY:
101
+ provider_config["gcs_address"] = self._gcs_client.address
102
+ self._cloud_instance_provider = ReadOnlyProvider(
103
+ provider_config=provider_config,
104
+ )
105
+ else:
106
+ node_provider_v1 = _get_node_provider(
107
+ provider_config,
108
+ config.get_config("cluster_name"),
109
+ )
110
+
111
+ self._cloud_instance_provider = NodeProviderAdapter(
112
+ v1_provider=node_provider_v1,
113
+ config_reader=config_reader,
114
+ )
115
+
116
+ def _init_instance_manager(
117
+ self,
118
+ session_name: str,
119
+ cloud_provider: ICloudInstanceProvider,
120
+ gcs_client: GcsClient,
121
+ config: AutoscalingConfig,
122
+ ):
123
+ """
124
+ Initialize the instance manager, and its dependencies.
125
+ """
126
+
127
+ instance_storage = InstanceStorage(
128
+ cluster_id=session_name,
129
+ storage=InMemoryStorage(),
130
+ )
131
+ subscribers: List[InstanceUpdatedSubscriber] = []
132
+ subscribers.append(CloudInstanceUpdater(cloud_provider=cloud_provider))
133
+ subscribers.append(
134
+ RayStopper(gcs_client=gcs_client, error_queue=self._ray_stop_errors_queue)
135
+ )
136
+ if not config.disable_node_updaters():
137
+ # Supporting ray installer is only needed for providers that doesn't
138
+ # install or manage ray (e.g. AWS, GCP). These providers will be
139
+ # supported in the future.
140
+ raise NotImplementedError(
141
+ "RayInstaller is not supported yet in current "
142
+ "release of the Autoscaler V2. Therefore, providers "
143
+ "that update nodes (with `disable_node_updaters` set to True) "
144
+ "are not supported yet. Only KubeRay is supported for now which sets "
145
+ "disable_node_updaters to True in provider's config."
146
+ )
147
+
148
+ self._instance_manager = InstanceManager(
149
+ instance_storage=instance_storage,
150
+ instance_status_update_subscribers=subscribers,
151
+ )
152
+
153
+ def update_autoscaling_state(
154
+ self,
155
+ ) -> Optional[AutoscalingState]:
156
+ """
157
+ Update the autoscaling state of the cluster by reconciling the current
158
+ state of the cluster resources, the cloud providers as well as instance
159
+ update subscribers with the desired state.
160
+
161
+ Returns:
162
+ AutoscalingState: The new autoscaling state of the cluster or None if
163
+ the state is not updated.
164
+
165
+ Raises:
166
+ No exception.
167
+ """
168
+
169
+ try:
170
+ ray_stop_errors = []
171
+ while not self._ray_stop_errors_queue.empty():
172
+ ray_stop_errors.append(self._ray_stop_errors_queue.get())
173
+
174
+ ray_install_errors = []
175
+ while not self._ray_install_errors_queue.empty():
176
+ ray_install_errors.append(self._ray_install_errors_queue.get())
177
+
178
+ # Get the current state of the ray cluster resources.
179
+ ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)
180
+
181
+ # Refresh the config from the source
182
+ self._config_reader.refresh_cached_autoscaling_config()
183
+ autoscaling_config = self._config_reader.get_cached_autoscaling_config()
184
+
185
+ return Reconciler.reconcile(
186
+ instance_manager=self._instance_manager,
187
+ scheduler=self._scheduler,
188
+ cloud_provider=self._cloud_instance_provider,
189
+ ray_cluster_resource_state=ray_cluster_resource_state,
190
+ non_terminated_cloud_instances=(
191
+ self._cloud_instance_provider.get_non_terminated()
192
+ ),
193
+ cloud_provider_errors=self._cloud_instance_provider.poll_errors(),
194
+ ray_install_errors=ray_install_errors,
195
+ ray_stop_errors=ray_stop_errors,
196
+ autoscaling_config=autoscaling_config,
197
+ metrics_reporter=self._metrics_reporter,
198
+ )
199
+ except Exception as e:
200
+ logger.exception(e)
201
+ return None
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Optional
4
+
5
+ from ray._private.event.event_logger import EventLoggerAdapter
6
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
7
+ from ray.autoscaler.v2.schema import NodeType
8
+ from ray.autoscaler.v2.utils import ResourceRequestUtil
9
+ from ray.core.generated.autoscaler_pb2 import (
10
+ ClusterResourceConstraint,
11
+ GangResourceRequest,
12
+ ResourceRequest,
13
+ )
14
+ from ray.core.generated.instance_manager_pb2 import LaunchRequest, TerminationRequest
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class AutoscalerEventLogger:
20
+ """
21
+ Logs events related to the autoscaler.
22
+
23
+ # TODO:
24
+ - Add more logging for other events.
25
+ - Rate limit the events if too spammy.
26
+ """
27
+
28
+ def __init__(self, logger: EventLoggerAdapter):
29
+ self._logger = logger
30
+
31
+ def log_cluster_scheduling_update(
32
+ self,
33
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
34
+ cluster_shape: Dict[NodeType, int],
35
+ launch_requests: Optional[List[LaunchRequest]] = None,
36
+ terminate_requests: Optional[List[TerminationRequest]] = None,
37
+ infeasible_requests: Optional[List[ResourceRequest]] = None,
38
+ infeasible_gang_requests: Optional[List[GangResourceRequest]] = None,
39
+ infeasible_cluster_resource_constraints: Optional[
40
+ List[ClusterResourceConstraint]
41
+ ] = None,
42
+ ) -> None:
43
+ """
44
+ Log any update of the cluster scheduling state.
45
+ """
46
+
47
+ # Log any launch events.
48
+ if launch_requests:
49
+ launch_type_count = defaultdict(int)
50
+ for req in launch_requests:
51
+ launch_type_count[req.instance_type] += req.count
52
+
53
+ for idx, (instance_type, count) in enumerate(launch_type_count.items()):
54
+ log_str = f"Adding {count} node(s) of type {instance_type}."
55
+ self._logger.info(f"{log_str}")
56
+ logger.info(f"{log_str}")
57
+
58
+ # Log any terminate events.
59
+ if terminate_requests:
60
+ termination_by_causes_and_type = defaultdict(int)
61
+ for req in terminate_requests:
62
+ termination_by_causes_and_type[(req.cause, req.instance_type)] += 1
63
+
64
+ cause_reason_map = {
65
+ TerminationRequest.Cause.OUTDATED: "outdated",
66
+ TerminationRequest.Cause.MAX_NUM_NODES: "max number of worker nodes reached", # noqa
67
+ TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE: "max number of worker nodes per type reached", # noqa
68
+ TerminationRequest.Cause.IDLE: "idle",
69
+ }
70
+
71
+ for idx, ((cause, instance_type), count) in enumerate(
72
+ termination_by_causes_and_type.items()
73
+ ):
74
+ log_str = f"Removing {count} nodes of type {instance_type} ({cause_reason_map[cause]})." # noqa
75
+ self._logger.info(f"{log_str}")
76
+ logger.info(f"{log_str}")
77
+
78
+ # Cluster shape changes.
79
+ if launch_requests or terminate_requests:
80
+ total_resources = defaultdict(float)
81
+
82
+ for node_type, count in cluster_shape.items():
83
+ node_config = node_type_configs[node_type]
84
+ for resource_name, resource_quantity in node_config.resources.items():
85
+ total_resources[resource_name] += resource_quantity * count
86
+
87
+ num_cpus = total_resources.get("CPU", 0)
88
+ log_str = f"Resized to {int(num_cpus)} CPUs"
89
+
90
+ if "GPU" in total_resources:
91
+ log_str += f", {int(total_resources['GPU'])} GPUs"
92
+ if "TPU" in total_resources:
93
+ log_str += f", {int(total_resources['TPU'])} TPUs"
94
+
95
+ self._logger.info(f"{log_str}.")
96
+ self._logger.debug(f"Current cluster shape: {dict(cluster_shape)}.")
97
+
98
+ # Log any infeasible requests.
99
+ if infeasible_requests:
100
+ requests_by_count = ResourceRequestUtil.group_by_count(infeasible_requests)
101
+ log_str = "No available node types can fulfill resource requests "
102
+ for idx, req_count in enumerate(requests_by_count):
103
+ resource_map = ResourceRequestUtil.to_resource_map(req_count.request)
104
+ log_str += f"{resource_map}*{req_count.count}"
105
+ if idx < len(requests_by_count) - 1:
106
+ log_str += ", "
107
+
108
+ log_str += (
109
+ ". Add suitable node types to this cluster to resolve this issue."
110
+ )
111
+ self._logger.warning(log_str)
112
+
113
+ if infeasible_gang_requests:
114
+ # Log for each placement group requests.
115
+ for gang_request in infeasible_gang_requests:
116
+ log_str = (
117
+ "No available node types can fulfill "
118
+ "placement group requests (detail={details}): ".format(
119
+ details=gang_request.details
120
+ )
121
+ )
122
+ requests_by_count = ResourceRequestUtil.group_by_count(
123
+ gang_request.requests
124
+ )
125
+ for idx, req_count in enumerate(requests_by_count):
126
+ resource_map = ResourceRequestUtil.to_resource_map(
127
+ req_count.request
128
+ )
129
+ log_str += f"{resource_map}*{req_count.count}"
130
+ if idx < len(requests_by_count) - 1:
131
+ log_str += ", "
132
+
133
+ log_str += (
134
+ ". Add suitable node types to this cluster to resolve this issue."
135
+ )
136
+ self._logger.warning(log_str)
137
+
138
+ if infeasible_cluster_resource_constraints:
139
+ # We will only have max 1 cluster resource constraint for now since it's
140
+ # from `request_resources()` sdk, where the most recent call would override
141
+ # the previous one.
142
+ for infeasible_constraint in infeasible_cluster_resource_constraints:
143
+ log_str = "No available node types can fulfill cluster constraint: "
144
+ for i, requests_by_count in enumerate(
145
+ infeasible_constraint.resource_requests
146
+ ):
147
+ resource_map = ResourceRequestUtil.to_resource_map(
148
+ requests_by_count.request
149
+ )
150
+ log_str += f"{resource_map}*{requests_by_count.count}"
151
+ if i < len(infeasible_constraint.resource_requests) - 1:
152
+ log_str += ", "
153
+
154
+ log_str += (
155
+ ". Add suitable node types to this cluster to resolve this issue."
156
+ )
157
+ self._logger.warning(log_str)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import uuid
3
+ from typing import Dict, List, Optional, Set
4
+
5
+ from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
6
+
7
+
8
+ class InstanceUtil:
9
+ """
10
+ A helper class to group updates and operations on an Instance object defined
11
+ in instance_manager.proto
12
+ """
13
+
14
+ # Memoized reachable from sets, where the key is the instance status, and
15
+ # the value is the set of instance status that is reachable from the key
16
+ # instance status.
17
+ _reachable_from: Optional[
18
+ Dict["Instance.InstanceStatus", Set["Instance.InstanceStatus"]]
19
+ ] = None
20
+
21
+ @staticmethod
22
+ def new_instance(
23
+ instance_id: str,
24
+ instance_type: str,
25
+ status: Instance.InstanceStatus,
26
+ details: str = "",
27
+ ) -> Instance:
28
+ """
29
+ Returns a new instance with the given status.
30
+
31
+ Args:
32
+ instance_id: The instance id.
33
+ instance_type: The instance type.
34
+ status: The status of the new instance.
35
+ details: The details of the status transition.
36
+ """
37
+ instance = Instance()
38
+ instance.version = 0 # it will be populated by the underlying storage.
39
+ instance.instance_id = instance_id
40
+ instance.instance_type = instance_type
41
+ instance.status = status
42
+ InstanceUtil._record_status_transition(instance, status, details)
43
+ return instance
44
+
45
+ @staticmethod
46
+ def random_instance_id() -> str:
47
+ """
48
+ Returns a random instance id.
49
+ """
50
+ return str(uuid.uuid4())
51
+
52
+ @staticmethod
53
+ def is_cloud_instance_allocated(instance_status: Instance.InstanceStatus) -> bool:
54
+ """
55
+ Returns True if the instance is in a status where there could exist
56
+ a cloud instance allocated by the cloud provider.
57
+ """
58
+ assert instance_status != Instance.UNKNOWN
59
+ return instance_status in {
60
+ Instance.ALLOCATED,
61
+ Instance.RAY_INSTALLING,
62
+ Instance.RAY_RUNNING,
63
+ Instance.RAY_STOPPING,
64
+ Instance.RAY_STOP_REQUESTED,
65
+ Instance.RAY_STOPPED,
66
+ Instance.TERMINATING,
67
+ Instance.RAY_INSTALL_FAILED,
68
+ Instance.TERMINATION_FAILED,
69
+ }
70
+
71
+ @staticmethod
72
+ def is_ray_running(instance_status: Instance.InstanceStatus) -> bool:
73
+ """
74
+ Returns True if the instance is in a status where the ray process is
75
+ running on the cloud instance.
76
+ i.e. RAY_RUNNING, RAY_STOP_REQUESTED, RAY_STOPPING
77
+ """
78
+ assert instance_status != Instance.UNKNOWN
79
+
80
+ if instance_status in InstanceUtil.get_reachable_statuses(
81
+ Instance.RAY_STOPPING
82
+ ):
83
+ return False
84
+
85
+ if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
86
+ return True
87
+
88
+ return False
89
+
90
+ @staticmethod
91
+ def is_ray_pending(instance_status: Instance.InstanceStatus) -> bool:
92
+ """
93
+ Returns True if the instance is in a status where the ray process is
94
+ pending to be started on the cloud instance.
95
+
96
+ """
97
+ assert instance_status != Instance.UNKNOWN
98
+ # Not gonna be in a RAY_RUNNING status.
99
+ if Instance.RAY_RUNNING not in InstanceUtil.get_reachable_statuses(
100
+ instance_status
101
+ ):
102
+ return False
103
+
104
+ # Already running ray.
105
+ if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
106
+ return False
107
+
108
+ return True
109
+
110
+ def is_ray_running_reachable(instance_status: Instance.InstanceStatus) -> bool:
111
+ """
112
+ Returns True if the instance is in a status where it may transition
113
+ to RAY_RUNNING status.
114
+ """
115
+ return Instance.RAY_RUNNING in InstanceUtil.get_reachable_statuses(
116
+ instance_status
117
+ )
118
+
119
+ @staticmethod
120
+ def set_status(
121
+ instance: Instance,
122
+ new_instance_status: Instance.InstanceStatus,
123
+ details: str = "",
124
+ ) -> bool:
125
+ """Transitions the instance to the new state.
126
+
127
+ Args:
128
+ instance: The instance to update.
129
+ new_instance_status: The new status to transition to.
130
+ details: The details of the transition.
131
+
132
+ Returns:
133
+ True if the status transition is successful, False otherwise.
134
+ """
135
+ if (
136
+ new_instance_status
137
+ not in InstanceUtil.get_valid_transitions()[instance.status]
138
+ ):
139
+ return False
140
+ instance.status = new_instance_status
141
+ InstanceUtil._record_status_transition(instance, new_instance_status, details)
142
+ return True
143
+
144
+ @staticmethod
145
+ def _record_status_transition(
146
+ instance: Instance, status: Instance.InstanceStatus, details: str
147
+ ):
148
+ """Records the status transition.
149
+
150
+ Args:
151
+ instance: The instance to update.
152
+ status: The new status to transition to.
153
+ """
154
+ now_ns = time.time_ns()
155
+ instance.status_history.append(
156
+ Instance.StatusHistory(
157
+ instance_status=status,
158
+ timestamp_ns=now_ns,
159
+ details=details,
160
+ )
161
+ )
162
+
163
+ @staticmethod
164
+ def has_timeout(instance: Instance, timeout_s: int) -> bool:
165
+ """
166
+ Returns True if the instance has been in the current status for more
167
+ than the timeout_seconds.
168
+
169
+ Args:
170
+ instance: The instance to check.
171
+ timeout_seconds: The timeout in seconds.
172
+
173
+ Returns:
174
+ True if the instance has been in the current status for more than
175
+ the timeout_s seconds.
176
+ """
177
+ cur_status = instance.status
178
+
179
+ status_times_ns = InstanceUtil.get_status_transition_times_ns(
180
+ instance, select_instance_status=cur_status
181
+ )
182
+ assert len(status_times_ns) >= 1, (
183
+ f"instance {instance.instance_id} has {len(status_times_ns)} "
184
+ f"{Instance.InstanceStatus.Name(cur_status)} status"
185
+ )
186
+ status_time_ns = sorted(status_times_ns)[-1]
187
+ if time.time_ns() - status_time_ns <= (timeout_s * 1e9):
188
+ return False
189
+
190
+ return True
191
+
192
    @staticmethod
    def get_valid_transitions() -> Dict[
        "Instance.InstanceStatus", Set["Instance.InstanceStatus"]
    ]:
        """Return the instance status transition graph.

        Maps each status to the set of statuses it may transition to
        directly. Statuses mapped to an empty set are terminal.
        """
        return {
            # This is the initial status of a new instance.
            Instance.QUEUED: {
                # Cloud provider requested to launch a node for the instance.
                # This happens when a launch request is made to the node provider.
                Instance.REQUESTED,
            },
            # When in this status, a launch request to the node provider is made.
            Instance.REQUESTED: {
                # Cloud provider allocated a cloud instance for the instance.
                # This happens when the cloud instance first appears in the list of
                # running cloud instances from the cloud instance provider.
                Instance.ALLOCATED,
                # Retry the allocation, become queueing again.
                Instance.QUEUED,
                # Cloud provider fails to allocate one. Either as a timeout or
                # the launch request fails immediately.
                Instance.ALLOCATION_FAILED,
            },
            # When in this status, the cloud instance is allocated and running. This
            # happens when the cloud instance is present in node provider's list of
            # running cloud instances.
            Instance.ALLOCATED: {
                # Ray needs to be installed and launched on the provisioned cloud
                # instance. This happens when the cloud instance is allocated, and
                # the autoscaler is responsible for installing and launching ray on
                # the cloud instance. For node providers that manage the ray
                # installation and launching, this state is skipped.
                Instance.RAY_INSTALLING,
                # Ray is already installed on the provisioned cloud
                # instance. It could be any valid ray status.
                Instance.RAY_RUNNING,
                Instance.RAY_STOPPING,
                Instance.RAY_STOPPED,
                # Instance is requested to be stopped, e.g. instance leaked: no
                # matching Instance with the same type is found in the
                # autoscaler's state.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray process is being installed and started on the cloud instance.
            # This status is skipped for node providers that manage the ray
            # installation and launching. (e.g. Ray-on-Spark)
            Instance.RAY_INSTALLING: {
                # Ray installed and launched successfully, reported by the ray
                # cluster. Similar to the Instance.ALLOCATED ->
                # Instance.RAY_RUNNING transition, where the ray process is
                # managed by the node provider.
                Instance.RAY_RUNNING,
                # Ray installation failed. This happens when the ray process
                # failed to be installed and started on the cloud instance.
                Instance.RAY_INSTALL_FAILED,
                # When the ray node is reported as stopped by the ray cluster.
                # This could happen when the ray process was stopped so quickly
                # after start that a ray running node wasn't discovered and the
                # RAY_RUNNING transition was skipped.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed during the installation process.
                Instance.TERMINATED,
            },
            # Ray process is installed and running on the cloud instance. When in
            # this status, a ray node must be present in the ray cluster.
            Instance.RAY_RUNNING: {
                # Ray is requested to be stopped.
                Instance.RAY_STOP_REQUESTED,
                # Ray is stopping (currently draining),
                # e.g. idle termination.
                Instance.RAY_STOPPING,
                # Ray is already stopped, as reported by the ray cluster.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray process should be stopped on the cloud instance. The RayStopper
            # subscriber will listen to this status and stop the ray process.
            Instance.RAY_STOP_REQUESTED: {
                # Ray is stopping on the cloud instance.
                Instance.RAY_STOPPING,
                # Ray stopped already.
                Instance.RAY_STOPPED,
                # Ray stop request failed (e.g. idle node no longer idle),
                # ray is still running.
                Instance.RAY_RUNNING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the ray process is requested to be stopped to
            # the ray cluster, but not yet present in the dead ray node list
            # reported by the ray cluster.
            Instance.RAY_STOPPING: {
                # Ray is stopped, and the ray node is present in the dead ray node
                # list reported by the ray cluster.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the ray process is stopped, and the ray node is
            # present in the dead ray node list reported by the ray cluster.
            Instance.RAY_STOPPED: {
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the cloud instance is requested to be stopped to
            # the node provider.
            Instance.TERMINATING: {
                # When a cloud instance no longer appears in the list of running
                # cloud instances from the node provider.
                Instance.TERMINATED,
                # When the cloud instance failed to be terminated.
                Instance.TERMINATION_FAILED,
            },
            # When in this status, the cloud instance failed to be terminated by
            # the node provider. We will keep retrying.
            Instance.TERMINATION_FAILED: {
                # Retry the termination, become terminating again.
                Instance.TERMINATING,
            },
            # Whenever a cloud instance disappears from the list of running cloud
            # instances from the node provider, the instance is marked as stopped.
            # Since we guarantee 1:1 mapping of an Instance to a cloud instance,
            # this is a terminal state.
            Instance.TERMINATED: set(),  # Terminal state.
            # When in this status, the cloud instance failed to be allocated by
            # the node provider.
            Instance.ALLOCATION_FAILED: set(),  # Terminal state.
            Instance.RAY_INSTALL_FAILED: {
                # Autoscaler requests to shutdown the instance when ray install
                # failed.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Initial state before the instance is created. Should never be used.
            Instance.UNKNOWN: set(),
        }
341
+
342
+ @staticmethod
343
+ def get_status_transitions(
344
+ instance: Instance,
345
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
346
+ ) -> List["Instance.StatusHistory"]:
347
+ """
348
+ Returns the status history of the instance.
349
+
350
+ Args:
351
+ instance: The instance.
352
+ select_instance_status: The go-to status to search for, i.e. select
353
+ only status history when the instance transitions into the status.
354
+ If None, returns all status updates.
355
+ """
356
+ history = []
357
+ for status_update in instance.status_history:
358
+ if (
359
+ select_instance_status
360
+ and status_update.instance_status != select_instance_status
361
+ ):
362
+ continue
363
+ history.append(status_update)
364
+ return history
365
+
366
+ @staticmethod
367
+ def get_last_status_transition(
368
+ instance: Instance,
369
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
370
+ ) -> Optional["Instance.StatusHistory"]:
371
+ """
372
+ Returns the last status transition of the instance.
373
+
374
+ Args:
375
+ instance: The instance.
376
+ instance_status: The status to search for. If None, returns the last
377
+ status update.
378
+ """
379
+ history = InstanceUtil.get_status_transitions(instance, select_instance_status)
380
+ history.sort(key=lambda x: x.timestamp_ns)
381
+ if history:
382
+ return history[-1]
383
+ return None
384
+
385
+ @staticmethod
386
+ def get_status_transition_times_ns(
387
+ instance: Instance,
388
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
389
+ ) -> List[int]:
390
+ """
391
+ Returns a list of timestamps of the instance status update.
392
+
393
+ Args:
394
+ instance: The instance.
395
+ instance_status: The status to search for. If None, returns all
396
+ status updates timestamps.
397
+
398
+ Returns:
399
+ The list of timestamps of the instance status updates.
400
+ """
401
+ return [
402
+ e.timestamp_ns
403
+ for e in InstanceUtil.get_status_transitions(
404
+ instance, select_instance_status
405
+ )
406
+ ]
407
+
408
    @classmethod
    def get_reachable_statuses(
        cls,
        instance_status: Instance.InstanceStatus,
    ) -> Set["Instance.InstanceStatus"]:
        """
        Returns the set of instance statuses reachable from the given status
        by following one or more valid transitions.

        The reachability map is computed once from get_valid_transitions()
        and memoized on the class. Note that a status is a member of its own
        reachable set only when some transition cycle leads back to it.

        Args:
            instance_status: The instance status to start from.

        Returns:
            The set of statuses reachable from `instance_status`.
        """
        # Lazily compute and memoize the class-level reachability map.
        if cls._reachable_from is None:
            cls._compute_reachable()
        return cls._reachable_from[instance_status]
426
+
427
+ @staticmethod
428
+ def get_log_str_for_update(instance: Instance, update: InstanceUpdateEvent) -> str:
429
+ """Returns a log string for the given instance update."""
430
+ if update.upsert:
431
+ return (
432
+ f"New instance "
433
+ f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
434
+ f"{instance.instance_id}, type={instance.instance_type}, "
435
+ f"cloud_instance_id={instance.cloud_instance_id}, "
436
+ f"ray_id={instance.node_id}): {update.details}"
437
+ )
438
+ return (
439
+ f"Update instance "
440
+ f"{Instance.InstanceStatus.Name(instance.status)}->"
441
+ f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
442
+ f"{instance.instance_id}, type={instance.instance_type}, "
443
+ f"cloud_instance_id={instance.cloud_instance_id}, "
444
+ f"ray_id={instance.node_id}): {update.details}"
445
+ )
446
+
447
+ @classmethod
448
+ def _compute_reachable(cls):
449
+ """
450
+ Computes and memorize the from status sets for each status machine with
451
+ a DFS search.
452
+ """
453
+ valid_transitions = cls.get_valid_transitions()
454
+
455
+ def dfs(graph, start, visited):
456
+ """
457
+ Regular DFS algorithm to find all reachable nodes from a given node.
458
+ """
459
+ for next_node in graph[start]:
460
+ if next_node not in visited:
461
+ # We delay adding the visited set here so we could capture
462
+ # the self loop.
463
+ visited.add(next_node)
464
+ dfs(graph, next_node, visited)
465
+ return visited
466
+
467
+ # Initialize the graphs
468
+ cls._reachable_from = {}
469
+ for status in Instance.InstanceStatus.values():
470
+ # All nodes reachable from 'start'
471
+ visited = set()
472
+ cls._reachable_from[status] = dfs(valid_transitions, status, visited)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from enum import Enum
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import yaml
10
+
11
+ from ray._private.ray_constants import env_integer
12
+ from ray._private.utils import binary_to_hex
13
+ from ray._raylet import GcsClient
14
+ from ray.autoscaler._private.constants import (
15
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
16
+ DEFAULT_UPSCALING_SPEED,
17
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
18
+ DISABLE_NODE_UPDATERS_KEY,
19
+ )
20
+ from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
21
+ from ray.autoscaler._private.monitor import BASE_READONLY_CONFIG
22
+ from ray.autoscaler._private.util import (
23
+ format_readonly_node_type,
24
+ hash_launch_conf,
25
+ hash_runtime_conf,
26
+ prepare_config,
27
+ validate_config,
28
+ )
29
+ from ray.autoscaler.v2.schema import NodeType
30
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
31
+ from ray.autoscaler.v2.utils import is_head_node
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class Provider(Enum):
    """Node providers recognized by the autoscaler v2 config layer.

    Parsed from the config's `provider.type` string (see
    AutoscalingConfig.provider); unrecognized strings map to UNKNOWN.
    """

    UNKNOWN = 0
    ALIYUN = 1
    AWS = 2
    AZURE = 3
    GCP = 4
    KUBERAY = 5
    LOCAL = 6
    READ_ONLY = 7
45
+
46
+
47
class IConfigReader(ABC):
    """Interface for reading the autoscaling config.

    Concrete readers pull the config from different sources — a YAML file,
    an in-memory dict, or a remote config service (e.g. KubeRay) — and cache
    the most recently read value.

    Example:
        reader = FileConfigReader("path/to/config.yaml")
        # Get the recently cached config.
        config = reader.get_cached_autoscaling_config()

        ...
        # Refresh the cached config.
        reader.refresh_cached_autoscaling_config()
        config = reader.get_cached_autoscaling_config()
    """

    @abstractmethod
    def get_cached_autoscaling_config(self) -> "AutoscalingConfig":
        """Return the most recently read autoscaling config."""
        pass

    @abstractmethod
    def refresh_cached_autoscaling_config(self):
        """Re-read the config from the underlying source."""
        pass
80
+
81
+
82
@dataclass(frozen=True)
class InstanceReconcileConfig:
    """Timeouts and retry limits used when reconciling instance states.

    Each value can be overridden via the environment variable named in the
    corresponding `env_integer` call; the second argument is the default.
    """

    # Max seconds a REQUESTED instance may wait to become ALLOCATED.
    request_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_REQUEST_STATUS_TIMEOUT_S", 10 * 60
    )
    # Max seconds an ALLOCATED instance may wait to become RAY_RUNNING.
    allocate_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_ALLOCATE_STATUS_TIMEOUT_S", 300
    )
    # Max seconds a RAY_INSTALLING instance may wait to become RAY_RUNNING.
    ray_install_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_RAY_INSTALL_STATUS_TIMEOUT_S", 30 * 60
    )
    # Max seconds a TERMINATING instance may wait to become TERMINATED.
    terminating_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_TERMINATING_STATUS_TIMEOUT_S", 300
    )
    # Max seconds a RAY_STOP_REQUESTED instance may wait to become
    # RAY_STOPPING or RAY_STOPPED.
    ray_stop_requested_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_RAY_STOP_REQUESTED_STATUS_TIMEOUT_S", 300
    )
    # Interval (seconds) at which to warn about an instance that has been
    # stuck in a transient status for too long.
    transient_status_warn_interval_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_TRANSIENT_STATUS_WARN_INTERVAL_S", 90
    )
    # Max number of retries when requesting allocation of an instance.
    max_num_retry_request_to_allocate: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_MAX_NUM_RETRY_REQUEST_TO_ALLOCATE", 3
    )
114
+
115
+
116
@dataclass
class NodeTypeConfig:
    """Per-node-type autoscaling settings.

    Maps to a subset of the `available_node_types` entry for the node type
    in the autoscaling config.
    """

    # Node type name.
    name: NodeType
    # The minimal number of worker nodes to be launched for this node type.
    min_worker_nodes: int
    # The maximal number of worker nodes that can be launched for this type.
    max_worker_nodes: int
    # Idle timeout seconds for worker nodes of this node type.
    idle_timeout_s: Optional[float] = None
    # The total resources on the node.
    resources: Dict[str, float] = field(default_factory=dict)
    # The labels on the node.
    labels: Dict[str, str] = field(default_factory=dict)
    # The node config's launch config hash. It's calculated from the auth
    # config and the node's config in the `AutoscalingConfig` for the node
    # type when launching the node. It's used to detect config changes.
    launch_config_hash: str = ""

    def __post_init__(self):
        # Validate with explicit raises rather than `assert`: asserts are
        # stripped under `python -O` and would silently accept inconsistent
        # worker counts.
        if self.min_worker_nodes < 0:
            raise ValueError(
                f"min_worker_nodes must be >= 0, got {self.min_worker_nodes}"
            )
        if self.min_worker_nodes > self.max_worker_nodes:
            raise ValueError(
                f"min_worker_nodes ({self.min_worker_nodes}) must be <= "
                f"max_worker_nodes ({self.max_worker_nodes})"
            )
144
+
145
+
146
class AutoscalingConfig:
    """
    Helper around the raw autoscaling config dict.

    Validates the config on construction and exposes typed accessors for
    node-type, docker, command and provider settings.

    # TODO(rickyx):
    1. Move the config validation logic here.
    2. Deprecate the ray-schema.json for validation because it's
    static thus not possible to validate the config with interdependency
    of each other.
    """

    def __init__(
        self,
        configs: Dict[str, Any],
        skip_content_hash: bool = False,
    ) -> None:
        """
        Args:
            configs: The raw configs dict.
            skip_content_hash: Whether to skip file mounts/ray command
                hash calculation.
        """
        self._sync_continuously = False
        # Pre-initialize so the hash properties return None instead of
        # raising AttributeError when hashing is skipped.
        self._runtime_hash: Optional[str] = None
        self._file_mounts_contents_hash: Optional[str] = None
        self.update_configs(configs, skip_content_hash)

    def update_configs(self, configs: Dict[str, Any], skip_content_hash: bool) -> None:
        """Validate and store `configs`, optionally recomputing content hashes."""
        self._configs = prepare_config(configs)
        validate_config(self._configs)
        if skip_content_hash:
            return
        self._calculate_hashes()
        self._sync_continuously = self._configs.get(
            "generate_file_mounts_contents_hash", True
        )

    def _calculate_hashes(self) -> None:
        """Compute the runtime hash and the file-mounts contents hash."""
        logger.info("Calculating hashes for file mounts and ray commands.")
        self._runtime_hash, self._file_mounts_contents_hash = hash_runtime_conf(
            self._configs.get("file_mounts", {}),
            self._configs.get("cluster_synced_files", []),
            [
                self._configs.get("worker_setup_commands", []),
                self._configs.get("worker_start_ray_commands", []),
            ],
            generate_file_mounts_contents_hash=self._configs.get(
                "generate_file_mounts_contents_hash", True
            ),
        )

    def get_cloud_node_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
        """Return a deep copy of the node type's provider `node_config`."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "node_config") or {}
        )

    def get_docker_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
        """
        Return the docker config for the specified node type.
        If it's a head node, the image will be chosen in the following order:
        1. Node specific docker image.
        2. The 'docker' config's 'head_image' field.
        3. The 'docker' config's 'image' field.
        If it's a worker node, the image will be chosen in the following order:
        1. Node specific docker image.
        2. The 'docker' config's 'worker_image' field.
        3. The 'docker' config's 'image' field.
        """
        # TODO(rickyx): It's unfortunate we have multiple fields in ray-schema.json
        # that can specify docker images. We should consolidate them.
        docker_config = copy.deepcopy(self._configs.get("docker", {}))
        node_specific_docker_config = self._configs["available_node_types"][
            ray_node_type
        ].get("docker", {})
        # Override the global docker config with node specific docker config.
        docker_config.update(node_specific_docker_config)

        if self._configs.get("head_node_type") == ray_node_type:
            if "head_image" in docker_config:
                logger.info(
                    # .get(): "image" may be absent when only head_image is
                    # configured; indexing would raise KeyError here.
                    "Overwriting image={} by head_image({}) for head node docker.".format(  # noqa: E501
                        docker_config.get("image"), docker_config["head_image"]
                    )
                )
                docker_config["image"] = docker_config["head_image"]
        else:
            if "worker_image" in docker_config:
                logger.info(
                    "Overwriting image={} by worker_image({}) for worker node docker.".format(  # noqa: E501
                        docker_config.get("image"), docker_config["worker_image"]
                    )
                )
                docker_config["image"] = docker_config["worker_image"]

        # The role-specific fields have been folded into "image" above; drop
        # them from the returned config.
        docker_config.pop("head_image", None)
        docker_config.pop("worker_image", None)
        return docker_config

    def get_worker_start_ray_commands(self) -> List[str]:
        """Return the commands that start ray on worker nodes."""
        return self._configs.get("worker_start_ray_commands", [])

    def get_head_setup_commands(self) -> List[str]:
        """Return the setup commands for the head node."""
        return self._configs.get("head_setup_commands", [])

    def get_head_start_ray_commands(self) -> List[str]:
        """Return the commands that start ray on the head node."""
        return self._configs.get("head_start_ray_commands", [])

    def get_worker_setup_commands(self, ray_node_type: NodeType) -> List[str]:
        """
        Return the worker setup commands for the specified node type.

        If the node type specific worker setup commands are not specified,
        return the global worker setup commands.
        """
        worker_setup_command = self.get_node_type_specific_config(
            ray_node_type, "worker_setup_commands"
        )
        if worker_setup_command is None:
            # Return global worker setup commands if node type specific
            # worker setup commands are not specified.
            logger.info(
                "Using global worker setup commands for {}".format(ray_node_type)
            )
            return self._configs.get("worker_setup_commands", [])
        return worker_setup_command

    def get_initialization_commands(self, ray_node_type: NodeType) -> List[str]:
        """
        Return the initialization commands for the specified node type.

        If the node type specific initialization commands are not specified,
        return the global initialization commands.
        """
        initialization_command = self.get_node_type_specific_config(
            ray_node_type, "initialization_commands"
        )
        if initialization_command is None:
            logger.info(
                "Using global initialization commands for {}".format(ray_node_type)
            )
            return self._configs.get("initialization_commands", [])
        return initialization_command

    def get_node_type_specific_config(
        self, ray_node_type: NodeType, config_name: str
    ) -> Optional[Any]:
        """Return a field from the node type's entry, or None if absent."""
        node_specific_config = self._configs["available_node_types"].get(
            ray_node_type, {}
        )
        return node_specific_config.get(config_name, None)

    def get_node_resources(self, ray_node_type: NodeType) -> Dict[str, float]:
        """Return a deep copy of the node type's declared resources."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "resources") or {}
        )

    def get_node_labels(self, ray_node_type: NodeType) -> Dict[str, str]:
        """Return a deep copy of the node type's declared labels."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "labels") or {}
        )

    def get_config(self, config_name, default=None) -> Any:
        """Return a top-level config field, or `default` if absent."""
        return self._configs.get(config_name, default)

    def get_provider_instance_type(self, ray_node_type: NodeType) -> str:
        """Return the cloud-provider instance type for the node type.

        Providers without a cloud instance type yield "".

        Raises:
            ValueError: If the provider is not handled here (including
                READ_ONLY; presumably this is never called for it —
                TODO confirm).
        """
        provider = self.provider
        node_config = self.get_node_type_specific_config(ray_node_type, "node_config")
        if provider in [Provider.AWS, Provider.ALIYUN]:
            return node_config.get("InstanceType", "")
        elif provider == Provider.AZURE:
            return node_config.get("azure_arm_parameters", {}).get("vmSize", "")
        elif provider == Provider.GCP:
            return node_config.get("machineType", "")
        elif provider in [Provider.KUBERAY, Provider.LOCAL, Provider.UNKNOWN]:
            return ""
        else:
            raise ValueError(f"Unknown provider {provider}")

    def get_node_type_configs(self) -> Optional[Dict[NodeType, NodeTypeConfig]]:
        """
        Returns the node type configs from the `available_node_types` field.

        Returns:
            The node type configs keyed by node type, or None when the config
            has no `available_node_types`.
        """
        available_node_types = self._configs.get("available_node_types", {})
        if not available_node_types:
            return None
        node_type_configs = {}
        auth_config = self._configs.get("auth", {})
        head_node_type = self.get_head_node_type()
        assert head_node_type
        for node_type, node_config in available_node_types.items():
            launch_config_hash = hash_launch_conf(
                node_config.get("node_config", {}), auth_config
            )
            max_workers_nodes = node_config.get("max_workers", 0)
            if head_node_type == node_type:
                # The head node itself counts toward this type's cap.
                max_workers_nodes += 1

            node_type_configs[node_type] = NodeTypeConfig(
                name=node_type,
                min_worker_nodes=node_config.get("min_workers", 0),
                max_worker_nodes=max_workers_nodes,
                idle_timeout_s=node_config.get("idle_timeout_s", None),
                resources=node_config.get("resources", {}),
                labels=node_config.get("labels", {}),
                launch_config_hash=launch_config_hash,
            )
        return node_type_configs

    def get_head_node_type(self) -> NodeType:
        """
        Returns the head node type.

        If there is only one node type, return the only node type as the head
        node type. If there are multiple node types, return the head node
        type specified in the config.
        """
        available_node_types = self._configs.get("available_node_types", {})
        if len(available_node_types) == 1:
            return list(available_node_types.keys())[0]
        return self._configs.get("head_node_type")

    def get_max_num_worker_nodes(self) -> Optional[int]:
        """Return the cluster-wide worker cap, or None if unset."""
        return self.get_config("max_workers", None)

    def get_max_num_nodes(self) -> Optional[int]:
        """Return the cluster-wide node cap including the head, or None."""
        max_num_workers = self.get_max_num_worker_nodes()
        if max_num_workers is not None:
            return max_num_workers + 1  # For head node
        return None

    def get_raw_config_mutable(self) -> Dict[str, Any]:
        """Return the underlying config dict (mutations affect this object)."""
        return self._configs

    def get_upscaling_speed(self) -> float:
        """Return the configured upscaling speed, or the default."""
        return self.get_config("upscaling_speed", DEFAULT_UPSCALING_SPEED)

    def get_max_concurrent_launches(self) -> int:
        """Return the max number of concurrent node launches."""
        return AUTOSCALER_MAX_CONCURRENT_LAUNCHES

    def disable_node_updaters(self) -> bool:
        """Return whether node updaters are disabled (default True in v2)."""
        provider_config = self._configs.get("provider", {})
        return provider_config.get(DISABLE_NODE_UPDATERS_KEY, True)

    def get_idle_timeout_s(self) -> Optional[float]:
        """
        Returns the idle timeout in seconds if present in config, otherwise
        None.
        """
        # The config field is expressed in minutes; convert to seconds.
        idle_timeout_minutes = self.get_config("idle_timeout_minutes", None)
        return idle_timeout_minutes * 60 if idle_timeout_minutes is not None else None

    def disable_launch_config_check(self) -> bool:
        """Return whether launch-config drift checking is disabled."""
        provider_config = self.get_provider_config()
        return provider_config.get(DISABLE_LAUNCH_CONFIG_CHECK_KEY, True)

    def get_instance_reconcile_config(self) -> InstanceReconcileConfig:
        """Return the instance reconciler timeouts/retry limits."""
        # TODO(rickyx): we need a way to customize these configs,
        # either extending the current ray-schema.json, or just use another
        # schema validation path.
        return InstanceReconcileConfig()

    def get_provider_config(self) -> Dict[str, Any]:
        """Return the raw `provider` section of the config."""
        return self._configs.get("provider", {})

    def dump(self) -> str:
        """Serialize the config to a YAML string."""
        return yaml.safe_dump(self._configs)

    @property
    def provider(self) -> Provider:
        """The Provider enum parsed from `provider.type` (UNKNOWN if unrecognized)."""
        provider_str = self._configs.get("provider", {}).get("type", "")
        if provider_str == "local":
            return Provider.LOCAL
        elif provider_str == "aws":
            return Provider.AWS
        elif provider_str == "azure":
            return Provider.AZURE
        elif provider_str == "gcp":
            return Provider.GCP
        elif provider_str == "aliyun":
            return Provider.ALIYUN
        elif provider_str == "kuberay":
            return Provider.KUBERAY
        elif provider_str == "readonly":
            return Provider.READ_ONLY
        else:
            return Provider.UNKNOWN

    @property
    def runtime_hash(self) -> Optional[str]:
        """Hash of setup/start commands; None when hashing was skipped."""
        return self._runtime_hash

    @property
    def file_mounts_contents_hash(self) -> Optional[str]:
        """Hash of file-mount contents; None when hashing was skipped."""
        return self._file_mounts_contents_hash
443
+
444
+
445
class FileConfigReader(IConfigReader):
    """Reads the autoscaling config from a YAML file on disk."""

    def __init__(self, config_file: str, skip_content_hash: bool = True) -> None:
        """
        Args:
            config_file: Path to the YAML config file.
            skip_content_hash: Whether to skip file mounts/ray command hash
                calculation. Defaults to True.
        """
        self._config_file_path = Path(config_file).resolve()
        self._skip_content_hash = skip_content_hash
        self._cached_config = self._load()

    def _load(self) -> AutoscalingConfig:
        # Parse the file fresh from disk on every call.
        raw = yaml.safe_load(self._config_file_path.read_text())
        return AutoscalingConfig(raw, skip_content_hash=self._skip_content_hash)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Return the most recently loaded config without touching disk."""
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """Reload the config from the file."""
        self._cached_config = self._load()
474
+
475
+
476
class KubeRayConfigReader(IConfigReader):
    """Derives the autoscaling config from a K8s RayCluster custom resource."""

    def __init__(self, config_producer: AutoscalingConfigProducer):
        self._config_producer = config_producer
        self._cached_config = self._produce()

    def _produce(self) -> AutoscalingConfig:
        # The producer queries the K8s API server, so each call observes the
        # latest RayCluster CR.
        return AutoscalingConfig(self._config_producer())

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Return the most recently produced config."""
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """Re-derive the config from the RayCluster CR via the K8s API server."""
        self._cached_config = self._produce()
500
+
501
+
502
class ReadOnlyProviderConfigReader(IConfigReader):
    """A class that reads cluster config for a read-only provider.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users. Node types are
    derived from the nodes reported by GCS."""

    def __init__(self, gcs_address: str):
        # Deep-copy the template: refresh_cached_autoscaling_config() mutates
        # nested dicts (updates "available_node_types", pops keys), and the
        # shared module-level BASE_READONLY_CONFIG must stay pristine.
        self._configs = copy.deepcopy(BASE_READONLY_CONFIG)
        self._gcs_client = GcsClient(address=gcs_address)

    def refresh_cached_autoscaling_config(self) -> None:
        """Rebuild the node-type configs from the cluster state in GCS."""
        # Update the config with node types from GCS.
        ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)

        # Format each node type's config from the running nodes.
        available_node_types = {}

        head_node_type = None
        for node_state in ray_cluster_resource_state.node_states:
            # One synthetic node type per running node.
            node_type = format_readonly_node_type(binary_to_hex(node_state.node_id))
            if is_head_node(node_state):
                head_node_type = node_type

            available_node_types[node_type] = {
                "resources": dict(node_state.total_resources),
                "min_workers": 0,
                "max_workers": 0 if is_head_node(node_state) else 1,
                "node_config": {},
            }
        if available_node_types:
            self._configs["available_node_types"].update(available_node_types)
            self._configs["max_workers"] = len(available_node_types)
            assert head_node_type, "Head node type should be found."
            self._configs["head_node_type"] = head_node_type

        # Don't idle-terminate nodes in read-only mode.
        self._configs.pop("idle_timeout_minutes", None)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Build an AutoscalingConfig from the current (refreshed) dict."""
        return AutoscalingConfig(self._configs, skip_content_hash=True)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Optional
4
+
5
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
6
+ from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
7
+ from ray.core.generated.instance_manager_pb2 import (
8
+ GetInstanceManagerStateReply,
9
+ GetInstanceManagerStateRequest,
10
+ Instance,
11
+ InstanceUpdateEvent,
12
+ NodeKind,
13
+ StatusCode,
14
+ UpdateInstanceManagerStateReply,
15
+ UpdateInstanceManagerStateRequest,
16
+ )
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class InstanceUpdatedSubscriber(ABC):
    """Subscribers to instance status changes.

    Implementations are notified by InstanceManager with the batch of
    update events after the updates have been successfully persisted.
    """

    @abstractmethod
    def notify(self, events: List[InstanceUpdateEvent]) -> None:
        """Handle a batch of successfully applied instance update events."""
        pass
27
+
28
+
29
class InstanceManager:
    """
    See `InstanceManagerService` in instance_manager.proto

    This handles updates to an instance, or inserts a new instance if
    it's an insert update. We should only be inserting new instances
    of the below statuses:
        1. ALLOCATED: For unmanaged instance not initialized by InstanceManager,
           e.g. head node
        2. QUEUED: For new instance being queued to launch.
        3. TERMINATING: For leaked cloud instance that needs to be terminated.

    For full status transitions, see:
    https://docs.google.com/document/d/1NzQjA8Mh-oMc-QxXOa529oneWCoA8sDiVoNkBqqDb4U/edit#heading=h.k9a1sp4qpqj4

    Not thread safe, should be used as a singleton.
    """

    def __init__(
        self,
        instance_storage: InstanceStorage,
        instance_status_update_subscribers: Optional[List[InstanceUpdatedSubscriber]],
    ):
        # Versioned store of Instance protos; versions drive the optimistic
        # concurrency checks below.
        self._instance_storage = instance_storage
        # Subscribers notified only after a batch of updates is persisted.
        self._status_update_subscribers = instance_status_update_subscribers or []

    def update_instance_manager_state(
        self, request: UpdateInstanceManagerStateRequest
    ) -> UpdateInstanceManagerStateReply:
        """
        Updates the instance manager state.

        If there's any failure, no updates would be made and the reply
        would contain the latest version of the instance manager state,
        and the error info.

        Args:
            request: The request to update the instance manager state.

        Returns:
            The reply to the request.
        """

        # Handle updates
        ids_to_updates = {update.instance_id: update for update in request.updates}
        to_update_instances, version = self._instance_storage.get_instances(
            instance_ids=ids_to_updates.keys()
        )

        # NOTE(review): a negative expected_version appears to mean "skip the
        # version check" — confirm against the proto definition.
        if request.expected_version >= 0 and request.expected_version != version:
            err_str = (
                f"Version mismatch: expected: {request.expected_version}, "
                f"actual: {version}"
            )
            logger.warning(err_str)
            return self._get_update_im_state_reply(
                StatusCode.VERSION_MISMATCH,
                version,
                err_str,
            )

        # Handle instances states update.
        to_upsert_instances = []
        for instance_id, update in ids_to_updates.items():
            if instance_id in to_update_instances:
                # Known instance: apply a status transition to it.
                instance = self._update_instance(
                    to_update_instances[instance_id], update
                )
            else:
                # Unknown instance id: the update describes a new instance.
                instance = self._create_instance(update)

            to_upsert_instances.append(instance)

        # Updates the instance storage.
        result = self._instance_storage.batch_upsert_instances(
            updates=to_upsert_instances,
            expected_storage_version=version,
        )

        if not result.success:
            if result.version != version:
                # Storage advanced concurrently between our read and write.
                err_str = (
                    f"Version mismatch: expected: {version}, actual: {result.version}"
                )
                logger.warning(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.VERSION_MISMATCH, result.version, err_str
                )
            else:
                err_str = "Failed to update instance storage."
                logger.error(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.UNKNOWN_ERRORS, result.version, err_str
                )

        # Successful updates: only now do subscribers observe the events.
        for subscriber in self._status_update_subscribers:
            subscriber.notify(request.updates)

        return self._get_update_im_state_reply(StatusCode.OK, result.version)

    def get_instance_manager_state(
        self, request: GetInstanceManagerStateRequest
    ) -> GetInstanceManagerStateReply:
        """
        Gets the instance manager state.

        Args:
            request: The request to get the instance manager state.

        Returns:
            The reply to the request.
        """
        reply = GetInstanceManagerStateReply()
        instances, version = self._instance_storage.get_instances()
        reply.state.instances.extend(instances.values())
        reply.state.version = version
        reply.status.code = StatusCode.OK

        return reply

    #########################################
    # Private methods
    #########################################

    @staticmethod
    def _get_update_im_state_reply(
        status_code: StatusCode, version: int, error_message: str = ""
    ) -> UpdateInstanceManagerStateReply:
        """
        Returns a UpdateInstanceManagerStateReply with the given status code and
        version.

        Args:
            status_code: The status code.
            version: The version.
            error_message: The error message if any.

        Returns:
            The reply.
        """
        reply = UpdateInstanceManagerStateReply()
        reply.status.code = status_code
        reply.version = version
        if error_message:
            reply.status.message = error_message
        return reply

    @staticmethod
    def _apply_update(instance: Instance, update: InstanceUpdateEvent):
        """
        Apply status specific update to the instance.

        Args:
            instance: The instance to update.
            update: The update to apply.
        """
        if update.new_instance_status == Instance.ALLOCATED:
            assert (
                update.cloud_instance_id
            ), "ALLOCATED update must have cloud_instance_id"
            assert update.node_kind in [
                NodeKind.WORKER,
                NodeKind.HEAD,
            ], "ALLOCATED update must have node_kind as WORKER or HEAD"
            assert update.instance_type, "ALLOCATED update must have instance_type"
            # NOTE(review): this duplicates the cloud_instance_id assertion
            # above; harmless, but one of the two could be removed.
            assert (
                update.cloud_instance_id
            ), "ALLOCATED update must have cloud_instance_id"
            instance.cloud_instance_id = update.cloud_instance_id
            instance.node_kind = update.node_kind
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.RAY_RUNNING:
            assert update.ray_node_id, "RAY_RUNNING update must have ray_node_id"
            instance.node_id = update.ray_node_id
        elif update.new_instance_status == Instance.REQUESTED:
            assert (
                update.launch_request_id
            ), "REQUESTED update must have launch_request_id"
            assert update.instance_type, "REQUESTED update must have instance_type"
            instance.launch_request_id = update.launch_request_id
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.TERMINATING:
            # TERMINATING only validates; the cloud instance id is already
            # set on the instance (or by _create_instance for leaked nodes).
            assert (
                update.cloud_instance_id
            ), "TERMINATING update must have cloud instance id"

    @staticmethod
    def _create_instance(update: InstanceUpdateEvent) -> Instance:
        """
        Create a new instance from the given update.
        """

        assert update.upsert, "upsert must be true for creating new instance."

        assert update.new_instance_status in [
            # For unmanaged instance not initialized by InstanceManager,
            # e.g. head node
            Instance.ALLOCATED,
            # For new instance being queued to launch.
            Instance.QUEUED,
            # For leaked cloud instance that needs to be terminated.
            Instance.TERMINATING,
        ], (
            "Invalid status for new instance, must be one of "
            "[ALLOCATED, QUEUED, TERMINATING]"
        )

        # Create a new instance first for common fields.
        instance = InstanceUtil.new_instance(
            instance_id=update.instance_id,
            instance_type=update.instance_type,
            status=update.new_instance_status,
            details=update.details,
        )

        # Apply the status specific updates.
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        InstanceManager._apply_update(instance, update)
        return instance

    @staticmethod
    def _update_instance(instance: Instance, update: InstanceUpdateEvent) -> Instance:
        """
        Update the instance with the given update.

        Args:
            instance: The instance to update.
            update: The update to apply.

        Returns:
            The updated instance.
        """
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        # set_status validates the transition against the allowed status
        # graph; an invalid transition is a programming error here.
        assert InstanceUtil.set_status(instance, update.new_instance_status), (
            "Invalid status transition from "
            f"{Instance.InstanceStatus.Name(instance.status)} to "
            f"{Instance.InstanceStatus.Name(update.new_instance_status)}"
        )
        InstanceManager._apply_update(instance, update)

        return instance
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from typing import Dict, List, Optional, Set, Tuple
4
+
5
+ from ray.autoscaler.v2.instance_manager.storage import Storage, StoreStatus
6
+ from ray.core.generated.instance_manager_pb2 import Instance
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class InstanceStorage:
    """Instance storage stores the states of instances in the storage.

    Instances are serialized Instance protos keyed by instance_id in a
    per-cluster table; the underlying Storage supplies versioning used for
    optimistic concurrency control.
    """

    def __init__(
        self,
        cluster_id: str,
        storage: Storage,
    ) -> None:
        self._storage = storage
        self._cluster_id = cluster_id
        # One table per cluster so multiple clusters can share a backend.
        self._table_name = f"instance_table@{cluster_id}"

    def batch_upsert_instances(
        self,
        updates: List[Instance],
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert instances into the storage. If the instance already exists,
        it will be updated. Otherwise, it will be inserted. If the
        expected_storage_version is specified, the update will fail if the
        current storage version does not match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            updates: A list of instances to be upserted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        mutations = {}
        version = self._storage.get_version()
        # handle version mismatch
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        for instance in updates:
            # Deep-copy so the caller's proto is not mutated.
            instance = copy.deepcopy(instance)
            # the instance version is set to 0, it will be
            # populated by the storage entry's version on read
            instance.version = 0
            mutations[instance.instance_id] = instance.SerializeToString()

        result, version = self._storage.batch_update(
            self._table_name, mutations, {}, expected_storage_version
        )

        return StoreStatus(result, version)

    def upsert_instance(
        self,
        instance: Instance,
        expected_instance_version: Optional[int] = None,
        expected_storage_verison: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert an instance in the storage.
        If the expected_instance_version is specified, the update will fail
        if the current instance version does not match the expected version.
        Similarly, if the expected_storage_version is
        specified, the update will fail if the current storage version does not
        match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            instance: The instance to be updated.
            expected_instance_version: The expected instance version.
            expected_storage_verison: The expected storage version.
                (NOTE: parameter name carries a historical typo; kept for
                backward compatibility with existing callers.)

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        # Deep-copy so the caller's proto is not mutated.
        instance = copy.deepcopy(instance)
        # the instance version is set to 0, it will be
        # populated by the storage entry's version on read
        instance.version = 0
        result, version = self._storage.update(
            self._table_name,
            key=instance.instance_id,
            value=instance.SerializeToString(),
            expected_entry_version=expected_instance_version,
            expected_storage_version=expected_storage_verison,
            insert_only=False,
        )

        return StoreStatus(result, version)

    def get_instances(
        self,
        instance_ids: Optional[List[str]] = None,
        status_filter: Optional[Set[int]] = None,
    ) -> Tuple[Dict[str, Instance], int]:
        """Get instances from the storage.

        Args:
            instance_ids: A list of instance ids to be retrieved. If empty, all
                instances will be retrieved.
            status_filter: Only instances with the specified status will be returned.

        Returns:
            Tuple[Dict[str, Instance], int]: A tuple of (instances, version).
            The instances is a dictionary of (instance_id, instance) pairs.
        """
        instance_ids = instance_ids or []
        status_filter = status_filter or set()
        pairs, version = self._storage.get(self._table_name, instance_ids)
        instances = {}
        for instance_id, (instance_data, entry_version) in pairs.items():
            instance = Instance()
            instance.ParseFromString(instance_data)
            # Surface the storage entry version on the deserialized proto.
            instance.version = entry_version
            if status_filter and instance.status not in status_filter:
                continue
            instances[instance_id] = instance
        return instances, version

    def batch_delete_instances(
        self, instance_ids: List[str], expected_storage_version: Optional[int] = None
    ) -> StoreStatus:
        """Delete instances from the storage. If the expected_storage_version
        is specified, the update will fail if the current storage version does
        not match the expected version.

        Args:
            instance_ids: The ids of the instances to be deleted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        version = self._storage.get_version()
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        result = self._storage.batch_update(
            self._table_name, {}, instance_ids, expected_storage_version
        )
        return result
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ import time
4
+ from abc import ABC, abstractmethod
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from dataclasses import dataclass
7
+ from queue import Queue
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from ray.autoscaler._private.constants import (
11
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
12
+ AUTOSCALER_MAX_LAUNCH_BATCH,
13
+ )
14
+ from ray.autoscaler._private.util import hash_launch_conf
15
+ from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
16
+ from ray.autoscaler.tags import (
17
+ NODE_KIND_HEAD,
18
+ NODE_KIND_UNMANAGED,
19
+ NODE_KIND_WORKER,
20
+ STATUS_UNINITIALIZED,
21
+ TAG_RAY_LAUNCH_CONFIG,
22
+ TAG_RAY_LAUNCH_REQUEST,
23
+ TAG_RAY_NODE_KIND,
24
+ TAG_RAY_NODE_NAME,
25
+ TAG_RAY_NODE_STATUS,
26
+ TAG_RAY_USER_NODE_TYPE,
27
+ )
28
+ from ray.autoscaler.v2.instance_manager.config import IConfigReader
29
+ from ray.autoscaler.v2.schema import NodeType
30
+ from ray.core.generated.instance_manager_pb2 import NodeKind
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
# Type Alias. This is a **unique identifier** for a cloud instance in the cluster.
# The provider should guarantee that this id is unique across the cluster,
# such that:
#   - When a cloud instance is created and running, no other cloud instance in the
#     cluster has the same id.
#   - When a cloud instance is terminated, no other cloud instance in the cluster will
#     be assigned the same id later.
CloudInstanceId = str


@dataclass
class CloudInstance:
    """
    A class that represents a cloud instance in the cluster, with necessary metadata
    of the cloud instance.
    """

    # The cloud instance id.
    cloud_instance_id: CloudInstanceId
    # The node type of the cloud instance.
    node_type: NodeType
    # The node kind, i.e head or worker.
    node_kind: NodeKind
    # If the cloud instance is already running.
    is_running: bool
    # Update request id from which the cloud instance is launched.
    # This could be None if the cloud instance couldn't be associated with requests
    # by the cloud provider: e.g. cloud provider doesn't support per-instance
    # extra metadata.
    # This is fine for now since the reconciler should be able to know how
    # to handle cloud instances w/o request ids.
    # TODO: make this a required field.
    request_id: Optional[str] = None
67
+
68
+
69
class CloudInstanceProviderError(Exception):
    """Base class for errors raised by a cloud instance provider.

    Records when the underlying failure happened so that consumers polling
    errors can order them in time.
    """

    # When the error occurred, in nanoseconds.
    timestamp_ns: int

    def __init__(self, msg, timestamp_ns) -> None:
        self.timestamp_ns = timestamp_ns
        super().__init__(msg)
81
+
82
+
83
class LaunchNodeError(CloudInstanceProviderError):
    """Raised/reported when launching a batch of nodes of one type fails."""

    # The node type that failed to launch.
    node_type: NodeType
    # Number of nodes that failed to launch.
    count: int
    # A unique id that identifies from which update request the error originates.
    request_id: str

    def __init__(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        msg = (
            f"Failed to launch {count} nodes of type {node_type} with "
            f"request id {request_id}: {details}"
        )
        super().__init__(msg, timestamp_ns=timestamp_ns)
        self.node_type = node_type
        self.count = count
        self.request_id = request_id
        if cause:
            # Chain the originating exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        # __cause__ is None when no cause was provided; repr then ends
        # with "None".
        return (
            f"LaunchNodeError(node_type={self.node_type}, count={self.count}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
116
+
117
+
118
class TerminateNodeError(CloudInstanceProviderError):
    """Raised/reported when terminating a cloud instance fails."""

    # The cloud instance id of the node that failed to terminate.
    cloud_instance_id: CloudInstanceId
    # A unique id that identifies from which update request the error originates.
    request_id: str

    def __init__(
        self,
        cloud_instance_id: CloudInstanceId,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        msg = (
            f"Failed to terminate node {cloud_instance_id} with "
            f"request id {request_id}: {details}"
        )
        super().__init__(msg, timestamp_ns=timestamp_ns)
        self.cloud_instance_id = cloud_instance_id
        self.request_id = request_id
        if cause:
            # Chain the originating exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        # __cause__ is None when no cause was provided; repr then ends
        # with "None".
        return (
            f"TerminateNodeError(cloud_instance_id={self.cloud_instance_id}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
147
+
148
+
149
class ICloudInstanceProvider(ABC):
    """
    The interface for a cloud instance provider.

    This interface is a minimal interface that should be implemented by the
    various cloud instance providers (e.g. AWS, etc).

    The cloud instance provider is responsible for managing the cloud instances in the
    cluster. It provides the following main functionalities:
        - Launch new cloud instances.
        - Terminate existing running instances.
        - Get the non-terminated cloud instances in the cluster.
        - Poll the errors that happened for the updates to the cloud instance provider.

    Below properties of the cloud instance provider are assumed with this interface:

    1. Eventually consistent
    The cloud instance provider is expected to be eventually consistent with the
    cluster state. For example, when a cloud instance is requested to be terminated
    or launched, the provider may not immediately reflect the change in its state.
    However, the provider is expected to eventually reflect the change in its state.

    2. Asynchronous
    The provider could also be asynchronous, where the termination/launch
    request may not immediately return the result of the request.

    3. Unique cloud instance ids
    Cloud instance ids are expected to be unique across the cluster.

    4. Idempotent updates
    For the update APIs (e.g. ensure_min_nodes, terminate), the provider may use the
    request ids to provide idempotency.

    Usage:
        ```
        provider: ICloudInstanceProvider = ...

        # Update the cluster with a desired shape.
        provider.launch(
            shape={
                "worker_nodes": 10,
                "ray_head": 1,
            },
            request_id="1",
        )

        # Get the non-terminated nodes of the cloud instance provider.
        running = provider.get_non_terminated()

        # Poll the errors
        errors = provider.poll_errors()

        # Terminate nodes.
        provider.terminate(
            ids=["cloud_instance_id_1", "cloud_instance_id_2"],
            request_id="2",
        )

        # Process the state of the provider.
        ...
        ```
    """

    @abstractmethod
    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Get the non-terminated cloud instances in the cluster.

        Returns:
            A dictionary of the non-terminated cloud instances in the cluster.
            The key is the cloud instance id, and the value is the cloud instance.
        """
        pass

    @abstractmethod
    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """
        Terminate the cloud instances asynchronously.

        This method is expected to be idempotent, i.e. if the same request id is used
        to terminate the same cloud instances, this should be a no-op if
        the cloud instances are already terminated or being terminated.

        Args:
            ids: the cloud instance ids to terminate.
            request_id: a unique id that identifies the request.
        """
        pass

    @abstractmethod
    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """Launch the cloud instances asynchronously.

        Args:
            shape: A map from node type to number of nodes to launch.
            request_id: a unique id that identifies the update request.
        """
        pass

    @abstractmethod
    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """
        Poll the errors that happened since the last poll.

        This method would also clear the errors that happened since the last poll.

        Returns:
            The errors that happened since the last poll.
        """
        pass
262
+
263
+
264
@dataclass(frozen=True)
class CloudInstanceLaunchRequest:
    """
    The arguments to launch a node.

    Frozen so requests can be safely shared across threads and used in sets.
    """

    # The node type to launch.
    node_type: NodeType
    # Number of nodes to launch.
    count: int
    # A unique id that identifies the request.
    request_id: str
276
+
277
+
278
@dataclass(frozen=True)
class CloudInstanceTerminateRequest:
    """
    The arguments to terminate a node.

    Frozen so requests can be safely shared across threads and used in sets.
    """

    # The cloud instance id of the node to terminate.
    cloud_instance_id: CloudInstanceId
    # A unique id that identifies the request.
    request_id: str
288
+
289
+
290
class NodeProviderAdapter(ICloudInstanceProvider):
    """
    Wraps a NodeProviderV1 to a ICloudInstanceProvider.

    TODO(rickyx):
    The current adapter right now consists of two sets of APIs:
    - v1: the old APIs that are used by the autoscaler, where
      we forward the calls to the NodeProviderV1.
    - v2: the new APIs that are used by the autoscaler v2, this is
      defined in the ICloudInstanceProvider interface.

    We should eventually remove the v1 APIs and only use the v2 APIs.
    It's currently left as a TODO since changing the v1 APIs would
    requires a lot of changes in the cluster launcher codebase.
    """

    def __init__(
        self,
        v1_provider: NodeProviderV1,
        config_reader: IConfigReader,
        max_launch_batch_per_type: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
    ) -> None:
        """
        Args:
            v1_provider: The v1 node provider to wrap.
            config_reader: The config reader to read the autoscaling config.
            max_launch_batch_per_type: The maximum number of nodes to launch per
                node type in a single batch.
            max_concurrent_launches: The maximum number of concurrent launches.
        """

        super().__init__()
        self._v1_provider = v1_provider
        self._config_reader = config_reader
        # Executor to async launching and terminating nodes.
        # Single worker so launch/terminate submissions are serialized in
        # the order they were received.
        self._main_executor = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="ray::NodeProviderAdapter"
        )

        # v1 legacy rate limiting on the node provider launch calls.
        self._max_launch_batch_per_type = max_launch_batch_per_type
        # Cap the worker count so at most max_concurrent_launches nodes can
        # be launching at once across all batches.
        max_batches = math.ceil(
            max_concurrent_launches / float(max_launch_batch_per_type)
        )
        self._node_launcher_executors = ThreadPoolExecutor(
            max_workers=max_batches,
            thread_name_prefix="ray::NodeLauncherPool",
        )

        # Queue to retrieve new errors occur in the multi-thread executors
        # temporarily. Queue is thread-safe, so worker threads can publish
        # errors while poll_errors() drains them.
        self._errors_queue = Queue()

    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Return non-terminated, Ray-managed cloud instances keyed by id.

        Unmanaged nodes (no Ray node-kind tag) are filtered out.
        """
        nodes = {}

        cloud_instance_ids = self._v1_non_terminated_nodes({})
        # Filter out nodes that are not running.
        # This is efficient since the provider is expected to cache the
        # running status of the nodes.
        for cloud_instance_id in cloud_instance_ids:
            node_tags = self._v1_node_tags(cloud_instance_id)
            node_kind_tag = node_tags.get(TAG_RAY_NODE_KIND, NODE_KIND_UNMANAGED)
            if node_kind_tag == NODE_KIND_UNMANAGED:
                # Filter out unmanaged nodes.
                continue
            elif node_kind_tag == NODE_KIND_WORKER:
                node_kind = NodeKind.WORKER
            elif node_kind_tag == NODE_KIND_HEAD:
                node_kind = NodeKind.HEAD
            else:
                raise ValueError(f"Invalid node kind: {node_kind_tag}")

            nodes[cloud_instance_id] = CloudInstance(
                cloud_instance_id=cloud_instance_id,
                node_type=node_tags.get(TAG_RAY_USER_NODE_TYPE, ""),
                is_running=self._v1_is_running(cloud_instance_id),
                request_id=node_tags.get(TAG_RAY_LAUNCH_REQUEST, ""),
                node_kind=node_kind,
            )

        return nodes

    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """Drain and return errors accumulated since the last poll."""
        errors = []
        while not self._errors_queue.empty():
            errors.append(self._errors_queue.get_nowait())
        return errors

    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        # Asynchronous: the actual work happens on the main executor.
        self._main_executor.submit(self._do_launch, shape, request_id)

    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        # Asynchronous: the actual work happens on the main executor.
        self._main_executor.submit(self._do_terminate, ids, request_id)

    ###########################################
    # Private APIs
    ###########################################

    def _do_launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """
        Launch the cloud instances by calling into the v1 base node provider.

        Args:
            shape: The requested to launch node type and number of nodes.
            request_id: The request id that identifies the request.
        """
        for node_type, count in shape.items():
            # Keep submitting the launch requests to the launch pool in batches.
            while count > 0:
                to_launch = min(count, self._max_launch_batch_per_type)
                self._node_launcher_executors.submit(
                    self._launch_nodes_by_type,
                    node_type,
                    to_launch,
                    request_id,
                )
                count -= to_launch

    def _do_terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """
        Terminate the cloud instances by calling into the v1 base node provider.

        If errors happen during the termination, the errors will be put into the
        errors queue.

        Args:
            ids: The cloud instance ids to terminate.
            request_id: The request id that identifies the request.
        """

        try:
            self._v1_terminate_nodes(ids)
        except Exception as e:
            # A batch failure is reported per instance id so consumers can
            # reconcile each instance independently.
            for id in ids:
                error = TerminateNodeError(id, request_id, int(time.time_ns()))
                error.__cause__ = e
                self._errors_queue.put(error)

    def _launch_nodes_by_type(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
    ) -> None:
        """
        Launch nodes of the given node type.

        Args:
            node_type: The node type to launch.
            count: Number of nodes to launch.
            request_id: A unique id that identifies the request.

        Raises:
            ValueError: If the node type is invalid.
            LaunchNodeError: If the launch failed and raised by the underlying provider.
        """
        # Check node type is valid.
        try:
            config = self._config_reader.get_cached_autoscaling_config()
            launch_config = config.get_cloud_node_config(node_type)
            resources = config.get_node_resources(node_type)
            labels = config.get_node_labels(node_type)

            # This is to be compatible with the v1 node launcher.
            # See more in https://github.com/ray-project/ray/blob/6f5a189bc463e52c51a70f8aea41fb2950b443e8/python/ray/autoscaler/_private/node_launcher.py#L78-L85 # noqa
            # TODO: this should be synced with what's stored in the IM, it should
            # probably be made as a metadata field in the cloud instance. This is
            # another incompatibility with KubeRay.
            launch_hash = hash_launch_conf(launch_config, config.get_config("auth", {}))
            node_tags = {
                TAG_RAY_NODE_NAME: "ray-{}-worker".format(
                    config.get_config("cluster_name", "")
                ),
                TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
                TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
                TAG_RAY_LAUNCH_CONFIG: launch_hash,
                TAG_RAY_LAUNCH_REQUEST: request_id,
                TAG_RAY_USER_NODE_TYPE: node_type,
            }

            logger.info("Launching {} nodes of type {}.".format(count, node_type))
            self._v1_provider.create_node_with_resources_and_labels(
                launch_config, node_tags, count, resources, labels
            )
            logger.info("Launched {} nodes of type {}.".format(count, node_type))
        except Exception as e:
            # Any failure (bad node type or provider error) is surfaced via
            # the errors queue rather than propagated to the executor.
            error = LaunchNodeError(node_type, count, request_id, int(time.time_ns()))
            error.__cause__ = e
            self._errors_queue.put(error)

    ###########################################
    # V1 Legacy APIs
    ###########################################
    """
    Below are the necessary legacy APIs from the V1 node provider.
    These are needed as of now to provide the needed features
    for V2 node provider.
    The goal is to eventually remove these APIs and only use the
    V2 APIs by modifying the individual node provider to inherit
    from ICloudInstanceProvider.
    """

    def _v1_terminate_nodes(
        self, ids: List[CloudInstanceId]
    ) -> Optional[Dict[str, Any]]:
        return self._v1_provider.terminate_nodes(ids)

    def _v1_non_terminated_nodes(
        self, tag_filters: Dict[str, str]
    ) -> List[CloudInstanceId]:
        return self._v1_provider.non_terminated_nodes(tag_filters)

    def _v1_is_running(self, node_id: CloudInstanceId) -> bool:
        return self._v1_provider.is_running(node_id)

    def _v1_post_process(self) -> None:
        self._v1_provider.post_process()

    def _v1_node_tags(self, node_id: CloudInstanceId) -> Dict[str, str]:
        return self._v1_provider.node_tags(node_id)

    def _v1_safe_to_scale(self) -> bool:
        return self._v1_provider.safe_to_scale()
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import subprocess
4
+
5
+ from ray.autoscaler._private.updater import NodeUpdater
6
+ from ray.autoscaler._private.util import with_envs, with_head_node_ip
7
+ from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
8
+ from ray.autoscaler.v2.instance_manager.config import AutoscalingConfig
9
+ from ray.core.generated.instance_manager_pb2 import Instance
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @dataclasses.dataclass(frozen=True)
15
+ class RayInstallError:
16
+ # Instance manager's instance id.
17
+ im_instance_id: str
18
+ # Error details.
19
+ details: str
20
+
21
+
22
+ class RayInstaller(object):
23
+ """
24
+ RayInstaller is responsible for installing ray on the target instance.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ provider: NodeProviderV1,
30
+ config: AutoscalingConfig,
31
+ process_runner=subprocess,
32
+ ) -> None:
33
+ self._provider = provider
34
+ self._config = config
35
+ self._process_runner = process_runner
36
+
37
+ def install_ray(self, instance: Instance, head_node_ip: str) -> bool:
38
+ """
39
+ Install ray on the target instance synchronously.
40
+ TODO:(rickyx): This runs in another thread, and errors are silently
41
+ ignored. We should propagate the error to the main thread.
42
+ """
43
+ setup_commands = self._config.get_worker_setup_commands(instance.instance_type)
44
+ ray_start_commands = self._config.get_worker_start_ray_commands()
45
+ docker_config = self._config.get_docker_config(instance.instance_type)
46
+
47
+ logger.info(
48
+ f"Creating new (spawn_updater) updater thread for node"
49
+ f" {instance.cloud_instance_id}."
50
+ )
51
+ provider_instance_type_name = self._config.get_provider_instance_type(
52
+ instance.instance_type
53
+ )
54
+ updater = NodeUpdater(
55
+ node_id=instance.instance_id,
56
+ provider_config=self._config.get_config("provider"),
57
+ provider=self._provider,
58
+ auth_config=self._config.get_config("auth"),
59
+ cluster_name=self._config.get_config("cluster_name"),
60
+ file_mounts=self._config.get_config("file_mounts"),
61
+ initialization_commands=with_head_node_ip(
62
+ self._config.get_initialization_commands(instance.instance_type),
63
+ head_node_ip,
64
+ ),
65
+ setup_commands=with_head_node_ip(setup_commands, head_node_ip),
66
+ # This will prepend envs to the begin of the ray start commands, e.g.
67
+ # `RAY_HEAD_IP=<head_node_ip> \
68
+ # RAY_CLOUD_INSTANCE_ID=<instance_id> \
69
+ # ray start --head ...`
70
+ # See src/ray/common/constants.h for ENV name definitions.
71
+ ray_start_commands=with_envs(
72
+ ray_start_commands,
73
+ {
74
+ "RAY_HEAD_IP": head_node_ip,
75
+ "RAY_CLOUD_INSTANCE_ID": instance.instance_id,
76
+ "RAY_NODE_TYPE_NAME": instance.instance_type,
77
+ "RAY_CLOUD_INSTANCE_TYPE_NAME": provider_instance_type_name,
78
+ },
79
+ ),
80
+ runtime_hash=self._config.runtime_hash,
81
+ file_mounts_contents_hash=self._config.file_mounts_contents_hash,
82
+ is_head_node=False,
83
+ cluster_synced_files=self._config.get_config("cluster_synced_files"),
84
+ rsync_options={
85
+ "rsync_exclude": self._config.get_config("rsync_exclude"),
86
+ "rsync_filter": self._config.get_config("rsync_filter"),
87
+ },
88
+ use_internal_ip=True,
89
+ docker_config=docker_config,
90
+ node_resources=self._config.get_node_resources(instance.instance_type),
91
+ node_labels=self._config.get_node_labels(instance.instance_type),
92
+ process_runner=self._process_runner,
93
+ )
94
+ try:
95
+ updater.run()
96
+ except Exception:
97
+ # Errors has already been handled.
98
+ return False
99
+ return True
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py ADDED
@@ -0,0 +1,1565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ import time
4
+ import uuid
5
+ from collections import defaultdict
6
+ from typing import Dict, List, Optional, Set, Tuple
7
+
8
+ from ray._private.utils import binary_to_hex
9
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
10
+ from ray.autoscaler.v2.instance_manager.config import (
11
+ AutoscalingConfig,
12
+ InstanceReconcileConfig,
13
+ Provider,
14
+ )
15
+ from ray.autoscaler.v2.instance_manager.instance_manager import InstanceManager
16
+ from ray.autoscaler.v2.instance_manager.node_provider import (
17
+ CloudInstance,
18
+ CloudInstanceId,
19
+ CloudInstanceProviderError,
20
+ ICloudInstanceProvider,
21
+ LaunchNodeError,
22
+ TerminateNodeError,
23
+ )
24
+ from ray.autoscaler.v2.instance_manager.ray_installer import RayInstallError
25
+ from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError
26
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
27
+ from ray.autoscaler.v2.scheduler import IResourceScheduler, SchedulingRequest
28
+ from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
29
+ from ray.autoscaler.v2.sdk import is_head_node
30
+ from ray.core.generated.autoscaler_pb2 import (
31
+ AutoscalingState,
32
+ ClusterResourceState,
33
+ FailedInstanceRequest,
34
+ NodeState,
35
+ NodeStatus,
36
+ PendingInstance,
37
+ PendingInstanceRequest,
38
+ )
39
+ from ray.core.generated.instance_manager_pb2 import GetInstanceManagerStateRequest
40
+ from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
41
+ from ray.core.generated.instance_manager_pb2 import (
42
+ InstanceUpdateEvent as IMInstanceUpdateEvent,
43
+ )
44
+ from ray.core.generated.instance_manager_pb2 import (
45
+ NodeKind,
46
+ StatusCode,
47
+ UpdateInstanceManagerStateRequest,
48
+ )
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ class Reconciler:
54
+ """
55
+ A singleton class that reconciles the instance states of the instance manager
56
+ for autoscaler.
57
+
58
+ """
59
+
60
+ @staticmethod
61
+ def reconcile(
62
+ instance_manager: InstanceManager,
63
+ scheduler: IResourceScheduler,
64
+ cloud_provider: ICloudInstanceProvider,
65
+ ray_cluster_resource_state: ClusterResourceState,
66
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
67
+ autoscaling_config: AutoscalingConfig,
68
+ cloud_provider_errors: Optional[List[CloudInstanceProviderError]] = None,
69
+ ray_install_errors: Optional[List[RayInstallError]] = None,
70
+ ray_stop_errors: Optional[List[RayStopError]] = None,
71
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
72
+ _logger: Optional[logging.Logger] = None,
73
+ ) -> AutoscalingState:
74
+ """
75
+ The reconcile method computes InstanceUpdateEvents for the instance manager
76
+ by:
77
+
78
+ 1. Reconciling the instance manager's instances with external states like
79
+ the cloud provider's, the ray cluster's states, the ray installer's results.
80
+ It performs "passive" status transitions for the instances (where the status
81
+ transition should only be reflecting the external states of the cloud provider
82
+ and the ray cluster, and should not be actively changing them)
83
+
84
+ 2. Stepping the instances to the active states by computing instance status
85
+ transitions that are needed and updating the instance manager's state.
86
+ These transitions should be "active" where the transitions have side effects
87
+ (through InstanceStatusSubscriber) to the cloud provider and the ray cluster.
88
+
89
+ Args:
90
+ instance_manager: The instance manager to reconcile.
91
+ ray_cluster_resource_state: The ray cluster's resource state.
92
+ non_terminated_cloud_instances: The non-terminated cloud instances from
93
+ the cloud provider.
94
+ cloud_provider_errors: The errors from the cloud provider.
95
+ ray_install_errors: The errors from RayInstaller.
96
+ ray_stop_errors: The errors from RayStopper.
97
+ metrics_reporter: The metric reporter to report the autoscaler metrics.
98
+ _logger: The logger (for testing).
99
+
100
+ """
101
+ cloud_provider_errors = cloud_provider_errors or []
102
+ ray_install_errors = ray_install_errors or []
103
+ ray_stop_errors = ray_stop_errors or []
104
+
105
+ autoscaling_state = AutoscalingState()
106
+ autoscaling_state.last_seen_cluster_resource_state_version = (
107
+ ray_cluster_resource_state.cluster_resource_state_version
108
+ )
109
+ Reconciler._sync_from(
110
+ instance_manager=instance_manager,
111
+ ray_nodes=ray_cluster_resource_state.node_states,
112
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
113
+ cloud_provider_errors=cloud_provider_errors,
114
+ ray_install_errors=ray_install_errors,
115
+ ray_stop_errors=ray_stop_errors,
116
+ autoscaling_config=autoscaling_config,
117
+ )
118
+
119
+ Reconciler._step_next(
120
+ autoscaling_state=autoscaling_state,
121
+ instance_manager=instance_manager,
122
+ scheduler=scheduler,
123
+ cloud_provider=cloud_provider,
124
+ ray_cluster_resource_state=ray_cluster_resource_state,
125
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
126
+ autoscaling_config=autoscaling_config,
127
+ _logger=_logger,
128
+ )
129
+
130
+ Reconciler._report_metrics(
131
+ instance_manager=instance_manager,
132
+ autoscaling_config=autoscaling_config,
133
+ metrics_reporter=metrics_reporter,
134
+ )
135
+
136
+ return autoscaling_state
137
+
138
+ @staticmethod
139
+ def _sync_from(
140
+ instance_manager: InstanceManager,
141
+ ray_nodes: List[NodeState],
142
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
143
+ cloud_provider_errors: List[CloudInstanceProviderError],
144
+ ray_install_errors: List[RayInstallError],
145
+ ray_stop_errors: List[RayStopError],
146
+ autoscaling_config: AutoscalingConfig,
147
+ ):
148
+ """
149
+ Reconcile the instance states of the instance manager from external states like
150
+ the cloud provider's, the ray cluster's states, the ray installer's results,
151
+ etc.
152
+
153
+ For each instance, we try to figure out if we need to transition the instance
154
+ status to a new status, and if so, what the new status should be.
155
+
156
+ These transitions should be purely "passive", meaning they should only be
157
+ reflecting the external states of the cloud provider and the ray cluster,
158
+ and should not be actively changing the states of the cloud provider or the ray
159
+ cluster.
160
+
161
+ More specifically, we will reconcile status transitions for:
162
+ 1. QUEUED/REQUESTED -> ALLOCATED:
163
+ When a instance with launch request id (indicating a previous launch
164
+ request was made) could be assigned to an unassigned cloud instance
165
+ of the same instance type.
166
+ 2. REQUESTED -> ALLOCATION_FAILED:
167
+ When there's an error from the cloud provider for launch failure so
168
+ that the instance becomes ALLOCATION_FAILED.
169
+ 3. * -> RAY_RUNNING:
170
+ When a ray node on a cloud instance joins the ray cluster, we will
171
+ transition the instance to RAY_RUNNING.
172
+ 4. * -> TERMINATED:
173
+ When the cloud instance is already terminated, we will transition the
174
+ instance to TERMINATED.
175
+ 5. TERMINATING -> TERMINATION_FAILED:
176
+ When there's an error from the cloud provider for termination failure.
177
+ 6. * -> RAY_STOPPED:
178
+ When ray was stopped on the cloud instance, we will transition the
179
+ instance to RAY_STOPPED.
180
+ 7. * -> RAY_INSTALL_FAILED:
181
+ When there's an error from RayInstaller.
182
+ 8. RAY_STOP_REQUESTED -> RAY_RUNNING:
183
+ When requested to stop ray, but failed to stop/drain the ray node
184
+ (e.g. idle termination drain rejected by the node).
185
+
186
+ Args:
187
+ instance_manager: The instance manager to reconcile.
188
+ ray_nodes: The ray cluster's states of ray nodes.
189
+ non_terminated_cloud_instances: The non-terminated cloud instances from
190
+ the cloud provider.
191
+ cloud_provider_errors: The errors from the cloud provider.
192
+ ray_install_errors: The errors from RayInstaller.
193
+ ray_stop_errors: The errors from RayStopper.
194
+
195
+ """
196
+
197
+ # Handle 1 & 2 for cloud instance allocation.
198
+ Reconciler._handle_cloud_instance_allocation(
199
+ instance_manager,
200
+ non_terminated_cloud_instances,
201
+ cloud_provider_errors,
202
+ )
203
+ Reconciler._handle_cloud_instance_terminated(
204
+ instance_manager, non_terminated_cloud_instances
205
+ )
206
+
207
+ Reconciler._handle_cloud_instance_termination_errors(
208
+ instance_manager, cloud_provider_errors
209
+ )
210
+
211
+ Reconciler._handle_extra_cloud_instances(
212
+ instance_manager, non_terminated_cloud_instances, ray_nodes
213
+ )
214
+
215
+ Reconciler._handle_ray_status_transition(
216
+ instance_manager, ray_nodes, autoscaling_config
217
+ )
218
+
219
+ Reconciler._handle_ray_install_failed(instance_manager, ray_install_errors)
220
+
221
+ Reconciler._handle_ray_stop_failed(instance_manager, ray_stop_errors, ray_nodes)
222
+
223
+ @staticmethod
224
+ def _step_next(
225
+ autoscaling_state: AutoscalingState,
226
+ instance_manager: InstanceManager,
227
+ scheduler: IResourceScheduler,
228
+ cloud_provider: ICloudInstanceProvider,
229
+ ray_cluster_resource_state: ClusterResourceState,
230
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
231
+ autoscaling_config: AutoscalingConfig,
232
+ _logger: Optional[logging.Logger] = None,
233
+ ):
234
+ """
235
+ Step the reconciler to the next state by computing instance status transitions
236
+ that are needed and updating the instance manager's state.
237
+
238
+ Specifically, we will:
239
+ 1. Shut down leak cloud instances
240
+ Leaked cloud instances that are not managed by the instance manager.
241
+ 2. Terminating instances with ray stopped or ray install failure.
242
+ 3. Scale down the cluster:
243
+ (* -> RAY_STOP_REQUESTED/TERMINATING)
244
+ b. Extra cloud due to max nodes config.
245
+ c. Cloud instances with outdated configs.
246
+ 4. Scale up the cluster:
247
+ (new QUEUED)
248
+ Create new instances based on the IResourceScheduler's decision for
249
+ scaling up.
250
+ 5. Request cloud provider to launch new instances.
251
+ (QUEUED -> REQUESTED)
252
+ 6. Install ray
253
+ (ALLOCATED -> RAY_INSTALLING)
254
+ When ray could be installed and launched.
255
+ 7. Handle any stuck instances with timeouts.
256
+
257
+ Args:
258
+ instance_manager: The instance manager to reconcile.
259
+ scheduler: The resource scheduler to make scaling decisions.
260
+ ray_cluster_resource_state: The ray cluster's resource state.
261
+ non_terminated_cloud_instances: The non-terminated cloud instances from
262
+ the cloud provider.
263
+ autoscaling_config: The autoscaling config.
264
+ _logger: The logger (for testing).
265
+
266
+ """
267
+
268
+ Reconciler._handle_stuck_instances(
269
+ instance_manager=instance_manager,
270
+ reconcile_config=autoscaling_config.get_instance_reconcile_config(),
271
+ _logger=_logger or logger,
272
+ )
273
+
274
+ Reconciler._scale_cluster(
275
+ autoscaling_state=autoscaling_state,
276
+ instance_manager=instance_manager,
277
+ ray_state=ray_cluster_resource_state,
278
+ scheduler=scheduler,
279
+ autoscaling_config=autoscaling_config,
280
+ )
281
+
282
+ Reconciler._handle_instances_launch(
283
+ instance_manager=instance_manager, autoscaling_config=autoscaling_config
284
+ )
285
+
286
+ Reconciler._terminate_instances(instance_manager=instance_manager)
287
+ if not autoscaling_config.disable_node_updaters():
288
+ Reconciler._install_ray(
289
+ instance_manager=instance_manager,
290
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
291
+ )
292
+
293
+ Reconciler._fill_autoscaling_state(
294
+ instance_manager=instance_manager, autoscaling_state=autoscaling_state
295
+ )
296
+
297
+ #######################################################
298
+ # Utility methods for reconciling instance states.
299
+ #######################################################
300
+
301
+ @staticmethod
302
+ def _handle_cloud_instance_allocation(
303
+ instance_manager: InstanceManager,
304
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
305
+ cloud_provider_errors: List[CloudInstanceProviderError],
306
+ ):
307
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
308
+ updates = {}
309
+
310
+ # Compute intermediate states.
311
+
312
+ instances_with_launch_requests: List[IMInstance] = []
313
+ for instance in im_instances:
314
+ if instance.status != IMInstance.REQUESTED:
315
+ continue
316
+
317
+ assert (
318
+ instance.launch_request_id
319
+ ), "Instance in REQUESTED status should have launch_request_id set."
320
+ instances_with_launch_requests.append(instance)
321
+
322
+ assigned_cloud_instance_ids: Set[CloudInstanceId] = {
323
+ instance.cloud_instance_id for instance in im_instances
324
+ }
325
+ launch_errors: Dict[str, LaunchNodeError] = {
326
+ error.request_id: error
327
+ for error in cloud_provider_errors
328
+ if isinstance(error, LaunchNodeError)
329
+ }
330
+ unassigned_cloud_instances_by_type: Dict[
331
+ str, List[CloudInstance]
332
+ ] = defaultdict(list)
333
+
334
+ for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items():
335
+ if cloud_instance_id not in assigned_cloud_instance_ids:
336
+ unassigned_cloud_instances_by_type[cloud_instance.node_type].append(
337
+ cloud_instance
338
+ )
339
+
340
+ # Sort the request instance by the increasing request time.
341
+ instances_with_launch_requests.sort(
342
+ key=lambda instance: InstanceUtil.get_status_transition_times_ns(
343
+ instance, IMInstance.REQUESTED
344
+ )
345
+ )
346
+
347
+ # For each instance, try to allocate or fail the allocation.
348
+ for instance in instances_with_launch_requests:
349
+ # Try allocate or fail with errors.
350
+ update_event = Reconciler._try_resolve_pending_allocation(
351
+ instance, unassigned_cloud_instances_by_type, launch_errors
352
+ )
353
+ if not update_event:
354
+ continue
355
+
356
+ updates[instance.instance_id] = update_event
357
+
358
+ # Update the instance manager for the events.
359
+ Reconciler._update_instance_manager(instance_manager, version, updates)
360
+
361
+ @staticmethod
362
+ def _try_resolve_pending_allocation(
363
+ im_instance: IMInstance,
364
+ unassigned_cloud_instances_by_type: Dict[str, List[CloudInstance]],
365
+ launch_errors: Dict[str, LaunchNodeError],
366
+ ) -> Optional[IMInstanceUpdateEvent]:
367
+ """
368
+ Allocate, or fail the cloud instance allocation for the instance.
369
+
370
+ Args:
371
+ im_instance: The instance to allocate or fail.
372
+ unassigned_cloud_instances_by_type: The unassigned cloud instances by type.
373
+ launch_errors: The launch errors from the cloud provider.
374
+
375
+ Returns:
376
+ Instance update to ALLOCATED: if there's a matching unassigned cloud
377
+ instance with the same type.
378
+ Instance update to ALLOCATION_FAILED: if the instance allocation failed
379
+ with errors.
380
+ None: if there's no update.
381
+
382
+ """
383
+ unassigned_cloud_instance = None
384
+
385
+ # Try to allocate an unassigned cloud instance.
386
+ # TODO(rickyx): We could also look at the launch request id
387
+ # on the cloud node and the im instance later once all node providers
388
+ # support request id. For now, we only look at the instance type.
389
+ if len(unassigned_cloud_instances_by_type.get(im_instance.instance_type, [])):
390
+ unassigned_cloud_instance = unassigned_cloud_instances_by_type[
391
+ im_instance.instance_type
392
+ ].pop()
393
+
394
+ if unassigned_cloud_instance:
395
+ return IMInstanceUpdateEvent(
396
+ instance_id=im_instance.instance_id,
397
+ new_instance_status=IMInstance.ALLOCATED,
398
+ cloud_instance_id=unassigned_cloud_instance.cloud_instance_id,
399
+ node_kind=unassigned_cloud_instance.node_kind,
400
+ instance_type=unassigned_cloud_instance.node_type,
401
+ details=(
402
+ "allocated unassigned cloud instance "
403
+ f"{unassigned_cloud_instance.cloud_instance_id}"
404
+ ),
405
+ )
406
+
407
+ # If there's a launch error, transition to ALLOCATION_FAILED.
408
+ launch_error = launch_errors.get(im_instance.launch_request_id)
409
+ if launch_error and launch_error.node_type == im_instance.instance_type:
410
+ return IMInstanceUpdateEvent(
411
+ instance_id=im_instance.instance_id,
412
+ new_instance_status=IMInstance.ALLOCATION_FAILED,
413
+ details=f"launch failed with {str(launch_error)}",
414
+ )
415
+ # No update.
416
+ return None
417
+
418
+ @staticmethod
419
+ def _handle_ray_stop_failed(
420
+ instance_manager: InstanceManager,
421
+ ray_stop_errors: List[RayStopError],
422
+ ray_nodes: List[NodeState],
423
+ ):
424
+ """
425
+ The instance requested to stop ray, but failed to stop/drain the ray node.
426
+ E.g. connection errors, idle termination drain rejected by the node.
427
+
428
+ We will transition the instance back to RAY_RUNNING.
429
+
430
+ Args:
431
+ instance_manager: The instance manager to reconcile.
432
+ ray_stop_errors: The errors from RayStopper.
433
+
434
+ """
435
+ instances, version = Reconciler._get_im_instances(instance_manager)
436
+ updates = {}
437
+
438
+ ray_stop_errors_by_instance_id = {
439
+ error.im_instance_id: error for error in ray_stop_errors
440
+ }
441
+
442
+ ray_nodes_by_ray_node_id = {binary_to_hex(n.node_id): n for n in ray_nodes}
443
+
444
+ ray_stop_requested_instances = {
445
+ instance.instance_id: instance
446
+ for instance in instances
447
+ if instance.status == IMInstance.RAY_STOP_REQUESTED
448
+ }
449
+
450
+ for instance_id, instance in ray_stop_requested_instances.items():
451
+ stop_error = ray_stop_errors_by_instance_id.get(instance_id)
452
+ if not stop_error:
453
+ continue
454
+
455
+ assert instance.node_id
456
+ ray_node = ray_nodes_by_ray_node_id.get(instance.node_id)
457
+ assert ray_node is not None and ray_node.status in [
458
+ NodeStatus.RUNNING,
459
+ NodeStatus.IDLE,
460
+ ], (
461
+ "There should be a running ray node for instance with ray stop "
462
+ "requested failed."
463
+ )
464
+
465
+ updates[instance_id] = IMInstanceUpdateEvent(
466
+ instance_id=instance_id,
467
+ new_instance_status=IMInstance.RAY_RUNNING,
468
+ details="failed to stop/drain ray",
469
+ ray_node_id=instance.node_id,
470
+ )
471
+
472
+ Reconciler._update_instance_manager(instance_manager, version, updates)
473
+
474
+ @staticmethod
475
+ def _handle_ray_install_failed(
476
+ instance_manager: InstanceManager, ray_install_errors: List[RayInstallError]
477
+ ):
478
+
479
+ instances, version = Reconciler._get_im_instances(instance_manager)
480
+ updates = {}
481
+
482
+ # Get all instances with RAY_INSTALLING status.
483
+ instances_with_ray_installing = {
484
+ instance.instance_id: instance
485
+ for instance in instances
486
+ if instance.status == IMInstance.RAY_INSTALLING
487
+ }
488
+
489
+ install_errors = {error.im_instance_id: error for error in ray_install_errors}
490
+
491
+ # For each instance with RAY_INSTALLING status, check if there's any
492
+ # install error.
493
+ for instance_id, instance in instances_with_ray_installing.items():
494
+ install_error = install_errors.get(instance_id)
495
+ if install_error:
496
+ updates[instance_id] = IMInstanceUpdateEvent(
497
+ instance_id=instance_id,
498
+ new_instance_status=IMInstance.RAY_INSTALL_FAILED,
499
+ details=(
500
+ f"failed to install ray with errors: {install_error.details}"
501
+ ),
502
+ )
503
+
504
+ # Update the instance manager for the events.
505
+ Reconciler._update_instance_manager(instance_manager, version, updates)
506
+
507
+ @staticmethod
508
+ def _handle_cloud_instance_terminated(
509
+ instance_manager: InstanceManager,
510
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
511
+ ):
512
+ """
513
+ For any IM (instance manager) instance with a cloud node id, if the mapped
514
+ cloud instance is no longer running, transition the instance to TERMINATED.
515
+
516
+ Args:
517
+ instance_manager: The instance manager to reconcile.
518
+ non_terminated_cloud_instances: The non-terminated cloud instances from
519
+ the cloud provider.
520
+ """
521
+ updates = {}
522
+ instances, version = Reconciler._get_im_instances(instance_manager)
523
+
524
+ non_terminated_instances_with_cloud_instance_assigned = {
525
+ instance.cloud_instance_id: instance
526
+ for instance in instances
527
+ if instance.cloud_instance_id and instance.status != IMInstance.TERMINATED
528
+ }
529
+
530
+ for (
531
+ cloud_instance_id,
532
+ instance,
533
+ ) in non_terminated_instances_with_cloud_instance_assigned.items():
534
+ if cloud_instance_id in non_terminated_cloud_instances.keys():
535
+ # The cloud instance is still running.
536
+ continue
537
+
538
+ # The cloud instance is terminated.
539
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
540
+ instance_id=instance.instance_id,
541
+ new_instance_status=IMInstance.TERMINATED,
542
+ details=f"cloud instance {cloud_instance_id} no longer found",
543
+ )
544
+
545
+ Reconciler._update_instance_manager(instance_manager, version, updates)
546
+
547
+ @staticmethod
548
+ def _handle_cloud_instance_termination_errors(
549
+ instance_manager: InstanceManager,
550
+ cloud_provider_errors: List[CloudInstanceProviderError],
551
+ ):
552
+ """
553
+ If any TERMINATING instances have termination errors, transition the instance to
554
+ TERMINATION_FAILED.
555
+
556
+ We will retry the termination for the TERMINATION_FAILED instances in the next
557
+ reconciler step.
558
+
559
+ Args:
560
+ instance_manager: The instance manager to reconcile.
561
+ cloud_provider_errors: The errors from the cloud provider.
562
+
563
+ """
564
+ instances, version = Reconciler._get_im_instances(instance_manager)
565
+ updates = {}
566
+
567
+ termination_errors = {
568
+ error.cloud_instance_id: error
569
+ for error in cloud_provider_errors
570
+ if isinstance(error, TerminateNodeError)
571
+ }
572
+
573
+ terminating_instances_by_cloud_instance_id = {
574
+ instance.cloud_instance_id: instance
575
+ for instance in instances
576
+ if instance.status == IMInstance.TERMINATING
577
+ }
578
+
579
+ for cloud_instance_id, failure in termination_errors.items():
580
+ instance = terminating_instances_by_cloud_instance_id.get(cloud_instance_id)
581
+ if not instance:
582
+ # The instance is no longer in TERMINATING status.
583
+ continue
584
+
585
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
586
+ instance_id=instance.instance_id,
587
+ new_instance_status=IMInstance.TERMINATION_FAILED,
588
+ details=f"termination failed: {str(failure)}",
589
+ )
590
+
591
+ Reconciler._update_instance_manager(instance_manager, version, updates)
592
+
593
+ @staticmethod
594
+ def _get_im_instances(
595
+ instance_manager: InstanceManager,
596
+ ) -> Tuple[List[IMInstance], int]:
597
+ reply = instance_manager.get_instance_manager_state(
598
+ request=GetInstanceManagerStateRequest()
599
+ )
600
+ assert reply.status.code == StatusCode.OK
601
+ im_state = reply.state
602
+ return im_state.instances, im_state.version
603
+
604
+ @staticmethod
605
+ def _update_instance_manager(
606
+ instance_manager: InstanceManager,
607
+ version: int,
608
+ updates: Dict[str, IMInstanceUpdateEvent],
609
+ ) -> None:
610
+ if not updates:
611
+ return
612
+
613
+ updates = list(updates.values()) or []
614
+
615
+ reply = instance_manager.update_instance_manager_state(
616
+ request=UpdateInstanceManagerStateRequest(
617
+ expected_version=version,
618
+ updates=updates,
619
+ )
620
+ )
621
+ # TODO: While it's possible that a version mismatch
622
+ # happens, or some other failures could happen. But given
623
+ # the current implementation:
624
+ # 1. There's only 1 writer (the reconciler) for updating the instance
625
+ # manager states, so there shouldn't be version mismatch.
626
+ # 2. Any failures in one reconciler step should be caught at a higher
627
+ # level and be retried in the next reconciler step. If the IM
628
+ # fails to be updated, we don't have sufficient info to handle it
629
+ # here.
630
+ assert (
631
+ reply.status.code == StatusCode.OK
632
+ ), f"Failed to update instance manager: {reply}"
633
+
634
    @staticmethod
    def _handle_ray_status_transition(
        instance_manager: InstanceManager,
        ray_nodes: List[NodeState],
        autoscaling_config: AutoscalingConfig,
    ):
        """
        Handle the ray status transition for the instance manager.

        If a new ray node running on the instance, transition it to RAY_RUNNING.
        If a ray node stopped, transition it to RAY_STOPPED.
        If a ray node is draining, transition it to RAY_STOPPING.

        Args:
            instance_manager: The instance manager to reconcile.
            ray_nodes: The ray cluster's states of ray nodes.
            autoscaling_config: The autoscaling config; used here to
                special-case the READ_ONLY provider, whose ray nodes carry
                no cloud instance id.
        """
        instances, version = Reconciler._get_im_instances(instance_manager)
        updates = {}

        # Index IM instances by their cloud instance id (instances without one
        # haven't been allocated yet and can't match a ray node).
        im_instances_by_cloud_instance_id = {
            i.cloud_instance_id: i for i in instances if i.cloud_instance_id
        }
        ray_nodes_by_cloud_instance_id = {}
        for n in ray_nodes:
            if n.instance_id:
                ray_nodes_by_cloud_instance_id[n.instance_id] = n
            else:
                if autoscaling_config.provider == Provider.READ_ONLY:
                    # We will use the node id as the cloud instance id for read-only
                    # provider.
                    ray_nodes_by_cloud_instance_id[binary_to_hex(n.node_id)] = n
                else:
                    # This should only happen to a ray node that's not managed by us.
                    logger.warning(
                        f"Ray node {binary_to_hex(n.node_id)} has no instance id. "
                        "This only happens to a ray node not managed by autoscaler. "
                        "If not, please file a bug at "
                        "https://github.com/ray-project/ray"
                    )

        for cloud_instance_id, ray_node in ray_nodes_by_cloud_instance_id.items():
            # By this point in the reconcile loop, every cloud instance (and
            # every ray node with a cloud instance id) should already have a
            # matching IM instance.
            assert cloud_instance_id in im_instances_by_cloud_instance_id, (
                f"Ray node {binary_to_hex(ray_node.node_id)} has no matching "
                f"instance with cloud instance id={cloud_instance_id}. We should "
                "not see a ray node with cloud instance id not found in IM since "
                "we have reconciled all cloud instances, and ray nodes by now."
            )

            im_instance = im_instances_by_cloud_instance_id[cloud_instance_id]
            reconciled_im_status = Reconciler._reconciled_im_status_from_ray_status(
                ray_node.status, im_instance.status
            )

            # Only emit an update when the reconciled status actually differs;
            # otherwise the instance already agrees with ray's view.
            if reconciled_im_status != im_instance.status:
                updates[im_instance.instance_id] = IMInstanceUpdateEvent(
                    instance_id=im_instance.instance_id,
                    new_instance_status=reconciled_im_status,
                    details=(
                        f"ray node {binary_to_hex(ray_node.node_id)} is "
                        f"{NodeStatus.Name(ray_node.status)}"
                    ),
                    ray_node_id=binary_to_hex(ray_node.node_id),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
700
+
701
+ @staticmethod
702
+ def _reconciled_im_status_from_ray_status(
703
+ ray_status: NodeStatus, cur_im_status: IMInstance.InstanceStatus
704
+ ) -> "IMInstance.InstanceStatus":
705
+ """
706
+ Reconcile the instance status from the ray node status.
707
+ Args:
708
+ ray_status: the current ray node status.
709
+ cur_im_status: the current IM instance status.
710
+ Returns:
711
+ The reconciled IM instance status
712
+
713
+ Raises:
714
+ ValueError: If the ray status is unknown.
715
+ """
716
+ reconciled_im_status = None
717
+ if ray_status in [NodeStatus.RUNNING, NodeStatus.IDLE]:
718
+ reconciled_im_status = IMInstance.RAY_RUNNING
719
+ elif ray_status == NodeStatus.DEAD:
720
+ reconciled_im_status = IMInstance.RAY_STOPPED
721
+ elif ray_status == NodeStatus.DRAINING:
722
+ reconciled_im_status = IMInstance.RAY_STOPPING
723
+ else:
724
+ raise ValueError(f"Unknown ray status: {ray_status}")
725
+
726
+ if (
727
+ cur_im_status == reconciled_im_status
728
+ or cur_im_status
729
+ in InstanceUtil.get_reachable_statuses(reconciled_im_status)
730
+ ):
731
+ # No need to reconcile if the instance is already in the reconciled status
732
+ # or has already transitioned beyond it.
733
+ return cur_im_status
734
+
735
+ return reconciled_im_status
736
+
737
+ @staticmethod
738
+ def _handle_instances_launch(
739
+ instance_manager: InstanceManager, autoscaling_config: AutoscalingConfig
740
+ ):
741
+
742
+ instances, version = Reconciler._get_im_instances(instance_manager)
743
+
744
+ queued_instances = []
745
+ requested_instances = []
746
+ allocated_instances = []
747
+
748
+ for instance in instances:
749
+ if instance.status == IMInstance.QUEUED:
750
+ queued_instances.append(instance)
751
+ elif instance.status == IMInstance.REQUESTED:
752
+ requested_instances.append(instance)
753
+ elif instance.cloud_instance_id:
754
+ allocated_instances.append(instance)
755
+
756
+ if not queued_instances:
757
+ # No QUEUED instances
758
+ return
759
+
760
+ to_launch = Reconciler._compute_to_launch(
761
+ queued_instances,
762
+ requested_instances,
763
+ allocated_instances,
764
+ autoscaling_config.get_upscaling_speed(),
765
+ autoscaling_config.get_max_concurrent_launches(),
766
+ )
767
+
768
+ # Transition the instances to REQUESTED for instance launcher to
769
+ # launch them.
770
+ updates = {}
771
+ new_launch_request_id = str(uuid.uuid4())
772
+ for instance_type, instances in to_launch.items():
773
+ for instance in instances:
774
+ # Reuse launch request id for any QUEUED instances that have been
775
+ # requested before due to retry.
776
+ launch_request_id = (
777
+ new_launch_request_id
778
+ if len(instance.launch_request_id) == 0
779
+ else instance.launch_request_id
780
+ )
781
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
782
+ instance_id=instance.instance_id,
783
+ new_instance_status=IMInstance.REQUESTED,
784
+ launch_request_id=launch_request_id,
785
+ instance_type=instance_type,
786
+ details=(
787
+ f"requested to launch {instance_type} with request id "
788
+ f"{launch_request_id}"
789
+ ),
790
+ )
791
+
792
+ Reconciler._update_instance_manager(instance_manager, version, updates)
793
+
794
+ @staticmethod
795
+ def _compute_to_launch(
796
+ queued_instances: List[IMInstance],
797
+ requested_instances: List[IMInstance],
798
+ allocated_instances: List[IMInstance],
799
+ upscaling_speed: float,
800
+ max_concurrent_launches: int,
801
+ ) -> Dict[NodeType, List[IMInstance]]:
802
+ def _group_by_type(instances):
803
+ instances_by_type = defaultdict(list)
804
+ for instance in instances:
805
+ instances_by_type[instance.instance_type].append(instance)
806
+ return instances_by_type
807
+
808
+ # Sort the instances by the time they were queued.
809
+ def _sort_by_earliest_queued(instance: IMInstance) -> List[int]:
810
+ queue_times = InstanceUtil.get_status_transition_times_ns(
811
+ instance, IMInstance.QUEUED
812
+ )
813
+ return sorted(queue_times)
814
+
815
+ queued_instances_by_type = _group_by_type(queued_instances)
816
+ requested_instances_by_type = _group_by_type(requested_instances)
817
+ allocated_instances_by_type = _group_by_type(allocated_instances)
818
+
819
+ total_num_requested_to_launch = len(requested_instances)
820
+ all_to_launch: Dict[NodeType : List[IMInstance]] = defaultdict(list)
821
+
822
+ for (
823
+ instance_type,
824
+ queued_instances_for_type,
825
+ ) in queued_instances_by_type.items():
826
+ requested_instances_for_type = requested_instances_by_type.get(
827
+ instance_type, []
828
+ )
829
+ allocated_instances_for_type = allocated_instances_by_type.get(
830
+ instance_type, []
831
+ )
832
+
833
+ num_desired_to_upscale = max(
834
+ 1,
835
+ math.ceil(
836
+ upscaling_speed
837
+ * (
838
+ len(requested_instances_for_type)
839
+ + len(allocated_instances_for_type)
840
+ )
841
+ ),
842
+ )
843
+
844
+ # Enforce global limit, at most we can launch `max_concurrent_launches`
845
+ num_to_launch = min(
846
+ max_concurrent_launches - total_num_requested_to_launch,
847
+ num_desired_to_upscale,
848
+ )
849
+
850
+ # Cap both ends 0 <= num_to_launch <= num_queued
851
+ num_to_launch = max(0, num_to_launch)
852
+ num_to_launch = min(len(queued_instances_for_type), num_to_launch)
853
+
854
+ to_launch = sorted(queued_instances_for_type, key=_sort_by_earliest_queued)[
855
+ :num_to_launch
856
+ ]
857
+
858
+ all_to_launch[instance_type].extend(to_launch)
859
+ total_num_requested_to_launch += num_to_launch
860
+
861
+ return all_to_launch
862
+
863
    @staticmethod
    def _handle_stuck_instances(
        instance_manager: InstanceManager,
        reconcile_config: InstanceReconcileConfig,
        _logger: logging.Logger,
    ):
        """
        Handle stuck instances with timeouts.

        Instances could be stuck in the following status and needs to be updated:
        - REQUESTED: cloud provider is slow/fails to launch instances.
        - ALLOCATED: ray fails to be started on the instance.
        - RAY_INSTALLING: ray fails to be installed on the instance.
        - TERMINATING: cloud provider is slow/fails to terminate instances.

        Instances could be in the following status which could be unbounded or
        transient, and we don't have a timeout mechanism to handle them. We would
        warn if they are stuck for too long:
        - RAY_STOPPING: ray taking time to drain.
        - QUEUED: cloud provider is slow to launch instances, resulting in long
          queue.

        Reconciler should handle below statuses, if not, could be slow
        reconcilation loop or a bug:
        - RAY_INSTALL_FAILED
        - RAY_STOPPED
        - TERMINATION_FAILED

        Args:
            instance_manager: The instance manager to reconcile.
            reconcile_config: The instance reconcile config.
            _logger: The logger to log the warning messages. It's used for testing.
        """
        instances, version = Reconciler._get_im_instances(instance_manager)

        # Bucket instances so each status is handled in one pass below.
        instances_by_status = defaultdict(list)
        for instance in instances:
            instances_by_status[instance.status].append(instance)

        im_updates = {}

        # Fail or retry the cloud instance allocation if it's stuck
        # in the REQUESTED state.
        for instance in instances_by_status[IMInstance.REQUESTED]:
            update = Reconciler._handle_stuck_requested_instance(
                instance,
                reconcile_config.request_status_timeout_s,
                reconcile_config.max_num_retry_request_to_allocate,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Leaked ALLOCATED instances should be terminated.
        # This usually happens when ray fails to be started on the instance, so
        # it's unable to be RAY_RUNNING after a long time.
        for instance in instances_by_status[IMInstance.ALLOCATED]:
            assert (
                instance.cloud_instance_id
            ), "cloud instance id should be set on ALLOCATED instance"
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.allocate_status_timeout_s,
                new_status=IMInstance.TERMINATING,
                cloud_instance_id=instance.cloud_instance_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Fail the installation if it's stuck in RAY_INSTALLING for too long.
        # If RAY_INSTALLING is stuck for too long, it's likely that the instance
        # is not able to install ray, so we should also fail the installation.
        for instance in instances_by_status[IMInstance.RAY_INSTALLING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_install_status_timeout_s,
                new_status=IMInstance.RAY_INSTALL_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to terminate the instance, but it doesn't terminate (disappear
        # from the cloud provider) after a long time, we fail the termination.
        # This will trigger another attempt to terminate the instance.
        for instance in instances_by_status[IMInstance.TERMINATING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.terminating_status_timeout_s,
                new_status=IMInstance.TERMINATION_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to stop ray on the instance, but it doesn't stop after a long
        # time, we will transition it back to RAY_RUNNING as the stop/drain somehow
        # failed. If it had succeed, we should have transitioned it to RAY_STOPPING
        # or RAY_STOPPED.
        for instance in instances_by_status[IMInstance.RAY_STOP_REQUESTED]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_stop_requested_status_timeout_s,
                new_status=IMInstance.RAY_RUNNING,
                ray_node_id=instance.node_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # These statues could be unbounded or transient, and we don't have a timeout
        # mechanism to handle them. We only warn if they are stuck for too long.
        for status in [
            # Ray taking time to drain. We could also have a timeout when Drain protocol
            # supports timeout.
            IMInstance.RAY_STOPPING,
            # These should just be transient, we will terminate instances with this
            # status in the next reconciler step.
            IMInstance.RAY_INSTALL_FAILED,
            IMInstance.RAY_STOPPED,
            IMInstance.TERMINATION_FAILED,
            # Instances could be in the QUEUED status for a long time if the cloud
            # provider is slow to launch instances.
            IMInstance.QUEUED,
        ]:
            Reconciler._warn_stuck_instances(
                instances_by_status[status],
                status=status,
                warn_interval_s=reconcile_config.transient_status_warn_interval_s,
                logger=_logger,
            )

        Reconciler._update_instance_manager(instance_manager, version, im_updates)
994
+
995
+ @staticmethod
996
+ def _warn_stuck_instances(
997
+ instances: List[IMInstance],
998
+ status: IMInstance.InstanceStatus,
999
+ warn_interval_s: int,
1000
+ logger: logging.Logger,
1001
+ ):
1002
+ """Warn if any instance is stuck in a transient/unbounded status for too
1003
+ long.
1004
+ """
1005
+ for instance in instances:
1006
+ status_times_ns = InstanceUtil.get_status_transition_times_ns(
1007
+ instance, select_instance_status=status
1008
+ )
1009
+ assert len(status_times_ns) >= 1
1010
+ status_time_ns = sorted(status_times_ns)[-1]
1011
+
1012
+ if time.time_ns() - status_time_ns > warn_interval_s * 1e9:
1013
+ logger.warning(
1014
+ "Instance {}({}) is stuck in {} for {} seconds.".format(
1015
+ instance.instance_id,
1016
+ IMInstance.InstanceStatus.Name(instance.status),
1017
+ IMInstance.InstanceStatus.Name(status),
1018
+ (time.time_ns() - status_time_ns) // 1e9,
1019
+ )
1020
+ )
1021
+
1022
+ @staticmethod
1023
+ def _is_head_node_running(instance_manager: InstanceManager) -> bool:
1024
+ """
1025
+ Check if the head node is running and ready.
1026
+
1027
+ If we scale up the cluster before head node is running,
1028
+ it would cause issues when launching the worker nodes.
1029
+
1030
+ There are corner cases when the GCS is up (so the ray cluster resource
1031
+ state is retrievable from the GCS), but the head node's raylet is not
1032
+ running so the head node is missing from the reported nodes. This happens
1033
+ when the head node is still starting up, or the raylet is not running
1034
+ due to some issues, and this would yield false.
1035
+
1036
+ Args:
1037
+ instance_manager: The instance manager to reconcile.
1038
+
1039
+ Returns:
1040
+ True if the head node is running and ready, False otherwise.
1041
+ """
1042
+
1043
+ im_instances, _ = Reconciler._get_im_instances(instance_manager)
1044
+
1045
+ for instance in im_instances:
1046
+ if instance.node_kind == NodeKind.HEAD:
1047
+ if instance.status == IMInstance.RAY_RUNNING:
1048
+ return True
1049
+ return False
1050
+
1051
    @staticmethod
    def _scale_cluster(
        autoscaling_state: AutoscalingState,
        instance_manager: InstanceManager,
        ray_state: ClusterResourceState,
        scheduler: IResourceScheduler,
        autoscaling_config: AutoscalingConfig,
    ) -> None:
        """
        Scale the cluster based on the resource state and the resource scheduler's
        decision:

        - It launches new instances if needed.
        - It terminates extra ray nodes if they should be shut down (preemption
          or idle termination)

        Args:
            autoscaling_state: The autoscaling state to reconcile (mutated in
                place with infeasible requests/constraints).
            instance_manager: The instance manager to reconcile.
            ray_state: The ray cluster's resource state.
            scheduler: The resource scheduler to make scaling decisions.
            autoscaling_config: The autoscaling config.
        """

        # Get the current instance states.
        im_instances, version = Reconciler._get_im_instances(instance_manager)

        autoscaler_instances = []
        # Index ray nodes by their hex node id so each IM instance can be
        # paired with its ray-side view (if any).
        ray_nodes_by_id = {
            binary_to_hex(node.node_id): node for node in ray_state.node_states
        }

        for im_instance in im_instances:
            ray_node = ray_nodes_by_id.get(im_instance.node_id)
            autoscaler_instances.append(
                AutoscalerInstance(
                    ray_node=ray_node,
                    im_instance=im_instance,
                    cloud_instance_id=(
                        im_instance.cloud_instance_id
                        if im_instance.cloud_instance_id
                        else None
                    ),
                )
            )

        # TODO(rickyx): We should probably name it as "Planner" or "Scaler"
        # or "ClusterScaler"
        sched_request = SchedulingRequest(
            node_type_configs=autoscaling_config.get_node_type_configs(),
            max_num_nodes=autoscaling_config.get_max_num_nodes(),
            resource_requests=ray_state.pending_resource_requests,
            gang_resource_requests=ray_state.pending_gang_resource_requests,
            cluster_resource_constraints=ray_state.cluster_resource_constraints,
            current_instances=autoscaler_instances,
            idle_timeout_s=autoscaling_config.get_idle_timeout_s(),
            disable_launch_config_check=(
                autoscaling_config.disable_launch_config_check()
            ),
        )

        # Ask scheduler for updates to the cluster shape.
        reply = scheduler.schedule(sched_request)

        # Populate the autoscaling state. This is done even when we return
        # early below, so infeasibility is always reported.
        autoscaling_state.infeasible_resource_requests.extend(
            reply.infeasible_resource_requests
        )
        autoscaling_state.infeasible_gang_resource_requests.extend(
            reply.infeasible_gang_resource_requests
        )
        autoscaling_state.infeasible_cluster_resource_constraints.extend(
            reply.infeasible_cluster_resource_constraints
        )

        if not Reconciler._is_head_node_running(instance_manager):
            # We shouldn't be scaling the cluster until the head node is ready.
            # This could happen when the head node (i.e. the raylet) is still
            # pending registration even though GCS is up.
            # We will wait until the head node is running and ready to avoid
            # scaling the cluster from min worker nodes constraint.
            return

        if autoscaling_config.provider == Provider.READ_ONLY:
            # We shouldn't be scaling the cluster if the provider is read-only.
            return

        # Scale the clusters if needed.
        to_launch = reply.to_launch
        to_terminate = reply.to_terminate
        updates = {}
        # Add terminating instances.
        for terminate_request in to_terminate:
            instance_id = terminate_request.instance_id
            updates[terminate_request.instance_id] = IMInstanceUpdateEvent(
                instance_id=instance_id,
                new_instance_status=IMInstance.RAY_STOP_REQUESTED,
                termination_request=terminate_request,
                details=f"draining ray: {terminate_request.details}",
            )

        # Add new instances.
        for launch_request in to_launch:
            for _ in range(launch_request.count):
                instance_id = InstanceUtil.random_instance_id()
                updates[instance_id] = IMInstanceUpdateEvent(
                    instance_id=instance_id,
                    new_instance_status=IMInstance.QUEUED,
                    instance_type=launch_request.instance_type,
                    upsert=True,
                    details=(
                        f"queuing new instance of {launch_request.instance_type} "
                        "from scheduler"
                    ),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
1169
+
1170
+ @staticmethod
1171
+ def _terminate_instances(instance_manager: InstanceManager):
1172
+ """
1173
+ Terminate instances with the below statuses:
1174
+ - RAY_STOPPED: ray was stopped on the cloud instance.
1175
+ - RAY_INSTALL_FAILED: ray installation failed on the cloud instance,
1176
+ we will not retry.
1177
+ - TERMINATION_FAILED: cloud provider failed to terminate the instance
1178
+ or timeout for termination happened, we will retry again.
1179
+
1180
+ Args:
1181
+ instance_manager: The instance manager to reconcile.
1182
+ """
1183
+
1184
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
1185
+ updates = {}
1186
+ for instance in im_instances:
1187
+ if instance.status not in [
1188
+ IMInstance.RAY_STOPPED,
1189
+ IMInstance.RAY_INSTALL_FAILED,
1190
+ IMInstance.TERMINATION_FAILED,
1191
+ ]:
1192
+ continue
1193
+
1194
+ # Terminate the instance.
1195
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
1196
+ instance_id=instance.instance_id,
1197
+ new_instance_status=IMInstance.TERMINATING,
1198
+ cloud_instance_id=instance.cloud_instance_id,
1199
+ details="terminating instance from "
1200
+ f"{IMInstance.InstanceStatus.Name(instance.status)}",
1201
+ )
1202
+
1203
+ Reconciler._update_instance_manager(instance_manager, version, updates)
1204
+
1205
+ @staticmethod
1206
+ def _install_ray(
1207
+ instance_manager: InstanceManager,
1208
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
1209
+ ) -> None:
1210
+ """
1211
+ Install ray on the allocated instances when it's ready (cloud instance
1212
+ should be running)
1213
+
1214
+ This is needed if ray installation needs to be performed by
1215
+ the instance manager.
1216
+
1217
+ Args:
1218
+ instance_manager: The instance manager to reconcile.
1219
+ """
1220
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
1221
+ updates = {}
1222
+ for instance in im_instances:
1223
+ if instance.status != IMInstance.ALLOCATED:
1224
+ continue
1225
+
1226
+ if instance.node_kind == NodeKind.HEAD:
1227
+ # Skip head node.
1228
+ continue
1229
+
1230
+ cloud_instance = non_terminated_cloud_instances.get(
1231
+ instance.cloud_instance_id
1232
+ )
1233
+
1234
+ assert cloud_instance, (
1235
+ f"Cloud instance {instance.cloud_instance_id} is not found "
1236
+ "in non_terminated_cloud_instances."
1237
+ )
1238
+
1239
+ if not cloud_instance.is_running:
1240
+ # It might still be pending (e.g. setting up ssh)
1241
+ continue
1242
+
1243
+ # Install ray on the running cloud instance
1244
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
1245
+ instance_id=instance.instance_id,
1246
+ new_instance_status=IMInstance.RAY_INSTALLING,
1247
+ details="installing ray",
1248
+ )
1249
+
1250
+ Reconciler._update_instance_manager(instance_manager, version, updates)
1251
+
1252
+ @staticmethod
1253
+ def _fill_autoscaling_state(
1254
+ instance_manager: InstanceManager,
1255
+ autoscaling_state: AutoscalingState,
1256
+ ) -> None:
1257
+
1258
+ # Use the IM instance version for the autoscaler_state_version
1259
+ instances, version = Reconciler._get_im_instances(instance_manager)
1260
+ autoscaling_state.autoscaler_state_version = version
1261
+
1262
+ # Group instances by status
1263
+ instances_by_status = defaultdict(list)
1264
+ for instance in instances:
1265
+ instances_by_status[instance.status].append(instance)
1266
+
1267
+ # Pending instance requests
1268
+ instances_by_launch_request = defaultdict(list)
1269
+ queued_instances = []
1270
+ for instance in (
1271
+ instances_by_status[IMInstance.REQUESTED]
1272
+ + instances_by_status[IMInstance.QUEUED]
1273
+ ):
1274
+ if instance.launch_request_id:
1275
+ instances_by_launch_request[instance.launch_request_id].append(instance)
1276
+ else:
1277
+ queued_instances.append(instance)
1278
+
1279
+ for _, instances in instances_by_launch_request.items():
1280
+ num_instances_by_type = defaultdict(int)
1281
+ for instance in instances:
1282
+ num_instances_by_type[instance.instance_type] += 1
1283
+
1284
+ # All instances with same request id should have the same
1285
+ # request time.
1286
+ request_update = InstanceUtil.get_last_status_transition(
1287
+ instances[0], IMInstance.REQUESTED
1288
+ )
1289
+ request_time_ns = request_update.timestamp_ns if request_update else 0
1290
+
1291
+ for instance_type, count in num_instances_by_type.items():
1292
+ autoscaling_state.pending_instance_requests.append(
1293
+ PendingInstanceRequest(
1294
+ ray_node_type_name=instance_type,
1295
+ count=int(count),
1296
+ request_ts=int(request_time_ns // 1e9),
1297
+ )
1298
+ )
1299
+
1300
+ # Pending instances
1301
+ for instance in (
1302
+ instances_by_status[IMInstance.ALLOCATED]
1303
+ + instances_by_status[IMInstance.RAY_INSTALLING]
1304
+ ):
1305
+
1306
+ status_history = sorted(
1307
+ instance.status_history, key=lambda x: x.timestamp_ns, reverse=True
1308
+ )
1309
+ autoscaling_state.pending_instances.append(
1310
+ PendingInstance(
1311
+ instance_id=instance.instance_id,
1312
+ ray_node_type_name=instance.instance_type,
1313
+ details=status_history[0].details,
1314
+ )
1315
+ )
1316
+
1317
+ # Failed instance requests
1318
+ for instance in instances_by_status[IMInstance.ALLOCATION_FAILED]:
1319
+ request_status_update = InstanceUtil.get_last_status_transition(
1320
+ instance, IMInstance.REQUESTED
1321
+ )
1322
+ failed_status_update = InstanceUtil.get_last_status_transition(
1323
+ instance, IMInstance.ALLOCATION_FAILED
1324
+ )
1325
+ failed_time = (
1326
+ failed_status_update.timestamp_ns if failed_status_update else 0
1327
+ )
1328
+ request_time = (
1329
+ request_status_update.timestamp_ns if request_status_update else 0
1330
+ )
1331
+ autoscaling_state.failed_instance_requests.append(
1332
+ FailedInstanceRequest(
1333
+ ray_node_type_name=instance.instance_type,
1334
+ start_ts=int(request_time // 1e9),
1335
+ failed_ts=int(
1336
+ failed_time // 1e9,
1337
+ ),
1338
+ reason=failed_status_update.details,
1339
+ count=1,
1340
+ )
1341
+ )
1342
+
1343
+ @staticmethod
1344
+ def _handle_stuck_requested_instance(
1345
+ instance: IMInstance, timeout_s: int, max_num_retry_request_to_allocate: int
1346
+ ) -> Optional[IMInstanceUpdateEvent]:
1347
+ """
1348
+ Fail the cloud instance allocation if it's stuck in the REQUESTED state.
1349
+
1350
+ Args:
1351
+ instance: The instance to handle.
1352
+ timeout_s: The timeout in seconds.
1353
+ max_num_retry_request_to_allocate: The maximum number of times an instance
1354
+ could be requested to allocate.
1355
+
1356
+ Returns:
1357
+ Instance update to ALLOCATION_FAILED: if the instance allocation failed
1358
+ with errors.
1359
+ None: if there's no update.
1360
+
1361
+ """
1362
+ if not InstanceUtil.has_timeout(instance, timeout_s):
1363
+ # Not timeout yet, be patient.
1364
+ return None
1365
+
1366
+ all_request_times_ns = sorted(
1367
+ InstanceUtil.get_status_transition_times_ns(
1368
+ instance, select_instance_status=IMInstance.REQUESTED
1369
+ )
1370
+ )
1371
+
1372
+ # Fail the allocation if we have tried too many times.
1373
+ if len(all_request_times_ns) > max_num_retry_request_to_allocate:
1374
+ return IMInstanceUpdateEvent(
1375
+ instance_id=instance.instance_id,
1376
+ new_instance_status=IMInstance.ALLOCATION_FAILED,
1377
+ details=(
1378
+ "failed to allocate cloud instance after "
1379
+ f"{len(all_request_times_ns)} attempts > "
1380
+ f"max_num_retry_request_to_allocate={max_num_retry_request_to_allocate}" # noqa
1381
+ ),
1382
+ )
1383
+
1384
+ # Retry the allocation if we could by transitioning to QUEUED again.
1385
+ return IMInstanceUpdateEvent(
1386
+ instance_id=instance.instance_id,
1387
+ new_instance_status=IMInstance.QUEUED,
1388
+ details=f"queue again to launch after timeout={timeout_s}s",
1389
+ )
1390
+
1391
+ @staticmethod
1392
+ def _handle_stuck_instance(
1393
+ instance: IMInstance,
1394
+ timeout_s: int,
1395
+ new_status: IMInstance.InstanceStatus,
1396
+ **update_kwargs: Dict,
1397
+ ) -> Optional[IMInstanceUpdateEvent]:
1398
+ """
1399
+ Fail the instance if it's stuck in the status for too long.
1400
+
1401
+ Args:
1402
+ instance: The instance to handle.
1403
+ timeout_s: The timeout in seconds.
1404
+ new_status: The new status to transition to.
1405
+ update_kwargs: The update kwargs for InstanceUpdateEvent
1406
+
1407
+ Returns:
1408
+ Instance update to the new status: if the instance is stuck in the status
1409
+ for too long.
1410
+ None: if there's no update.
1411
+
1412
+ """
1413
+ if not InstanceUtil.has_timeout(instance, timeout_s):
1414
+ # Not timeout yet, be patient.
1415
+ return None
1416
+
1417
+ return IMInstanceUpdateEvent(
1418
+ instance_id=instance.instance_id,
1419
+ new_instance_status=new_status,
1420
+ details=f"timeout={timeout_s}s at status "
1421
+ f"{IMInstance.InstanceStatus.Name(instance.status)}",
1422
+ **update_kwargs,
1423
+ )
1424
+
1425
+ @staticmethod
1426
+ def _handle_extra_cloud_instances(
1427
+ instance_manager: InstanceManager,
1428
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
1429
+ ray_nodes: List[NodeState],
1430
+ ):
1431
+ """
1432
+ For extra cloud instances (i.e. cloud instances that are non terminated as
1433
+ returned by cloud provider, but not managed by the instance manager), we
1434
+ will create new IM instances with ALLOCATED status.
1435
+
1436
+ Such instances could either be:
1437
+ 1. Leaked instances that are incorrectly started by the cloud instance
1438
+ provider, and they would be terminated eventually if they fail to
1439
+ transition to RAY_RUNNING by stuck instances reconciliation, or they
1440
+ would join the ray cluster and be terminated when the cluster scales down.
1441
+ 2. Instances that are started by the cloud instance provider intentionally
1442
+ but not yet discovered by the instance manager. This could happen for
1443
+ a. Head node that's started before the autoscaler.
1444
+ b. Worker nodes that's started by the cloud provider upon users'
1445
+ actions: i.e. KubeRay scaling up the cluster with ray cluster config
1446
+ change.
1447
+ 3. Ray nodes with cloud instance id not in the cloud provider. This could
1448
+ happen if there's delay in the Ray's state (i.e. cloud instance already
1449
+ terminated, but the ray node is still not dead yet).
1450
+
1451
+ Args:
1452
+ instance_manager: The instance manager to reconcile.
1453
+ non_terminated_cloud_instances: The non-terminated cloud instances from
1454
+ the cloud provider.
1455
+ ray_nodes: The ray cluster's states of ray nodes.
1456
+ """
1457
+ Reconciler._handle_extra_cloud_instances_from_cloud_provider(
1458
+ instance_manager, non_terminated_cloud_instances
1459
+ )
1460
+ Reconciler._handle_extra_cloud_instances_from_ray_nodes(
1461
+ instance_manager, ray_nodes
1462
+ )
1463
+
1464
    @staticmethod
    def _handle_extra_cloud_instances_from_cloud_provider(
        instance_manager: InstanceManager,
        non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
    ):
        """
        For extra cloud instances that are not managed by the instance manager but
        are running in the cloud provider, we will create new IM instances with
        ALLOCATED status.

        Args:
            instance_manager: The instance manager to reconcile.
            non_terminated_cloud_instances: The non-terminated cloud instances from
                the cloud provider.
        """
        updates = {}

        instances, version = Reconciler._get_im_instances(instance_manager)
        # Cloud instance ids the IM already knows about.
        cloud_instance_ids_managed_by_im = {
            instance.cloud_instance_id
            for instance in instances
            if instance.cloud_instance_id
        }

        # Find the extra cloud instances that are not managed by the instance manager.
        for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items():
            if cloud_instance_id in cloud_instance_ids_managed_by_im:
                continue
            # Adopt the instance: upsert a brand-new IM instance in ALLOCATED
            # status, keyed by a freshly generated instance id.
            updates[cloud_instance_id] = IMInstanceUpdateEvent(
                instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
                cloud_instance_id=cloud_instance_id,
                new_instance_status=IMInstance.ALLOCATED,
                node_kind=cloud_instance.node_kind,
                instance_type=cloud_instance.node_type,
                details=(
                    "allocated unmanaged cloud instance :"
                    f"{cloud_instance.cloud_instance_id} "
                    f"({NodeKind.Name(cloud_instance.node_kind)}) from cloud provider"
                ),
                upsert=True,
            )
        Reconciler._update_instance_manager(instance_manager, version, updates)
1506
+
1507
    @staticmethod
    def _handle_extra_cloud_instances_from_ray_nodes(
        instance_manager: InstanceManager, ray_nodes: List[NodeState]
    ):
        """
        For extra cloud instances reported by Ray but not managed by the instance
        manager, we will create new IM instances with ALLOCATED status.

        Args:
            instance_manager: The instance manager to reconcile.
            ray_nodes: The ray cluster's states of ray nodes.
        """
        updates = {}

        instances, version = Reconciler._get_im_instances(instance_manager)
        # Cloud instance ids the IM already knows about.
        cloud_instance_ids_managed_by_im = {
            instance.cloud_instance_id
            for instance in instances
            if instance.cloud_instance_id
        }

        for ray_node in ray_nodes:
            # Ray nodes without a cloud instance id are handled (or warned
            # about) elsewhere; nothing to adopt here.
            if not ray_node.instance_id:
                continue

            cloud_instance_id = ray_node.instance_id
            if cloud_instance_id in cloud_instance_ids_managed_by_im:
                continue

            # Adopt the instance: upsert a brand-new IM instance in ALLOCATED
            # status, keyed by a freshly generated instance id.
            is_head = is_head_node(ray_node)
            updates[cloud_instance_id] = IMInstanceUpdateEvent(
                instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
                cloud_instance_id=cloud_instance_id,
                new_instance_status=IMInstance.ALLOCATED,
                node_kind=NodeKind.HEAD if is_head else NodeKind.WORKER,
                instance_type=ray_node.ray_node_type_name,
                details=(
                    "allocated unmanaged worker cloud instance from ray node: "
                    f"{binary_to_hex(ray_node.node_id)}"
                ),
                upsert=True,
            )

        Reconciler._update_instance_manager(instance_manager, version, updates)
1551
+
1552
+ @staticmethod
1553
+ def _report_metrics(
1554
+ instance_manager: InstanceManager,
1555
+ autoscaling_config: AutoscalingConfig,
1556
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
1557
+ ):
1558
+ if not metrics_reporter:
1559
+ return
1560
+
1561
+ instances, _ = Reconciler._get_im_instances(instance_manager)
1562
+ node_type_configs = autoscaling_config.get_node_type_configs()
1563
+
1564
+ metrics_reporter.report_instances(instances, node_type_configs)
1565
+ metrics_reporter.report_resources(instances, node_type_configs)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from abc import ABCMeta, abstractmethod
3
+ from collections import defaultdict, namedtuple
4
+ from threading import Lock
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
# Result of a store mutation: `success` is whether the mutation was applied;
# `version` is the new storage version on success, or the current (unchanged)
# version on failure.
StoreStatus = namedtuple("StoreStatus", ["success", "version"])
# A stored value paired with the storage version at which it was last written.
VersionedValue = namedtuple("VersionedValue", ["value", "version"])
9
+
10
+
11
class Storage(metaclass=ABCMeta):
    """Interface for a storage backend that stores the state of nodes in the cluster.

    The storage is thread-safe.

    The storage is versioned, which means that each successful state change to the
    storage will bump the version number. The version number can be used to
    implement optimistic concurrency control.

    Each entry in the storage table is also versioned. The version number of an entry
    is the last version number when the entry is updated.
    """

    @abstractmethod
    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Batch update the storage table. This method is atomic.

        Args:
            table: The name of the table.
            mutation: A dictionary of key-value pairs to be updated.
            deletion: A list of keys to be deleted.
            expected_storage_version: The expected storage version. The
                update will fail if the version does not match the
                current storage version.

        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version).
                Otherwise, returns (False, current_version).
        """
        raise NotImplementedError("batch_update() has to be implemented")

    @abstractmethod
    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry in the storage table.

        Args:
            table: The name of the table.
            key: The key of the entry.
            value: The value of the entry.
            expected_entry_version: The expected version of the entry.
                The update will fail if the version does not match the current
                version of the entry.
            insert_only: If True, the update will
                fail if the entry already exists.

        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version). Otherwise,
                returns (False, current_version).
        """
        raise NotImplementedError("update() has to be implemented")

    @abstractmethod
    def get_all(self, table: str) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get all entries of a table.

        Args:
            table: The name of the table.

        Returns:
            A tuple of (entries, storage_version) where entries maps each key
            to its (value, entry_version) pair and storage_version is the
            current storage version.
        """
        raise NotImplementedError("get_all() has to be implemented")

    @abstractmethod
    def get(
        self, table: str, keys: List[str]
    ) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get a list of entries from the storage table.

        Args:
            table: The name of the table.
            keys: A list of keys to be retrieved. If the list is empty,
                all entries in the table will be returned.

        Returns:
            Tuple[Dict[str, VersionedValue], int]: A tuple of
                (entries, storage_version). The entries is a dictionary of
                (key, (value, entry_version)) pairs. The entry_version is the
                version of the entry when it was last updated. The
                storage_version is the current storage version.
        """
        raise NotImplementedError("get() has to be implemented")

    @abstractmethod
    def get_version(self) -> int:
        """Get the current storage version.

        Returns:
            int: The current storage version.
        """
        raise NotImplementedError("get_version() has to be implemented")
108
+
109
+
110
class InMemoryStorage(Storage):
    """An in-memory implementation of the Storage interface.

    This implementation is not durable: all state lives in process memory.
    Every operation is guarded by a single lock, making the class thread-safe.
    """

    def __init__(self):
        # Monotonically increasing storage version; bumped on every successful
        # mutation.
        self._version = 0
        # table name -> {key: VersionedValue(value, entry_version)}
        self._tables = defaultdict(dict)
        self._lock = Lock()

    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Atomically upsert `mutation` and remove `deletion` from a table.

        NOTE: the version-check parameter is named ``expected_storage_version``
        to match the ``Storage.batch_update`` interface (it was previously
        ``expected_version``, which broke keyword callers coded against the
        interface).

        Args:
            table: The name of the table.
            mutation: Key-value pairs to upsert.
            deletion: Keys to delete; missing keys are ignored.
            expected_storage_version: If set, the update fails unless it
                matches the current storage version.

        Returns:
            StoreStatus(True, new_version) on success, otherwise
            StoreStatus(False, current_version).
        """
        mutation = mutation or {}
        deletion = deletion or []
        with self._lock:
            if (
                expected_storage_version is not None
                and expected_storage_version != self._version
            ):
                return StoreStatus(False, self._version)
            self._version += 1
            self._tables[table].update(
                {
                    key: VersionedValue(value, self._version)
                    for key, value in mutation.items()
                }
            )
            for deleted_key in deletion:
                self._tables[table].pop(deleted_key, None)
            return StoreStatus(True, self._version)

    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        expected_storage_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry in a table.

        Args:
            table: The name of the table.
            key: The key of the entry.
            value: The value of the entry.
            expected_entry_version: If set, the update fails unless it matches
                the entry's current version (missing entries have version -1).
            expected_storage_version: If set, the update fails unless it
                matches the current storage version.
            insert_only: If True, the update fails if the entry already exists.

        Returns:
            StoreStatus(True, new_version) on success, otherwise
            StoreStatus(False, current_version).
        """
        with self._lock:
            if (
                expected_storage_version is not None
                and expected_storage_version != self._version
            ):
                return StoreStatus(False, self._version)
            if insert_only and key in self._tables[table]:
                return StoreStatus(False, self._version)
            # Missing entries are treated as version -1.
            _, entry_version = self._tables[table].get(key, (None, -1))
            if (
                expected_entry_version is not None
                and expected_entry_version != entry_version
            ):
                return StoreStatus(False, self._version)
            self._version += 1
            self._tables[table][key] = VersionedValue(value, self._version)
            return StoreStatus(True, self._version)

    def get_all(self, table: str) -> Tuple[Dict[str, VersionedValue], int]:
        """Return a deep copy of all entries plus the current storage version."""
        with self._lock:
            return (copy.deepcopy(self._tables[table]), self._version)

    def get(self, table: str, keys: List[str]) -> Tuple[Dict[str, VersionedValue], int]:
        """Return the requested entries plus the current storage version.

        An empty `keys` list returns the whole table. Missing keys are simply
        absent from the result.
        """
        if not keys:
            return self.get_all(table)
        with self._lock:
            entries = self._tables.get(table, {})
            # Deep-copy for consistency with get_all() so callers never hold
            # references into the internal table.
            result = {
                key: copy.deepcopy(entries[key]) for key in keys if key in entries
            }
            # Fixed: previously returned StoreStatus(result, version), misusing
            # the (success, version) namedtuple. Return a plain
            # (entries, version) tuple matching the declared interface and
            # get_all().
            return (result, self._version)

    def get_version(self) -> int:
        """Return the current storage version (single int read; no lock needed)."""
        return self._version
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from typing import Dict, List
3
+
4
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
5
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
6
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
7
+ from ray.autoscaler.v2.schema import NodeType
8
+ from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
9
+
10
+
11
class AutoscalerMetricsReporter:
    """Publishes autoscaler node and resource metrics to Prometheus.

    Aggregates the instance manager's view of the cluster into per-node-type
    node-count metrics and per-resource totals on the wrapped
    AutoscalerPrometheusMetrics object.
    """

    def __init__(self, prom_metrics: AutoscalerPrometheusMetrics) -> None:
        self._prom_metrics = prom_metrics

    @staticmethod
    def _empty_status_count() -> Dict[str, int]:
        """A zeroed status-count bucket for one node type."""
        return {"pending": 0, "running": 0, "terminating": 0, "terminated": 0}

    def report_instances(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_nodes: Nodes that are launching/pending ray start
            - active_nodes: Active nodes (nodes running ray)
            - recently_failed_nodes: Nodes that are being terminated.
            - stopped_nodes: Nodes that are terminated.
        """
        # Map of instance type to a dict of status to count.
        # Fixed: the annotation previously read `Dict[NodeType : Dict[str, int]]`
        # which is a slice expression, not a two-parameter generic.
        status_count_by_type: Dict[NodeType, Dict[str, int]] = {
            instance_type: self._empty_status_count()
            for instance_type in node_type_configs
        }

        for instance in instances:
            # Robustness: an instance whose type is absent from the current
            # config (e.g. after a config change) is still counted instead of
            # raising KeyError.
            status_count = status_count_by_type.setdefault(
                instance.instance_type, self._empty_status_count()
            )
            if InstanceUtil.is_ray_pending(instance.status):
                status_count["pending"] += 1
            elif InstanceUtil.is_ray_running(instance.status):
                status_count["running"] += 1
            elif instance.status == IMInstance.TERMINATING:
                status_count["terminating"] += 1
            elif instance.status == IMInstance.TERMINATED:
                status_count["terminated"] += 1

        for instance_type, status_count in status_count_by_type.items():
            self._prom_metrics.pending_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["pending"])

            self._prom_metrics.active_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["running"])

            self._prom_metrics.recently_failed_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["terminating"])

            self._prom_metrics.stopped_nodes.inc(status_count["terminated"])

    def report_resources(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_resources: Pending resources
            - cluster_resources: Cluster resources (resources running on the cluster)
        """
        pending_resources = defaultdict(float)
        cluster_resources = defaultdict(float)

        def _add_resources(resource_map, node_type, count):
            # Robustness: skip node types unknown to the current config rather
            # than raising KeyError.
            config = node_type_configs.get(node_type)
            if config is None:
                return
            for resource_name, resource_value in config.resources.items():
                resource_map[resource_name] += resource_value * count

        for instance in instances:
            if InstanceUtil.is_ray_pending(instance.status):
                _add_resources(pending_resources, instance.instance_type, 1)
            elif InstanceUtil.is_ray_running(instance.status):
                _add_resources(cluster_resources, instance.instance_type, 1)

        for resource_name, resource_value in pending_resources.items():
            self._prom_metrics.pending_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)

        for resource_name, resource_value in cluster_resources.items():
            self._prom_metrics.cluster_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Autoscaler monitoring loop daemon.
2
+
3
+ See autoscaler._private/monitor.py for the legacy implementation. All the legacy flags
4
+ are supported here, but the new implementation uses the new autoscaler v2.
5
+ """
6
+
7
+ import argparse
8
+ import logging
9
+ import os
10
+ import sys
11
+ import time
12
+ from typing import Optional
13
+
14
+ import ray
15
+ import ray._private.ray_constants as ray_constants
16
+ import ray._private.utils
17
+ from ray._private.event.event_logger import get_event_logger
18
+ from ray._private.ray_logging import setup_component_logger
19
+ from ray._private.usage.usage_lib import record_extra_usage_tag
20
+ from ray._private.worker import SCRIPT_MODE
21
+ from ray._raylet import GcsClient
22
+ from ray.autoscaler._private.constants import (
23
+ AUTOSCALER_METRIC_PORT,
24
+ AUTOSCALER_UPDATE_INTERVAL_S,
25
+ )
26
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
27
+ from ray.autoscaler.v2.autoscaler import Autoscaler
28
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
29
+ from ray.autoscaler.v2.instance_manager.config import (
30
+ FileConfigReader,
31
+ IConfigReader,
32
+ ReadOnlyProviderConfigReader,
33
+ )
34
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
35
+ from ray.core.generated.autoscaler_pb2 import AutoscalingState
36
+ from ray.core.generated.event_pb2 import Event as RayEvent
37
+ from ray.core.generated.usage_pb2 import TagKey
38
+
39
+ try:
40
+ import prometheus_client
41
+ except ImportError:
42
+ prometheus_client = None
43
+
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
class AutoscalerMonitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    TODO:
    We should also handle autoscaler failures properly in the future.
    Right now, we don't restart autoscaler if it fails (internal reconciliation
    however, should not fail the autoscaler process).
    With the Reconciler able to handle extra cloud instances, we could in fact
    recover the autoscaler process from reconciliation.
    """

    def __init__(
        self,
        address: str,
        config_reader: IConfigReader,
        log_dir: Optional[str] = None,
        monitor_ip: Optional[str] = None,
    ):
        """Connect to the GCS, set up metrics/event logging, build the Autoscaler.

        Args:
            address: The GCS address (ip:port).
            config_reader: Source of the autoscaling config.
            log_dir: Directory for structured event logs; if unset (or the
                event logger fails to initialize), no events are emitted.
            monitor_ip: IP of the machine hosting this process; when set, the
                metrics address is written to the GCS internal KV and the
                Prometheus HTTP server is started.
        """
        # Record v2 usage (we do this as early as possible to capture usage)
        record_autoscaler_v2_usage(GcsClient(address))

        self.gcs_address = address
        worker = ray._private.worker.global_worker
        # TODO: eventually plumb ClusterID through to here
        self.gcs_client = GcsClient(address=self.gcs_address)

        if monitor_ip:
            # Publish the metrics endpoint under a well-known internal KV key
            # (presumably consumed by metrics discovery — confirm with readers
            # of "AutoscalerMetricsAddress").
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            self.gcs_client.internal_kv_put(
                b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
            )
        self._session_name = self._get_session_name(self.gcs_client)
        logger.info(f"session_name: {self._session_name}")
        worker.set_mode(SCRIPT_MODE)
        head_node_ip = self.gcs_address.split(":")[0]

        self.autoscaler = None
        if log_dir:
            try:
                ray_event_logger = get_event_logger(
                    RayEvent.SourceType.AUTOSCALER, log_dir
                )
                self.event_logger = AutoscalerEventLogger(ray_event_logger)
            except Exception:
                # Best-effort: run without an event logger if setup fails.
                self.event_logger = None
        else:
            self.event_logger = None

        prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
        self.metric_reporter = AutoscalerMetricsReporter(prom_metrics)

        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT
                    )
                )
                # Bind to loopback only when the head node itself is loopback.
                kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    registry=prom_metrics.registry,
                    **kwargs,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server."
                )
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        self.autoscaler = Autoscaler(
            session_name=self._session_name,
            config_reader=config_reader,
            gcs_client=self.gcs_client,
            event_logger=self.event_logger,
            metrics_reporter=self.metric_reporter,
        )

    @staticmethod
    def _get_session_name(gcs_client: GcsClient) -> Optional[str]:
        """Obtain the session name from the GCS.

        If the GCS doesn't respond, session name is considered None.
        In this case, the metrics reported from the monitor won't have
        the correct session name.
        """
        session_name = gcs_client.internal_kv_get(
            b"session_name",
            ray_constants.KV_NAMESPACE_SESSION,
            timeout=10,
        )

        if session_name:
            session_name = session_name.decode()

        return session_name

    @staticmethod
    def _report_autoscaling_state(
        gcs_client: GcsClient, autoscaling_state: AutoscalingState
    ):
        """Report the autoscaling state to the GCS.

        Best-effort: failures are logged and never propagate to the loop.
        """
        try:
            gcs_client.report_autoscaling_state(autoscaling_state.SerializeToString())
        except Exception:
            logger.exception("Error reporting autoscaling state to GCS.")

    def _run(self):
        """Run the monitor loop (never returns normally)."""

        while True:
            autoscaling_state = self.autoscaler.update_autoscaling_state()
            if autoscaling_state:
                # report autoscaling state
                self._report_autoscaling_state(self.gcs_client, autoscaling_state)
            else:
                logger.warning("No autoscaling state to report.")

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def run(self):
        """Entry point: run the loop, logging any fatal error before re-raising."""
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            raise
185
+
186
+
187
def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None:
    """
    Record usage for autoscaler v2.

    Best-effort telemetry tag: any failure is logged and swallowed so that
    usage recording can never prevent monitor startup.
    """
    try:
        record_extra_usage_tag(TagKey.AUTOSCALER_VERSION, "v2", gcs_client)
    except Exception:
        logger.exception("Error recording usage for autoscaler v2.")
195
+
196
+
197
+ if __name__ == "__main__":
198
+ parser = argparse.ArgumentParser(
199
+ description=("Parse GCS server for the monitor to connect to.")
200
+ )
201
+ parser.add_argument(
202
+ "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
203
+ )
204
+ parser.add_argument(
205
+ "--autoscaling-config",
206
+ required=False,
207
+ type=str,
208
+ help="the path to the autoscaling config file",
209
+ )
210
+ parser.add_argument(
211
+ "--logging-level",
212
+ required=False,
213
+ type=str,
214
+ default=ray_constants.LOGGER_LEVEL,
215
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
216
+ help=ray_constants.LOGGER_LEVEL_HELP,
217
+ )
218
+ parser.add_argument(
219
+ "--logging-format",
220
+ required=False,
221
+ type=str,
222
+ default=ray_constants.LOGGER_FORMAT,
223
+ help=ray_constants.LOGGER_FORMAT_HELP,
224
+ )
225
+ parser.add_argument(
226
+ "--logging-filename",
227
+ required=False,
228
+ type=str,
229
+ default=ray_constants.MONITOR_LOG_FILE_NAME,
230
+ help="Specify the name of log file, "
231
+ "log to stdout if set empty, default is "
232
+ f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
233
+ )
234
+ parser.add_argument(
235
+ "--logs-dir",
236
+ required=True,
237
+ type=str,
238
+ help="Specify the path of the temporary directory used by Ray processes.",
239
+ )
240
+ parser.add_argument(
241
+ "--logging-rotate-bytes",
242
+ required=False,
243
+ type=int,
244
+ default=ray_constants.LOGGING_ROTATE_BYTES,
245
+ help="Specify the max bytes for rotating "
246
+ "log file, default is "
247
+ f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
248
+ )
249
+ parser.add_argument(
250
+ "--logging-rotate-backup-count",
251
+ required=False,
252
+ type=int,
253
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
254
+ help="Specify the backup count of rotated log file, default is "
255
+ f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
256
+ )
257
+ parser.add_argument(
258
+ "--monitor-ip",
259
+ required=False,
260
+ type=str,
261
+ default=None,
262
+ help="The IP address of the machine hosting the monitor process.",
263
+ )
264
+
265
+ args = parser.parse_args()
266
+ setup_component_logger(
267
+ logging_level=args.logging_level,
268
+ logging_format=args.logging_format,
269
+ log_dir=args.logs_dir,
270
+ filename=args.logging_filename,
271
+ max_bytes=args.logging_rotate_bytes,
272
+ backup_count=args.logging_rotate_backup_count,
273
+ )
274
+
275
+ logger.info(
276
+ f"Starting autoscaler v2 monitor using ray installation: {ray.__file__}"
277
+ )
278
+ logger.info(f"Ray version: {ray.__version__}")
279
+ logger.info(f"Ray commit: {ray.__commit__}")
280
+ logger.info(f"AutoscalerMonitor started with command: {sys.argv}")
281
+
282
+ gcs_address = args.gcs_address
283
+ if gcs_address is None:
284
+ raise ValueError("--gcs-address must be set!")
285
+
286
+ if not args.autoscaling_config:
287
+ logger.info("No autoscaling config provided: use read only node provider.")
288
+ config_reader = ReadOnlyProviderConfigReader(gcs_address)
289
+ else:
290
+ autoscaling_config = os.path.expanduser(args.autoscaling_config)
291
+ config_reader = FileConfigReader(
292
+ config_file=autoscaling_config, skip_content_hash=True
293
+ )
294
+
295
+ monitor = AutoscalerMonitor(
296
+ gcs_address,
297
+ config_reader,
298
+ log_dir=args.logs_dir,
299
+ monitor_ip=args.monitor_ip,
300
+ )
301
+
302
+ monitor.run()
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py ADDED
@@ -0,0 +1,1642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import time
4
+ import uuid
5
+ from abc import ABC, abstractmethod
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ from ray._private.protobuf_compat import message_to_dict
12
+ from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
13
+ from ray.autoscaler._private.resource_demand_scheduler import (
14
+ UtilizationScore,
15
+ _fits,
16
+ _inplace_subtract,
17
+ )
18
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
19
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
20
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
21
+ from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
22
+ from ray.autoscaler.v2.utils import ProtobufUtil, ResourceRequestUtil
23
+ from ray.core.generated.autoscaler_pb2 import (
24
+ ClusterResourceConstraint,
25
+ GangResourceRequest,
26
+ ResourceRequest,
27
+ ResourceRequestByCount,
28
+ )
29
+ from ray.core.generated.instance_manager_pb2 import (
30
+ Instance,
31
+ LaunchRequest,
32
+ NodeKind,
33
+ TerminationRequest,
34
+ )
35
+
36
+ # ============= Resource Scheduling Service API =======================
37
+ #
38
+ # ResourceSchedulerService is a service that schedules resource bundles
39
+ # to nodes. It's used by the autoscaler to schedule resource bundles
40
+ # to determine the desired cluster size to satisfy the current resource
41
+ # demands.
42
+ #
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
@dataclass
class SchedulingRequest:
    """Input to the resource scheduler.

    Bundles the resource demands (requests, gang requests, constraints),
    cluster limits, and the current instances from which the scheduler
    computes the target cluster shape.
    """

    # If outdated node check through launch config is disabled.
    disable_launch_config_check: bool
    # Available node type configs.
    node_type_configs: Dict[NodeType, NodeTypeConfig] = field(default_factory=dict)
    # Max number of worker nodes.
    max_num_nodes: Optional[int] = None
    # Idle timeout in seconds.
    idle_timeout_s: Optional[float] = None
    # TODO: This prob could be refactored into the ClusterStatus data class later.
    # The current ray resource requests.
    resource_requests: List[ResourceRequestByCount] = field(default_factory=list)
    # The Gang resource requests.
    gang_resource_requests: List[GangResourceRequest] = field(default_factory=list)
    # Cluster resource constraints (from ray.autoscaler.sdk.request_resources()).
    cluster_resource_constraints: List[ClusterResourceConstraint] = field(
        default_factory=list
    )
    # The current instances known to the autoscaler.
    current_instances: List[AutoscalerInstance] = field(default_factory=list)
67
+
68
+
69
@dataclass
class SchedulingReply:
    """Output of the resource scheduler.

    Describes how the cluster should change (launches/terminations) and which
    of the inputs could not be satisfied.
    """

    # Instances to launch.
    to_launch: List[LaunchRequest] = field(default_factory=list)
    # Instances to terminate.
    to_terminate: List[TerminationRequest] = field(default_factory=list)
    # The infeasible resource bundles.
    infeasible_resource_requests: List[ResourceRequest] = field(default_factory=list)
    # The infeasible gang resource bundles.
    infeasible_gang_resource_requests: List[GangResourceRequest] = field(
        default_factory=list
    )
    # The infeasible cluster resource constraints.
    infeasible_cluster_resource_constraints: List[ClusterResourceConstraint] = field(
        default_factory=list
    )
85
+
86
+
87
class IResourceScheduler(ABC):
    """
    Interface for a resource scheduler.

    Implements the `instance_manager.proto ResourceSchedulerService` interface.
    """

    @abstractmethod
    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """
        Given the resource requests and the current cluster state, calculate the
        target cluster shape by trying to schedule the resource requests on the
        nodes.

        Args:
            request: The demands, constraints, and current instances to
                schedule against.

        Returns:
            A SchedulingReply with the launches/terminations to perform and
            any infeasible requests.
        """
        pass
102
+
103
+
104
class SchedulingNodeStatus(Enum):
    """
    The status of a scheduling node (`SchedulingNode`)
    """

    # Newly added by the ResourceDemandScheduler; should be launched.
    TO_LAUNCH = "TO_LAUNCH"
    # Schedulable: the node is either already running ray or pending to run
    # ray (e.g. an autoscaler instance is being launched). Either way, it can
    # accept new resource requests/resource constraints.
    SCHEDULABLE = "SCHEDULABLE"
    # Marked for termination by the ResourceDemandScheduler.
    TO_TERMINATE = "TO_TERMINATE"
117
+
118
+
119
class ResourceRequestSource(Enum):
    """
    The source of the resource request.
    """

    # Demand coming from the running workload: ray tasks/actors, placement
    # groups, etc.
    PENDING_DEMAND = "PENDING_DEMAND"
    # Explicit cluster resource constraints, i.e. requests made through
    # ray.autoscaler.sdk.request_resources().
    CLUSTER_RESOURCE_CONSTRAINT = "CLUSTER_RESOURCE_CONSTRAINT"
130
+
131
+
132
+ @dataclass
133
+ class SchedulingNode:
134
+ """
135
+ A abstraction of a node that can be scheduled on by the resource scheduler.
136
+
137
+ A scheduling node is expected to be used as:
138
+
139
+ node = SchedulingNode.new(instance, node_configs)
140
+ remaining, score = node.try_schedule(requests)
141
+
142
+ .... do something with the score ....
143
+
144
+ NOTE:
145
+ One could also extend the scheduling behavior by overriding `try_schedule`
146
+ """
147
+
148
+ # Node type name.
149
+ node_type: NodeType
150
+ # Status
151
+ status: SchedulingNodeStatus
152
+ # Resource requests scheduled on this nodes for different sources.
153
+ sched_requests: Dict[ResourceRequestSource, List[ResourceRequest]] = field(
154
+ default_factory=lambda: defaultdict(list)
155
+ )
156
+ # Available resources for different sources of requests.
157
+ available_resources_for_sched: Dict[
158
+ ResourceRequestSource, Dict[str, float]
159
+ ] = field(default_factory=dict)
160
+ # The node's current resource capacity.
161
+ total_resources: Dict[str, float] = field(default_factory=dict)
162
+ # Node's labels, including static or dynamic labels.
163
+ labels: Dict[str, str] = field(default_factory=dict)
164
+ # Observability descriptive message for why the node was launched in the
165
+ # first place.
166
+ launch_reason: Optional[str] = None
167
+ # Termination request, none when the node is not being terminated.
168
+ termination_request: Optional[TerminationRequest] = None
169
+ # The instance id of the IM(Instance Manager) instance. None if the node
170
+ # is not yet in IM.
171
+ im_instance_id: Optional[str] = None
172
+ # The ray node id of the ray node. None if the node is not included in
173
+ # ray cluster's GCS report yet (not running ray yet).
174
+ ray_node_id: Optional[str] = None
175
+ # Idle duration in ms. Default not idle.
176
+ idle_duration_ms: int = 0
177
+ # Launch config hash.
178
+ launch_config_hash: Optional[str] = None
179
+ # node kind.
180
+ node_kind: NodeKind = NodeKind.WORKER
181
+
182
+ def __init__(
183
+ self,
184
+ node_type: NodeType,
185
+ total_resources: Dict[str, float],
186
+ available_resources: Dict[str, float],
187
+ labels: Dict[str, str],
188
+ status: SchedulingNodeStatus,
189
+ im_instance_id: str = "",
190
+ ray_node_id: str = "",
191
+ idle_duration_ms: int = 0,
192
+ launch_config_hash: str = "",
193
+ node_kind: NodeKind = NodeKind.WORKER,
194
+ termination_request: Optional[TerminationRequest] = None,
195
+ ):
196
+ self.node_type = node_type
197
+ self.total_resources = total_resources
198
+ self.available_resources_for_sched = {
199
+ ResourceRequestSource.PENDING_DEMAND: dict(available_resources),
200
+ ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: dict(total_resources),
201
+ }
202
+ self.sched_requests = {
203
+ ResourceRequestSource.PENDING_DEMAND: [],
204
+ ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: [],
205
+ }
206
+ self.labels = labels
207
+ self.status = status
208
+ self.im_instance_id = im_instance_id
209
+ self.ray_node_id = ray_node_id
210
+ self.idle_duration_ms = idle_duration_ms
211
+ self.launch_config_hash = launch_config_hash
212
+ self.node_kind = node_kind
213
+ self.termination_request = termination_request
214
+
215
+ def get_available_resources(self, resource_request_source: ResourceRequestSource):
216
+ """Get the available resources for the given resource request source."""
217
+ return self.available_resources_for_sched[resource_request_source]
218
+
219
+ def get_sched_requests(self, resource_request_source: ResourceRequestSource):
220
+ """Get the resource requests for the given resource request source."""
221
+ return self.sched_requests[resource_request_source]
222
+
223
+ def add_sched_request(
224
+ self,
225
+ request: ResourceRequest,
226
+ resource_request_source: ResourceRequestSource,
227
+ ):
228
+ """
229
+ Add the resource requests to the node.
230
+
231
+ Args:
232
+ request: The resource request to be added.
233
+ resource_request_source: The source of the resource request.
234
+ """
235
+ self.sched_requests[resource_request_source].append(request)
236
+
237
    @staticmethod
    def new(
        instance: AutoscalerInstance,
        node_type_configs: Dict[NodeType, NodeTypeConfig],
        disable_launch_config_check: bool,
    ) -> Optional["SchedulingNode"]:
        """
        Create a new scheduling node from an autoscaler instance.

        It creates:
            - None if the instance is not schedulable by IM.
            - A schedulable node if the instance is running ray or pending to
              run ray, so it should be considered in the scheduling process.
            - A TO_TERMINATE node (with a termination request attached) if the
              instance's node type no longer exists in the configs.

        Args:
            instance: The instance.
            node_type_configs: The node type configs.
            disable_launch_config_check: If outdated node check through launch
                config is disabled.
        """
        if not SchedulingNode.is_schedulable(instance):
            return None

        if instance.im_instance.status == Instance.RAY_RUNNING:
            assert instance.ray_node is not None, (
                "ray node should not be None "
                f"when the instance is running ray: instance={instance}"
            )
            # A running ray node: build the scheduling view from the live
            # ray node state rather than the static node type config.
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources=dict(instance.ray_node.total_resources),
                # Available resources for scheduling requests of different
                # sources.
                available_resources=dict(instance.ray_node.available_resources),
                # Use ray node's dynamic labels.
                labels=dict(instance.ray_node.dynamic_labels),
                status=SchedulingNodeStatus.SCHEDULABLE,
                im_instance_id=instance.im_instance.instance_id,
                ray_node_id=instance.im_instance.node_id,
                idle_duration_ms=instance.ray_node.idle_duration_ms,
                launch_config_hash=instance.im_instance.launch_config_hash,
                node_kind=instance.im_instance.node_kind,
            )

        # This is an instance pending to run ray. Initialize a schedulable node
        # from the node type config.
        node_config = node_type_configs.get(instance.im_instance.instance_type, None)
        if node_config is None:
            if disable_launch_config_check:
                # We are not terminating outdated nodes.
                logger.info(
                    f"Node config for {instance.im_instance.instance_type} is missing, "
                    "but we are not terminating the outdated node because "
                    "`disable_launch_config_check` is True in "
                    "the autoscaler's provider config."
                )
                return None

            # Configs might have been updated, and no more
            # node_type_configs for this node type. We should terminate it.
            # NOTE(review): node_kind is hardcoded to WORKER here rather than
            # taken from the instance — presumably a head node can never be
            # outdated-terminated; confirm.
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources={},
                available_resources={},
                labels={},
                status=SchedulingNodeStatus.TO_TERMINATE,
                im_instance_id=instance.im_instance.instance_id,
                termination_request=TerminationRequest(
                    id=str(uuid.uuid4()),
                    instance_id=instance.im_instance.instance_id,
                    cause=TerminationRequest.Cause.OUTDATED,
                    instance_type=instance.im_instance.instance_type,
                ),
                node_kind=NodeKind.WORKER,
            )

        return SchedulingNode.from_node_config(
            node_config,
            SchedulingNodeStatus.SCHEDULABLE,
            node_kind=instance.im_instance.node_kind,
            im_instance_id=instance.im_instance.instance_id,
        )
321
+
322
+ @staticmethod
323
+ def is_schedulable(instance: AutoscalerInstance) -> bool:
324
+ """
325
+ Check if the instance is schedulable by IM.
326
+
327
+ Args:
328
+ instance: The instance.
329
+
330
+ Returns:
331
+ True if the instance is schedulable by IM.
332
+ """
333
+ if instance.im_instance is None:
334
+ # We will skip any instances that are not yet in IM which
335
+ # could be
336
+ # 1. an out-of-band ray node
337
+ # 2. an cloud instance running ray not yet discovered
338
+ # by the IM's Reconciler
339
+ # 3. an cloud instance already terminated but ray state
340
+ # still lagging behind.
341
+ #
342
+ # In all of these cases, the instance is not schedulable or
343
+ # shouldn't be managed by IM, so we don't consider them.
344
+ return False
345
+
346
+ # These are the statuses where there's a running ray node or
347
+ # could eventually run ray.
348
+ if InstanceUtil.is_ray_running_reachable(instance.im_instance.status):
349
+ return True
350
+
351
+ return False
352
+
353
+ @staticmethod
354
+ def from_node_config(
355
+ node_config: NodeTypeConfig,
356
+ status: SchedulingNodeStatus,
357
+ node_kind: NodeKind,
358
+ im_instance_id: Optional[str] = None,
359
+ ) -> "SchedulingNode":
360
+ """
361
+ Create a scheduling node from a node config.
362
+
363
+ Args:
364
+ node_config: The node config.
365
+ status: The status of the node.
366
+ node_kind: The node kind.
367
+ im_instance_id: The instance id of the im instance.
368
+ node_kind: The node kind.
369
+ """
370
+ return SchedulingNode(
371
+ node_type=node_config.name,
372
+ total_resources=dict(node_config.resources),
373
+ available_resources=dict(node_config.resources),
374
+ labels=dict(node_config.labels),
375
+ status=status,
376
+ im_instance_id=im_instance_id,
377
+ node_kind=node_kind,
378
+ )
379
+
380
    def __post_init__(self):
        # A node without a node type is never valid.
        # NOTE(review): __post_init__ is only invoked by the dataclass-generated
        # __init__; this class defines a hand-written __init__, so this hook
        # likely never runs — confirm intent.
        assert self.node_type, "node_type should be set"
382
+
383
+ def try_schedule(
384
+ self,
385
+ requests: List[ResourceRequest],
386
+ resource_request_source: ResourceRequestSource,
387
+ ) -> Tuple[List[ResourceRequest], UtilizationScore]:
388
+ """
389
+ Try to schedule the resource requests on this node.
390
+
391
+ This modifies the node's available resources if the requests are schedulable.
392
+ The requests are scheduled one by one in the sorted order, and no
393
+ backtracking is done.
394
+
395
+ Args:
396
+ requests: The resource requests to be scheduled.
397
+ resource_request_source: The source of the resource request, i.e.
398
+ pending demands from ray actors/tasks or cluster resource constraints.
399
+
400
+ Returns:
401
+ A tuple of:
402
+ - list of remaining requests that cannot be scheduled on this node.
403
+ - the utilization score for this node with respect to the current
404
+ resource requests being scheduled.
405
+ """
406
+ # Track the resource requests that cannot be scheduled on this node.
407
+ unschedulable_requests = []
408
+
409
+ # Sort the requests and try schedule them one by one.
410
+ for r in requests:
411
+ if not self._try_schedule_one(r, resource_request_source):
412
+ unschedulable_requests.append(r)
413
+
414
+ score = self._compute_score(resource_request_source)
415
+
416
+ return unschedulable_requests, score
417
+
418
    def _compute_score(
        self, resource_request_source: ResourceRequestSource
    ) -> UtilizationScore:
        """
        Compute the utilization score for this node with respect to the current
        resource requests being scheduled.

        A "higher" score means that this node is more suitable for scheduling
        the current scheduled resource requests.

        The score is a tuple of 4 values:
            1. Whether this node is a GPU node and the current resource request
               has GPU requirements:
                   False: if this node is a GPU node and the requests placed
                   onto it have no GPU requirements (discourage wasting GPUs).
                   True: otherwise.
            2. The number of scheduled resource types that this node actually
               provides.
            3. The minimum of the per-resource weighted utilization values,
               where each value is capacity * utilization**3 (cubing favors
               nodes that are already highly utilized).
            4. The average of those weighted utilization values.

        NOTE:
            This function is adapted from _resource_based_utilization_scorer
            from autoscaler v1.

        TODO(rickyx,jjyao): We should also consider node labels for
        scoring. For example, if a node has a label that matches the affinity
        label of the resource request, we should give it a higher score.

        TODO(rickyx): add pluggable scoring functions here.

        Returns:
            A utilization score for this node.
        """

        sched_requests = self.get_sched_requests(resource_request_source)
        available_resources = self.get_available_resources(resource_request_source)

        # Count how many of the scheduled resource types this node provides.
        num_matching_resource_types = 0
        sched_resource_types = set()
        for req in sched_requests:
            for resource_name, v in req.resources_bundle.items():
                if v > 0:
                    sched_resource_types.add(resource_name)

        for sched_resource_type in sched_resource_types:
            if sched_resource_type in self.total_resources:
                num_matching_resource_types += 1

        # Compute the weighted utilization value for each resource type.
        util_by_resources = []
        for k, v in self.total_resources.items():
            if v == 0:
                # Skip any zero values.
                continue
            if k in available_resources:
                util = (v - available_resources.get(k, 0)) / v
                assert util >= 0 and util <= 1, f"Invalid utilization: {util}"
                # Weight by capacity and cube the utilization so that nearly
                # full nodes score much higher than lightly used ones.
                util_by_resources.append(v * (util**3))

        # Prefer not to launch a GPU node if there aren't any GPU requirements in the
        # resource bundle.
        gpu_ok = True
        if AUTOSCALER_CONSERVE_GPU_NODES:
            # TODO: we should also generalize this optimization for accelerators.
            # https://github.com/ray-project/ray/issues/43079
            is_gpu_node = self.total_resources.get("GPU", 0) > 0
            any_gpu_requests = any("GPU" in r.resources_bundle for r in sched_requests)
            if is_gpu_node and not any_gpu_requests:
                gpu_ok = False

        # Prioritize avoiding gpu nodes for non-gpu workloads first,
        # then prioritize matching multiple resource types,
        # then prioritize using all resources,
        # then prioritize overall balance of multiple resources.
        return (
            gpu_ok,
            num_matching_resource_types,
            min(util_by_resources) if util_by_resources else 0,
            float(sum(util_by_resources)) / len(util_by_resources)
            if util_by_resources
            else 0,
        )
502
+
503
+ def _try_schedule_one(
504
+ self, request: ResourceRequest, resource_request_source: ResourceRequestSource
505
+ ) -> bool:
506
+ """
507
+ Try to schedule one resource request on this node. The request could be from
508
+ various sources, specified by `resource_request_source`.
509
+
510
+ Args:
511
+ request: The resource request to be scheduled.
512
+ resource_request_source: The source of the resource request, i.e.
513
+ pending demands from ray actors/tasks or cluster resource constraints.
514
+
515
+ Returns:
516
+ True if the resource request is scheduled on this node.
517
+ """
518
+
519
+ # Check if there's placement constraints that are not satisfied.
520
+ for constraint in request.placement_constraints:
521
+ if constraint.HasField("anti_affinity"):
522
+ anti_affinity = constraint.anti_affinity
523
+ if (
524
+ anti_affinity.label_name in self.labels
525
+ and anti_affinity.label_value
526
+ == self.labels[anti_affinity.label_name]
527
+ ):
528
+ # The node already has a label that matches the anti-affinity
529
+ return False
530
+
531
+ # We don't need to check for affinity constraints here since
532
+ # we have already combined resource requests with the affinity
533
+ # constraints into the same request at `combine_requests_with_affinity`.
534
+ pass
535
+
536
+ available_resources_dict = self.get_available_resources(resource_request_source)
537
+
538
+ # Check if there's enough resources to schedule the request.
539
+ if not _fits(available_resources_dict, dict(request.resources_bundle)):
540
+ return False
541
+
542
+ # Schedule the request, update resources
543
+ _inplace_subtract(available_resources_dict, dict(request.resources_bundle))
544
+
545
+ # Add the request to the node.
546
+ self.add_sched_request(request, resource_request_source)
547
+
548
+ # Update the dynamic labels if there's any
549
+ for constraint in request.placement_constraints:
550
+ # We don't need to check for affinity constraints here since
551
+ # we have already combined resource requests with the affinity
552
+ # constraints into the same request at `combine_requests_with_affinity`.
553
+ # We don't need node labels for enforcing affinity.
554
+ if constraint.HasField("anti_affinity"):
555
+ anti_affinity = constraint.anti_affinity
556
+ self._add_label(anti_affinity.label_name, anti_affinity.label_value)
557
+
558
+ return True
559
+
560
+ def _add_label(self, label_name: str, label_value: str):
561
+ """
562
+ Add a label to the node.
563
+ This assumes a label key can only have one value.
564
+ """
565
+ assert (
566
+ self.labels.get(label_name) is None
567
+ or self.labels[label_name] == label_value
568
+ ), (
569
+ f"Label {label_name} already exists with value "
570
+ f"{self.labels[label_name]}, cannot set to "
571
+ f"{label_value}"
572
+ )
573
+ self.labels[label_name] = label_value
574
+
575
    def __repr__(self) -> str:
        # Verbose debug representation: dumps both per-source resource views
        # and both per-source scheduled request lists.
        return (
            "SchedulingNode(node_type={node_type}, "
            "node_kind={node_kind}, "
            "instance_id={instance_id},"
            "ray_node_id={ray_node_id},"
            "idle_duration_ms={idle_duration_ms},"
            "termination_request={termination_request},"
            "status={status}, "
            "total_resources={total_resources}, "
            "available_resources_for_demand={available_resources_for_demand}, "
            "available_resources_for_cluster_resource_constraints="
            "{available_resources_for_cluster_resource_constraints},"
            "labels={labels}, launch_reason={launch_reason}), "
            "sched_requests_for_demand={sched_requests_for_demand}), "
            "sched_requests_for_cluster_resource_constraints="
            "{sched_requests_for_cluster_resources_constraint})"
        ).format(
            node_type=self.node_type,
            node_kind=self.node_kind,
            instance_id=self.im_instance_id,
            ray_node_id=self.ray_node_id,
            idle_duration_ms=self.idle_duration_ms,
            # Protobuf messages are rendered as dicts for readability.
            termination_request=str(message_to_dict(self.termination_request))
            if self.termination_request
            else None,
            status=self.status,
            total_resources=self.total_resources,
            available_resources_for_demand=self.available_resources_for_sched[
                ResourceRequestSource.PENDING_DEMAND
            ],
            available_resources_for_cluster_resource_constraints=self.available_resources_for_sched[  # noqa
                ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
            ],
            labels=self.labels,
            launch_reason=self.launch_reason,
            sched_requests_for_demand="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[ResourceRequestSource.PENDING_DEMAND]
            ),
            sched_requests_for_cluster_resources_constraint="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[
                    ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
                ]
            ),
        )
622
+
623
+
624
+ class ResourceDemandScheduler(IResourceScheduler):
625
+ """
626
+ A resource demand scheduler that schedules resource requests based on the
627
+ following rules:
628
+ 1. Enforce the minimal count of nodes for each worker node type.
629
+ 2. Enforce the cluster resource constraints.
630
+ 3. Schedule the gang resource requests.
631
+ 4. Schedule the tasks/actor resource requests
632
+ """
633
+
634
    def __init__(self, event_logger: Optional[AutoscalerEventLogger] = None):
        # Optional logger used by schedule() to emit cluster scheduling events
        # after each pass; when None, no events are emitted.
        self._event_logger = event_logger
636
+
637
+ @dataclass
638
+ class ScheduleContext:
639
+ """
640
+ Encapsulates the context for processing one scheduling request.
641
+
642
+ This exposes functions to read and write the scheduling nodes, to prevent
643
+ accidental modification of the internal state.
644
+ """
645
+
646
+ # The node type configs for this scheduling request.
647
+ _node_type_configs: Dict[NodeType, NodeTypeConfig]
648
+ # If outdated node check through launch config is disabled.
649
+ _disable_launch_config_check: bool
650
+ # The max number of nodes for the entire cluster.
651
+ _max_num_nodes: Optional[int] = None
652
+ # The idle timeout in seconds.
653
+ _idle_timeout_s: Optional[float] = None
654
+ # The current schedulable nodes (including pending nodes and pending requests).
655
+ _nodes: List[SchedulingNode] = field(default_factory=list)
656
+ # The number of nodes by node types available for launching based on the max
657
+ # number of workers in the config. This takes into account any pending/running
658
+ # nodes.
659
+ _node_type_available: Dict[NodeType, int] = field(default_factory=dict)
660
+
661
+ def __init__(
662
+ self,
663
+ nodes: List[SchedulingNode],
664
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
665
+ disable_launch_config_check: bool,
666
+ max_num_nodes: Optional[int] = None,
667
+ idle_timeout_s: Optional[float] = None,
668
+ ):
669
+ self._nodes = nodes
670
+ self._node_type_configs = node_type_configs
671
+ self._node_type_available = self._compute_available_node_types(
672
+ nodes, node_type_configs
673
+ )
674
+ self._max_num_nodes = max_num_nodes
675
+ self._idle_timeout_s = idle_timeout_s
676
+ self._disable_launch_config_check = disable_launch_config_check
677
+
678
+ @classmethod
679
+ def from_schedule_request(
680
+ cls, req: SchedulingRequest
681
+ ) -> "ResourceDemandScheduler.ScheduleContext":
682
+ """
683
+ Create a schedule context from a schedule request.
684
+ It will populate the context with the existing nodes and the available node
685
+ types from the config.
686
+
687
+ Args:
688
+ req: The scheduling request. The caller should make sure the
689
+ request is valid.
690
+ """
691
+
692
+ nodes = []
693
+ node_type_configs = req.node_type_configs
694
+
695
+ # Initialize the scheduling nodes.
696
+ for instance in req.current_instances:
697
+ node = SchedulingNode.new(
698
+ instance, node_type_configs, req.disable_launch_config_check
699
+ )
700
+ if node:
701
+ nodes.append(node)
702
+
703
+ return cls(
704
+ nodes=nodes,
705
+ node_type_configs=node_type_configs,
706
+ disable_launch_config_check=req.disable_launch_config_check,
707
+ max_num_nodes=req.max_num_nodes,
708
+ idle_timeout_s=req.idle_timeout_s,
709
+ )
710
+
711
+ @staticmethod
712
+ def _compute_available_node_types(
713
+ nodes: List[SchedulingNode],
714
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
715
+ ) -> Dict[NodeType, int]:
716
+ """
717
+ Compute the number of nodes by node types available for launching based on
718
+ the max number of workers in the config.
719
+ Args:
720
+ nodes: The current existing nodes.
721
+ node_type_configs: The node type configs.
722
+ Returns:
723
+ A dict of node types and the number of nodes available for launching.
724
+ """
725
+ node_type_available: Dict[NodeType, int] = defaultdict(int)
726
+ node_type_existing: Dict[NodeType, int] = defaultdict(int)
727
+ for node in nodes:
728
+ node_type_existing[node.node_type] += 1
729
+
730
+ for (
731
+ node_type,
732
+ node_type_config,
733
+ ) in node_type_configs.items():
734
+ node_type_available[
735
+ node_type
736
+ ] = node_type_config.max_worker_nodes - node_type_existing.get(
737
+ node_type, 0
738
+ )
739
+
740
+ return node_type_available
741
+
742
+ def get_nodes(self) -> List[SchedulingNode]:
743
+ """
744
+ Get the current nodes with filter.
745
+
746
+ Returns:
747
+ A list of nodes.
748
+ """
749
+ nodes = copy.deepcopy(self._nodes)
750
+ return nodes
751
+
752
+ def get_node_type_available(self) -> Dict[NodeType, int]:
753
+ return copy.deepcopy(self._node_type_available)
754
+
755
+ def get_cluster_shape(self) -> Dict[NodeType, int]:
756
+ cluster_shape = defaultdict(int)
757
+ for node in self._nodes:
758
+ if node.status == SchedulingNodeStatus.TO_TERMINATE:
759
+ # Skip the nodes that are to be terminated.
760
+ continue
761
+
762
+ cluster_shape[node.node_type] += 1
763
+ return cluster_shape
764
+
765
+ def get_idle_timeout_s(self) -> Optional[float]:
766
+ return self._idle_timeout_s
767
+
768
+ def update(self, new_nodes: List[SchedulingNode]) -> None:
769
+ """
770
+ Update the context with the new nodes.
771
+ """
772
+ self._nodes = new_nodes
773
+
774
+ # Update the available node types.
775
+ self._node_type_available = self._compute_available_node_types(
776
+ self._nodes, self._node_type_configs
777
+ )
778
+
779
+ def get_max_num_nodes(self) -> Optional[int]:
780
+ """
781
+ Get the max number of nodes for the entire cluster.
782
+ """
783
+ return self._max_num_nodes
784
+
785
+ def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
786
+ return self._node_type_configs
787
+
788
+ def __str__(self) -> str:
789
+ return "ScheduleContext({} nodes, node_type_available={})".format(
790
+ len(self._nodes), dict(self._node_type_available)
791
+ )
792
+
793
+ def get_launch_requests(self) -> List[LaunchRequest]:
794
+ """
795
+ Get the launch requests for the nodes that are to be launched.
796
+ """
797
+ launch_by_type = defaultdict(int)
798
+ for node in self._nodes:
799
+ if node.status == SchedulingNodeStatus.TO_LAUNCH:
800
+ launch_by_type[node.node_type] += 1
801
+
802
+ launch_requests = []
803
+ for instance_type, count in launch_by_type.items():
804
+ launch_requests.append(
805
+ LaunchRequest(
806
+ instance_type=instance_type,
807
+ count=count,
808
+ id=str(uuid.uuid4()),
809
+ request_ts_ms=time.time_ns() // 1000,
810
+ )
811
+ )
812
+ return launch_requests
813
+
814
+ def get_terminate_requests(
815
+ self,
816
+ ) -> List[TerminationRequest]:
817
+ """
818
+ Get the terminate requests for the nodes that are to be terminated.
819
+ """
820
+ return [
821
+ node.termination_request
822
+ for node in self._nodes
823
+ if node.termination_request is not None
824
+ ]
825
+
826
    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """
        Run one scheduling pass: enforce constraints in priority order, then
        schedule gang and plain resource requests, and finally compute the
        launch/terminate decisions.
        """
        logger.debug(
            "Scheduling for request: resource_request={}, gang_resource_request={}, "
            "cluster_constraint={}".format(
                ResourceRequestUtil.to_dict_list(request.resource_requests),
                ProtobufUtil.to_dict_list(request.gang_resource_requests),
                ProtobufUtil.to_dict_list(request.cluster_resource_constraints),
            )
        )

        ctx = ResourceDemandScheduler.ScheduleContext.from_schedule_request(request)

        # Terminate outdated nodes (launch config no longer matches).
        ResourceDemandScheduler._terminate_outdated_nodes(ctx)

        # Enforce the minimal count of nodes for each worker node type.
        ResourceDemandScheduler._enforce_min_workers_per_type(ctx)

        # Enforce the max worker nodes count per node type.
        ResourceDemandScheduler._enforce_max_workers_per_type(ctx)

        # Enforce the max worker nodes count globally.
        ResourceDemandScheduler._enforce_max_workers_global(ctx)

        # Enforce the cluster resource constraints.
        infeasible_constraints = ResourceDemandScheduler._enforce_resource_constraints(
            ctx, request.cluster_resource_constraints
        )

        # Schedule the gang resource requests.
        infeasible_gang_requests = (
            ResourceDemandScheduler._sched_gang_resource_requests(
                ctx, request.gang_resource_requests
            )
        )

        # Schedule the tasks/actor resource requests
        infeasible_requests = ResourceDemandScheduler._sched_resource_requests(
            ctx,
            ResourceRequestUtil.ungroup_by_count(request.resource_requests),
        )

        # Shutdown any idle nodes that's not needed (e.g. no resource constraints.
        # not needed by min_worker count, etc.)
        ResourceDemandScheduler._enforce_idle_termination(ctx)

        # Compute the number of nodes to launch.
        reply = SchedulingReply(
            infeasible_resource_requests=infeasible_requests,
            infeasible_gang_resource_requests=infeasible_gang_requests,
            infeasible_cluster_resource_constraints=infeasible_constraints,
            to_launch=ctx.get_launch_requests(),
            to_terminate=ctx.get_terminate_requests(),
        )

        if self._event_logger is not None:
            # Event logging is best-effort: a logging failure must not fail
            # the scheduling pass.
            try:
                self._event_logger.log_cluster_scheduling_update(
                    launch_requests=reply.to_launch,
                    terminate_requests=reply.to_terminate,
                    infeasible_requests=infeasible_requests,
                    infeasible_gang_requests=infeasible_gang_requests,
                    infeasible_cluster_resource_constraints=infeasible_constraints,
                    cluster_shape=ctx.get_cluster_shape(),
                    node_type_configs=ctx.get_node_type_configs(),
                )
            except Exception:
                logger.exception("Failed to emit event logs.")

        return reply
896
+
897
    @staticmethod
    def _enforce_max_workers_per_type(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for each node type, marking any
        excess nodes TO_TERMINATE and committing the result back to ctx.
        """

        # Get all the nodes by type
        all_nodes = ctx.get_nodes()

        non_terminating_nodes_by_type = defaultdict(list)
        terminating_nodes = []
        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes_by_type[node.node_type].append(node)

        # Step 1. Enforce the max number of workers for each node type.
        for node_type in non_terminating_nodes_by_type.keys():
            non_terminate_nodes_of_type = non_terminating_nodes_by_type[node_type]
            # NOTE(review): assumes every non-terminating node's type exists in
            # the configs (unknown types were already marked TO_TERMINATE by
            # SchedulingNode.new) — a missing key here would raise KeyError.
            node_config = ctx.get_node_type_configs()[node_type]
            num_max_nodes_per_type = node_config.max_worker_nodes
            num_extra_nodes = len(non_terminate_nodes_of_type) - num_max_nodes_per_type

            if num_extra_nodes <= 0:
                # No extra nodes for this type, continue.
                continue

            # Terminate the nodes
            (
                to_terminate,
                remained_nodes,
            ) = ResourceDemandScheduler._select_nodes_to_terminate(
                non_terminate_nodes_of_type,
                num_extra_nodes,
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
                max_num_nodes_per_type=num_max_nodes_per_type,
            )

            non_terminating_nodes_by_type[node_type] = remained_nodes
            terminating_nodes.extend(to_terminate)

        non_terminating_nodes = []
        for nodes in non_terminating_nodes_by_type.values():
            non_terminating_nodes.extend(nodes)

        # Update the context
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes per type."

        ctx.update(terminating_nodes + non_terminating_nodes)

        if terminating_nodes:
            logger.debug(
                f"Terminating {len(terminating_nodes)} "
                "nodes for per node type max num node's constraints."
            )
958
    @staticmethod
    def _enforce_max_workers_global(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for the entire cluster, marking any
        excess nodes TO_TERMINATE and committing the result back to ctx.
        """
        all_nodes = ctx.get_nodes()

        terminating_nodes = []
        non_terminating_nodes = []

        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes.append(node)

        num_max_nodes = ctx.get_max_num_nodes()

        # NOTE(review): `if num_max_nodes` treats max_num_nodes == 0 the same
        # as "no limit". If 0 is a legal configuration this should be
        # `if num_max_nodes is not None` — confirm intended semantics.
        num_to_terminate = (
            max(len(non_terminating_nodes) - num_max_nodes, 0) if num_max_nodes else 0
        )

        if num_to_terminate <= 0:
            # No extra nodes needed to terminate.
            return

        # Terminate the nodes
        (
            to_terminate_nodes,
            non_terminating_nodes,
        ) = ResourceDemandScheduler._select_nodes_to_terminate(
            non_terminating_nodes,
            num_to_terminate,
            TerminationRequest.Cause.MAX_NUM_NODES,
            max_num_nodes=num_max_nodes,
        )

        # NOTE(review): _select_nodes_to_terminate never selects the head
        # node, so this assert can fire when the quota cannot be met without
        # terminating the head — confirm that cannot occur here.
        assert len(to_terminate_nodes) == num_to_terminate, (
            "Terminating {} nodes, failed to terminate {} nodes to "
            "satisfy max_num_nodes={}".format(
                len(to_terminate_nodes),
                num_to_terminate - len(to_terminate_nodes),
                num_max_nodes,
            )
        )

        # Update the context
        terminating_nodes.extend(to_terminate_nodes)
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes."

        all_nodes = terminating_nodes + non_terminating_nodes
        ctx.update(all_nodes)
1014
+
1015
    @staticmethod
    def _select_nodes_to_terminate(
        nodes: List[SchedulingNode],
        num_to_terminate: int,
        cause: TerminationRequest.Cause,
        max_num_nodes: Optional[int] = None,
        max_num_nodes_per_type: Optional[int] = None,
    ) -> Tuple[List[SchedulingNode], List[SchedulingNode]]:
        """
        Select 'num_to_terminate' of nodes to be terminated
        from the 'nodes' list. It should never select a head node.

        NOTE: the input `nodes` list is mutated in place (sorted, and the
        head node is popped out and re-appended to the remainder).

        Args:
            nodes: The nodes to be terminated.
            num_to_terminate: The number of nodes to be terminated.
            cause: The cause of the termination. Should be one of
                TerminationRequest.Cause.MAX_NUM_NODES or
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

            max_num_nodes: The max number of nodes for the entire cluster only
                used when the cause is TerminationRequest.Cause.MAX_NUM_NODES.
            max_num_nodes_per_type: The max number of nodes for each node type.
                Only used when the cause is
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

        Returns:
            A tuple of:
                - The terminated nodes.
                - The remained nodes.
        """

        # Sort the nodes for termination.
        nodes.sort(key=ResourceDemandScheduler._sort_nodes_for_termination)

        # Remove the head node from the list.
        head_node = None
        for i, node in enumerate(nodes):
            if node.node_kind == NodeKind.HEAD:
                # Remove the head node from the list.
                head_node = nodes.pop(i)
                break

        terminated_nodes, remained_nodes = (
            nodes[:num_to_terminate],
            # The head could be None if there's no head node being reported yet
            # from the ray cluster.
            nodes[num_to_terminate:] + ([head_node] if head_node else []),
        )

        assert cause in [
            TerminationRequest.Cause.MAX_NUM_NODES,
            TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
        ], "Other termination causes don't have to select nodes for termination."

        for node in terminated_nodes:
            node.status = SchedulingNodeStatus.TO_TERMINATE
            node.termination_request = TerminationRequest(
                id=str(uuid.uuid4()),
                instance_id=node.im_instance_id,
                ray_node_id=node.ray_node_id,
                cause=cause,
                instance_type=node.node_type,
                details=(
                    f"Terminating node due to {TerminationRequest.Cause.Name(cause)}: "
                    f"max_num_nodes={max_num_nodes}, "
                    f"max_num_nodes_per_type={max_num_nodes_per_type}"
                ),
            )
            if cause == TerminationRequest.Cause.MAX_NUM_NODES:
                node.termination_request.max_num_nodes = max_num_nodes
            elif cause == TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE:
                node.termination_request.max_num_nodes_per_type = max_num_nodes_per_type
            else:
                # Defensive: unreachable while the assert above holds, but the
                # assert is stripped under `python -O`.
                raise ValueError("Unknown termination cause: {}".format(cause))

        return terminated_nodes, remained_nodes
1091
+
1092
+ @staticmethod
1093
+ def _sort_nodes_for_termination(node: SchedulingNode) -> Tuple:
1094
+ """
1095
+ Sort the nodes for termination increasingly by:
1096
+
1097
+ 1. First if ray hasn't been started yet
1098
+ 2. Then if the nodes are idle
1099
+ 3. Then with lower resources util nodes first.
1100
+
1101
+ Such that nodes sorted earlier will be terminated first.
1102
+ """
1103
+
1104
+ running_ray = len(node.ray_node_id) > 0
1105
+ # Reverse the idle duration such that the nodes with the largest idle duration
1106
+ # will be terminated first.
1107
+ idle_dur = -1 * node.idle_duration_ms
1108
+ available_resources = node.get_available_resources(
1109
+ ResourceRequestSource.PENDING_DEMAND
1110
+ )
1111
+
1112
+ utils_per_resources = {}
1113
+ for resource, total in node.total_resources.items():
1114
+ if total <= 0:
1115
+ continue
1116
+ utils_per_resources[resource] = (
1117
+ total - available_resources.get(resource, 0)
1118
+ ) / total
1119
+
1120
+ avg_util = (
1121
+ sum(utils_per_resources.values()) / len(utils_per_resources)
1122
+ if utils_per_resources
1123
+ else 0
1124
+ )
1125
+
1126
+ return (running_ray, idle_dur, avg_util)
1127
+
1128
+ @staticmethod
1129
+ def _enforce_min_workers_per_type(
1130
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1131
+ ) -> None:
1132
+ """
1133
+ Enforce the minimal count of nodes for each worker node type.
1134
+ """
1135
+
1136
+ # Count the existing nodes by type
1137
+ count_by_node_type = ctx.get_cluster_shape()
1138
+
1139
+ new_nodes = []
1140
+ # Launch new nodes to satisfy min count for each node type.
1141
+ for (
1142
+ node_type,
1143
+ node_type_config,
1144
+ ) in ctx.get_node_type_configs().items():
1145
+ cur_count = count_by_node_type.get(node_type, 0)
1146
+ min_count = node_type_config.min_worker_nodes
1147
+ if cur_count < min_count:
1148
+ logger.info(
1149
+ f"Adding {min_count - cur_count} nodes to satisfy min count for "
1150
+ f"node type: {node_type}."
1151
+ )
1152
+ new_nodes.extend(
1153
+ [
1154
+ SchedulingNode.from_node_config(
1155
+ copy.deepcopy(node_type_config),
1156
+ status=SchedulingNodeStatus.TO_LAUNCH,
1157
+ node_kind=NodeKind.WORKER,
1158
+ )
1159
+ ]
1160
+ * (min_count - cur_count)
1161
+ )
1162
+ # NOTE: we assume the aggregated number of min workers across all node types
1163
+ # should not exceed any globally enforced max_num_nodes
1164
+
1165
+ # Add the new nodes to the existing nodes and update the context.
1166
+ ctx.update(new_nodes + ctx.get_nodes())
1167
+
1168
+ @staticmethod
1169
+ def _enforce_resource_constraints(
1170
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1171
+ constraints: List[ClusterResourceConstraint],
1172
+ ) -> List[ClusterResourceConstraint]:
1173
+ """
1174
+ Enforce the cluster resource constraints.
1175
+
1176
+ Args:
1177
+ ctx: The schedule context.
1178
+ constraints: The cluster resource constraints.
1179
+
1180
+ Returns:
1181
+ A list of infeasible constraints.
1182
+
1183
+ Notes:
1184
+ It's different from the other scheduling functions since it doesn't actually
1185
+ schedule any resource requests. Instead, it asks if the cluster could be
1186
+ upscale to a certain shape to fulfill the constraints.
1187
+ """
1188
+
1189
+ # NOTE: we currently only have 1 constraint from a cluster, but
1190
+ # we may have multiple in the future.
1191
+ assert len(constraints) <= 1, "Max 1 cluster resource constraint is supported."
1192
+ if len(constraints) == 0:
1193
+ # No cluster resource constraints - nothing needs to be done.
1194
+ return []
1195
+
1196
+ constraint = constraints[0]
1197
+ # Flatten the requests for iterating through.
1198
+ requests = ResourceRequestUtil.ungroup_by_count(constraint.resource_requests)
1199
+
1200
+ # Pass the empty nodes to schedule.
1201
+ scheduled_nodes, infeasible = ResourceDemandScheduler._try_schedule(
1202
+ ctx,
1203
+ requests,
1204
+ resource_request_source=ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT,
1205
+ )
1206
+
1207
+ if infeasible:
1208
+ # Unable to satisfy the constraint.
1209
+ return [constraint]
1210
+
1211
+ ctx.update(scheduled_nodes)
1212
+ return []
1213
+
1214
    @staticmethod
    def _sched_resource_requests(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        requests: List[ResourceRequest],
    ) -> List[ResourceRequest]:
        """
        Schedule the resource requests from pending ray task/actor demand.

        Args:
            ctx: The schedule context.
            requests: The resource requests to schedule.

        Returns:
            A list of infeasible resource requests that could not be placed
            on existing or newly launched nodes.
        """
        nodes, infeasible = ResourceDemandScheduler._try_schedule(
            ctx, requests, resource_request_source=ResourceRequestSource.PENDING_DEMAND
        )

        # Even if some requests are infeasible, commit whatever was
        # successfully scheduled to the context.
        ctx.update(nodes)

        return infeasible
1237
+
1238
    @staticmethod
    def _sched_gang_resource_requests(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        gang_requests: List[GangResourceRequest],
    ) -> List[GangResourceRequest]:
        """
        Schedule the gang resource requests.

        These requests should be scheduled atomically, i.e. either all of the resources
        requests in a gang request are scheduled or none of them are scheduled.

        For now, the gang resource requests represent Ray's placement groups, while it
        could be more general in the future:
            - For STRICT_PACK placement group requests, we combine them into a single
                request and try to schedule them together.
            - For STRICT_SPREAD placement groups requests, they should be scheduled on
                different nodes by leveraging on the node labels that are associated with
                the placement group.
                If there are requests from rescheduling placement groups due to node
                failures, these requests should not be scheduled on nodes with requests
                from the same placement group.


        Args:
            ctx: The schedule context.
            gang_requests: The gang resource requests.

        Returns:
            A list of infeasible gang resource requests.
        """

        def _sort_gang_resource_requests(req: GangResourceRequest) -> Tuple:
            """
            Key function for sorting the gang resource request by:
            1. the number of placement constraints in the gang request.
            2. the number of resource requests in the gang request.
            """
            total_placement_constraints = 0
            for resource_request in req.requests:
                total_placement_constraints += len(
                    resource_request.placement_constraints
                )

            return (total_placement_constraints, len(req.requests))

        infeasible_gang_requests = []
        # Try fulfilling the gang requests one by one, most-constrained first
        # (reverse=True on the sort key above).
        for gang_req in sorted(
            gang_requests, key=_sort_gang_resource_requests, reverse=True
        ):
            requests = gang_req.requests
            # Try to combine requests with affinity constraints into the same request.
            requests = ResourceRequestUtil.combine_requests_with_affinity(requests)

            nodes, infeasible = ResourceDemandScheduler._try_schedule(
                ctx, requests, ResourceRequestSource.PENDING_DEMAND
            )

            if infeasible:
                # Atomicity: if any piece of the gang is infeasible, skip the
                # whole gang request and leave the context untouched.
                infeasible_gang_requests.append(gang_req)
                continue

            # We are able to satisfy the whole gang and thus update the context.
            ctx.update(nodes)

        return infeasible_gang_requests
1306
+
1307
    @staticmethod
    def _try_schedule(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        requests_to_sched: List[ResourceRequest],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[List[SchedulingNode], List[ResourceRequest]]:
        """
        Try to schedule the resource requests on the current context.

        It tries to schedule the requests on the existing nodes first, and
        then try to schedule the requests on new nodes if possible.

        Args:
            requests_to_sched: The resource requests to be scheduled.
            ctx: The current scheduling context.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource
                constraints.

        Returns:
            - List of scheduled nodes to that have part or all of the requests
                scheduled.
            - List of infeasible requests remained that cannot be scheduled.
        """
        # First sort the requests.
        def _sort_resource_request(req: ResourceRequest) -> Tuple:
            """
            Sort the resource requests by:
                1. The length of it's placement constraints.
                2. The number of resources it requests.
                3. The values of resources it requests.
                4. lexicographically for each resource (for stable ordering)

            This is a legacy sorting function for the autoscaler's binpacking
            algo - we do this so that we could have a deterministic scheduling
            results with reasonable fragmentation.
            """
            return (
                len(req.placement_constraints),
                len(req.resources_bundle.values()),
                sum(req.resources_bundle.values()),
                sorted(req.resources_bundle.items()),
            )

        # Largest/most-constrained requests first (reverse=True).
        requests_to_sched = sorted(
            requests_to_sched, key=_sort_resource_request, reverse=True
        )

        existing_nodes = ctx.get_nodes()
        node_type_available = ctx.get_node_type_available()

        # A list of nodes that are either:
        #   1. existing nodes in the cluster. or
        #   2. new nodes that are launched to satisfy the resource requests.
        target_nodes = []

        # Try scheduling resource requests with existing nodes first.
        # Each iteration greedily picks the best-scoring node and removes it
        # from the candidate pool.
        while len(requests_to_sched) > 0 and len(existing_nodes) > 0:
            (
                best_node,
                requests_to_sched,
                existing_nodes,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, existing_nodes, resource_request_source
            )
            if best_node is None:
                # No existing nodes can schedule any more requests.
                break

            target_nodes.append(best_node)

        # If there's any existing nodes left, we will add to the target nodes
        target_nodes.extend(existing_nodes)

        # Try scheduling resource requests with new nodes: seed the pool with
        # one launchable candidate per node type that still has quota.
        node_pools = [
            SchedulingNode.from_node_config(
                ctx.get_node_type_configs()[node_type],
                status=SchedulingNodeStatus.TO_LAUNCH,
                node_kind=NodeKind.WORKER,
            )
            for node_type, num_available in node_type_available.items()
            if num_available > 0
        ]
        while len(requests_to_sched) > 0 and len(node_pools) > 0:
            # Stop launching once the global max node count is reached.
            max_num_nodes = ctx.get_max_num_nodes()
            if max_num_nodes is not None and len(target_nodes) >= max_num_nodes:
                logger.debug(
                    "Max number of nodes reached: {}, "
                    "cannot launch more nodes.".format(max_num_nodes)
                )
                break

            (
                best_node,
                requests_to_sched,
                node_pools,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, node_pools, resource_request_source
            )
            if best_node is None:
                break

            target_nodes.append(best_node)
            # Refill the pool with another candidate of the same node type if
            # its per-type quota still allows launching one more.
            node_type_available[best_node.node_type] -= 1
            if node_type_available[best_node.node_type] > 0:
                node_pools.append(
                    SchedulingNode.from_node_config(
                        ctx.get_node_type_configs()[best_node.node_type],
                        status=SchedulingNodeStatus.TO_LAUNCH,
                        node_kind=NodeKind.WORKER,
                    )
                )

        return target_nodes, requests_to_sched
1425
+
1426
    @staticmethod
    def _sched_best_node(
        requests: List[ResourceRequest],
        nodes: List[SchedulingNode],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[SchedulingNode, List[ResourceRequest], List[SchedulingNode]]:
        """
        Schedule the requests on the best node.
        A simple greedy algorithm is used to schedule the requests:
            1. Try to schedule the requests on each node.
            2. Sort the nodes by a score
            3. Return the node with the highest score.

        The highest score node is updated with the scheduled requests, and the node is
        removed from the node list.

        Args:
            requests: The resource requests to be scheduled.
            nodes: The node candidates to be scheduled on. The nodes will be updated
                after the scheduling attempt, i.e. the node that is scheduled will be
                removed from the list.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource constraints.

        Returns:
            best_node: The best node to schedule the requests (None if no node
                could take any request).
            infeasible: The infeasible requests that cannot be scheduled on the best
                node.
            nodes: Remaining nodes after the best node is removed.
        """
        results = []

        # A temporary data class to store the scheduling result.
        @dataclass
        class ScheduleResult:
            # The node candidate after a scheduling attempt.
            node: SchedulingNode
            # The infeasible resource requests that are not scheduled.
            infeasible_requests: List[ResourceRequest]
            # The index of the node in the original node list.
            idx: int
            # the score of the scheduling node to compare with others.
            score: UtilizationScore

        # Deep-copy so trial scheduling mutates only the copies; indices in
        # nodes_copy line up 1:1 with the caller's `nodes` list.
        nodes_copy = copy.deepcopy(nodes)

        # Iterate through each node and modify the node's available resources
        # if the requests are schedulable.
        for idx, node in enumerate(nodes_copy):
            remaining, score = node.try_schedule(requests, resource_request_source)

            if len(remaining) == len(requests):
                # The node cannot schedule any of the requests.
                continue

            results.append(ScheduleResult(node, remaining, idx, score))

        # No nodes can schedule any of the requests.
        if len(results) == 0:
            logger.debug(
                "No nodes can schedule the requests: {}, for nodes: {}".format(
                    ResourceRequestUtil.to_dict_list(requests), nodes
                )
            )
            return None, requests, nodes

        # Sort the results by score.
        results = sorted(results, key=lambda r: r.score, reverse=True)
        best_result = results[0]

        # Remove the best node from the caller's list (by the index recorded
        # above); the mutated copy is what gets returned as best_node.
        nodes.pop(best_result.idx)
        logger.debug(
            "Best node: {}, score: {}, remaining requests: {}".format(
                best_result.node,
                best_result.score,
                ResourceRequestUtil.to_dict_list(best_result.infeasible_requests),
            )
        )
        return best_result.node, best_result.infeasible_requests, nodes
1506
+
1507
+ @staticmethod
1508
+ def _terminate_outdated_nodes(
1509
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1510
+ ) -> None:
1511
+ """
1512
+ Terminate the nodes that are outdated, i.e. the node type config has been
1513
+ updated or the node's launch config hash is outdated.
1514
+
1515
+ Args:
1516
+ ctx: The schedule context.
1517
+ """
1518
+ nodes = ctx.get_nodes()
1519
+
1520
+ if ctx._disable_launch_config_check:
1521
+ # Outdated nodes check through launch config check is disabled.
1522
+ return
1523
+
1524
+ for node in nodes:
1525
+ if node.status != SchedulingNodeStatus.SCHEDULABLE:
1526
+ # We don't need to care about the non-running nodes.
1527
+ continue
1528
+
1529
+ if node.node_kind == NodeKind.HEAD:
1530
+ # We should not be terminating the head node even if it's outdated.
1531
+ logger.warning(
1532
+ f"Head node {node.im_instance_id}(ray={node.ray_node_id}) is "
1533
+ "outdated with node config changes. "
1534
+ "Please check the node's config or restart the cluster or restart "
1535
+ "the head node. Autoscaler is not able to shutdown the outdated "
1536
+ "head node"
1537
+ )
1538
+ continue
1539
+ node_type = node.node_type
1540
+ node_type_config = ctx.get_node_type_configs().get(node_type)
1541
+ if node_type_config is None or (
1542
+ node_type_config.launch_config_hash
1543
+ and node_type_config.launch_config_hash != node.launch_config_hash
1544
+ ):
1545
+ # The node type config has been updated, and the node's launch config
1546
+ # hash is outdated.
1547
+ node.status = SchedulingNodeStatus.TO_TERMINATE
1548
+ node.termination_request = TerminationRequest(
1549
+ id=str(time.time_ns()),
1550
+ instance_id=node.im_instance_id,
1551
+ ray_node_id=node.ray_node_id,
1552
+ instance_type=node.node_type,
1553
+ cause=TerminationRequest.Cause.OUTDATED,
1554
+ details=f"node from {node.node_type} has outdated config",
1555
+ )
1556
+
1557
+ ctx.update(nodes)
1558
+
1559
+ @staticmethod
1560
+ def _enforce_idle_termination(
1561
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1562
+ ) -> None:
1563
+ """
1564
+ Enforce the idle termination for the nodes that are not needed by the cluster
1565
+ resource constraints and idle for too long.
1566
+
1567
+ Args:
1568
+ ctx: The schedule context.
1569
+ """
1570
+ count_by_node_type = ctx.get_cluster_shape()
1571
+ node_type_configs = ctx.get_node_type_configs()
1572
+ terminate_nodes_by_type: Dict[NodeType, int] = defaultdict(int)
1573
+
1574
+ nodes = ctx.get_nodes()
1575
+ s_to_ms = 1000
1576
+ for node in nodes:
1577
+ if node.status != SchedulingNodeStatus.SCHEDULABLE:
1578
+ # We don't need to care about the non-running nodes.
1579
+ continue
1580
+
1581
+ if node.node_kind == NodeKind.HEAD:
1582
+ # The head node is not subject to idle termination.
1583
+ continue
1584
+
1585
+ idle_timeout_s = ctx.get_idle_timeout_s()
1586
+ # Override the scheduler idle_timeout_s if set for this node_type.
1587
+ node_type = node.node_type
1588
+ if node_type in node_type_configs:
1589
+ if node_type_configs[node_type].idle_timeout_s is not None:
1590
+ idle_timeout_s = node_type_configs[node_type].idle_timeout_s
1591
+ if idle_timeout_s is None:
1592
+ # No idle timeout is set, skip the idle termination.
1593
+ continue
1594
+
1595
+ if node.idle_duration_ms <= idle_timeout_s * s_to_ms:
1596
+ # The node is not idle for too long, skip it.
1597
+ continue
1598
+
1599
+ if node.sched_requests[ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT]:
1600
+ # The node is needed by the resource constraints.
1601
+ # Skip it.
1602
+ if node.idle_duration_ms > ctx.get_idle_timeout_s() * s_to_ms:
1603
+ logger.debug(
1604
+ "Node {} (idle for {} secs) is needed by the cluster resource "
1605
+ "constraints, skip idle termination.".format(
1606
+ node.ray_node_id, node.idle_duration_ms / s_to_ms
1607
+ )
1608
+ )
1609
+ continue
1610
+
1611
+ # Honor the min_worker_nodes setting for the node type.
1612
+ min_count = 0
1613
+ if node_type in node_type_configs:
1614
+ min_count = node_type_configs[node_type].min_worker_nodes
1615
+ if (
1616
+ count_by_node_type.get(node_type, 0)
1617
+ - terminate_nodes_by_type[node_type]
1618
+ <= min_count
1619
+ ):
1620
+ logger.info(
1621
+ "Node {} (idle for {} secs) belongs to node_type {} and is "
1622
+ "required by min_worker_nodes, skipping idle termination.".format(
1623
+ node.ray_node_id, node.idle_duration_ms / s_to_ms, node_type
1624
+ )
1625
+ )
1626
+ continue
1627
+
1628
+ terminate_nodes_by_type[node.node_type] += 1
1629
+ # The node is idle for too long, terminate it.
1630
+ node.status = SchedulingNodeStatus.TO_TERMINATE
1631
+ node.termination_request = TerminationRequest(
1632
+ id=str(uuid.uuid4()),
1633
+ instance_id=node.im_instance_id,
1634
+ ray_node_id=node.ray_node_id,
1635
+ cause=TerminationRequest.Cause.IDLE,
1636
+ instance_type=node.node_type,
1637
+ idle_duration_ms=node.idle_duration_ms,
1638
+ details=f"idle for {node.idle_duration_ms/s_to_ms} secs > "
1639
+ f"timeout={idle_timeout_s} secs",
1640
+ )
1641
+
1642
+ ctx.update(nodes)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from dataclasses import dataclass, field
3
+ from enum import Enum
4
+ from typing import Dict, List, Optional, Tuple
5
+
6
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
7
+ from ray.core.generated.autoscaler_pb2 import NodeState, NodeStatus
8
+ from ray.core.generated.instance_manager_pb2 import Instance
9
+
10
+ # TODO(rickyx): once we have graceful shutdown, we could populate
11
+ # the failure detail with the actual termination message. As of now,
12
+ # we will use a more generic message to include cases such as:
13
+ # (idle termination, node death, crash, preemption, etc)
14
+ NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated"
15
+
16
+
17
+ # e.g., cpu_4_ondemand.
18
+ NodeType = str
19
+
20
+
21
@dataclass
class ResourceUsage:
    """Capacity vs. consumption of a single named resource (e.g. "CPU")."""

    # Resource name.
    resource_name: str = ""
    # Total resource.
    total: float = 0.0
    # Resource used.
    used: float = 0.0
29
+
30
+
31
@dataclass
class NodeUsage:
    """Per-node resource usage snapshot plus idle time."""

    # The node resource usage.
    usage: List[ResourceUsage]
    # How long the node has been idle.
    idle_time_ms: int
37
+
38
+
39
@dataclass
class NodeInfo:
    """Descriptive and usage information for one cluster node (alive, pending,
    or failed)."""

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # Cloud instance id.
    instance_id: str
    # Ip address of the node when alive.
    ip_address: str
    # The status of the node. Optional for pending nodes.
    node_status: Optional[str] = None
    # ray node id in hex. None if still pending.
    node_id: Optional[str] = None
    # Resource usage breakdown if node is running.
    resource_usage: Optional[NodeUsage] = None
    # Failure detail if the node failed.
    failure_detail: Optional[str] = None
    # Descriptive details.
    details: Optional[str] = None
    # Activity on the node.
    node_activity: Optional[List[str]] = None

    def total_resources(self) -> Dict[str, float]:
        """Return {resource_name: total}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.total for r in self.resource_usage.usage}

    def available_resources(self) -> Dict[str, float]:
        """Return {resource_name: total - used}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.total - r.used for r in self.resource_usage.usage}

    def used_resources(self) -> Dict[str, float]:
        """Return {resource_name: used}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.used for r in self.resource_usage.usage}
76
+
77
+
78
@dataclass
class LaunchRequest:
    """A request to launch `count` instances of one node type, with its
    current state (pending or failed)."""

    class Status(Enum):
        # Launch request failed.
        FAILED = "FAILED"
        # Launch request is still in flight.
        PENDING = "PENDING"

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # count.
    count: int
    # State: (e.g. PENDING, FAILED)
    state: Status
    # When the launch request was made in unix timestamp in secs.
    request_ts_s: int
    # When the launch request failed unix timestamp in secs if failed.
    failed_ts_s: Optional[int] = None
    # Request details, e.g. error reason if the launch request failed.
    details: Optional[str] = None
98
+
99
+
100
@dataclass
class ResourceRequestByCount:
    """One resource bundle shape together with how many bundles share it."""

    # Bundles in the demand.
    bundle: Dict[str, float]
    # Number of bundles with the same shape.
    count: int

    def __str__(self) -> str:
        """Render as e.g. "[3 {'CPU': 1.0}]"."""
        return "[{} {}]".format(self.count, self.bundle)
109
+
110
+
111
@dataclass
class ResourceDemand:
    """Base class for a set of demanded resource bundles grouped by shape."""

    # The bundles in the demand with shape and count info.
    bundles_by_count: List[ResourceRequestByCount]
115
+
116
+
117
@dataclass
class PlacementGroupResourceDemand(ResourceDemand):
    """Resource demand from a placement group; parses its `details` string of
    the form "<pg_id>:<strategy>|<state>" into structured fields."""

    # Details string (parsed into below information)
    details: str
    # Placement group's id.
    pg_id: Optional[str] = None
    # Strategy, e.g. STRICT_SPREAD
    strategy: Optional[str] = None
    # Placement group's state, e.g. PENDING
    state: Optional[str] = None

    def __post_init__(self):
        """Best-effort parse of `details`; leaves fields None on mismatch."""
        if not self.details:
            return

        # Details in the format of <pg_id>:<strategy>|<state>, parse
        # it into the above fields.
        pattern = r"^.*:.*\|.*$"
        match = re.match(pattern, self.details)
        if not match:
            return

        # Split only on the first separator: a details string containing extra
        # ':' or '|' characters would otherwise raise ValueError on unpacking.
        pg_id, rest = self.details.split(":", 1)
        strategy, state = rest.split("|", 1)
        self.pg_id = pg_id
        self.strategy = strategy
        self.state = state
144
+
145
+
146
@dataclass
class RayTaskActorDemand(ResourceDemand):
    """Marker subclass of ResourceDemand for Ray task/actor demand."""

    pass
149
+
150
+
151
@dataclass
class ClusterConstraintDemand(ResourceDemand):
    """Marker subclass of ResourceDemand for cluster resource constraints."""

    pass
154
+
155
+
156
@dataclass
class ResourceDemandSummary:
    """Bucketed summary of all outstanding resource demand sources."""

    # Placement group demand.
    placement_group_demand: List[PlacementGroupResourceDemand] = field(
        default_factory=list
    )
    # Ray task actor demand.
    ray_task_actor_demand: List[RayTaskActorDemand] = field(default_factory=list)
    # Cluster constraint demand.
    cluster_constraint_demand: List[ClusterConstraintDemand] = field(
        default_factory=list
    )
168
+
169
+
170
@dataclass
class Stats:
    """Timing/version metadata about how the cluster status was gathered."""

    # How long it took to get the GCS request.
    # This is required when initializing the Stats since it should be calculated before
    # the request was made.
    gcs_request_time_s: float
    # How long it took to get all live instances from node provider.
    none_terminated_node_request_time_s: Optional[float] = None
    # How long for autoscaler to process the scaling decision.
    autoscaler_iteration_time_s: Optional[float] = None
    # The last seen autoscaler state version from Ray.
    autoscaler_version: Optional[str] = None
    # The last seen cluster state resource version.
    cluster_resource_state_version: Optional[str] = None
    # Request made time unix timestamp: when the data was pulled from GCS.
    request_ts_s: Optional[int] = None
186
+
187
+
188
@dataclass
class ClusterStatus:
    """Aggregated, report-friendly view of the autoscaler's cluster state."""

    # Healthy nodes information (non-idle)
    active_nodes: List[NodeInfo] = field(default_factory=list)
    # Idle node information
    idle_nodes: List[NodeInfo] = field(default_factory=list)
    # Pending launches.
    pending_launches: List[LaunchRequest] = field(default_factory=list)
    # Failed launches.
    failed_launches: List[LaunchRequest] = field(default_factory=list)
    # Pending nodes.
    pending_nodes: List[NodeInfo] = field(default_factory=list)
    # Failures
    failed_nodes: List[NodeInfo] = field(default_factory=list)
    # Resource usage summary for entire cluster.
    cluster_resource_usage: List[ResourceUsage] = field(default_factory=list)
    # Demand summary.
    resource_demands: ResourceDemandSummary = field(
        default_factory=ResourceDemandSummary
    )
    # Query metrics. NOTE: Stats requires gcs_request_time_s, so a bare
    # `default_factory=Stats` would raise TypeError whenever the default is
    # actually used; default to a zeroed Stats instead.
    stats: Stats = field(default_factory=lambda: Stats(gcs_request_time_s=0.0))

    def total_resources(self) -> Dict[str, float]:
        """Return {resource_name: total} across the whole cluster."""
        return {r.resource_name: r.total for r in self.cluster_resource_usage}

    def available_resources(self) -> Dict[str, float]:
        """Return {resource_name: total - used} across the whole cluster."""
        return {r.resource_name: r.total - r.used for r in self.cluster_resource_usage}

    # TODO(rickyx): we don't show infeasible requests as of now.
    # (They will just be pending forever as part of the demands)
    # We should show them properly in the future.
220
+
221
+
222
+ @dataclass
223
+ class AutoscalerInstance:
224
+ """
225
+ AutoscalerInstance represents an instance that's managed by the autoscaler.
226
+ This includes two states:
227
+ 1. the instance manager state: information of the underlying cloud instance.
228
+ 2. the ray node state, e.g. resources, ray node status.
229
+
230
+ The two states are linked by the cloud instance id, which should be set
231
+ when the ray node is started.
232
+ """
233
+
234
+ # The cloud instance id. It could be None if the instance hasn't been assigned
235
+ # a cloud instance id, e.g. the instance is still in QUEUED or REQUESTED status.
236
+ cloud_instance_id: Optional[str] = None
237
+
238
+ # The ray node state status. It could be None when no ray node is running
239
+ # or has run on the cloud instance: for example, ray is still being installed
240
+ # or the instance manager hasn't had a cloud instance assigned (e.g. QUEUED,
241
+ # REQUESTED).
242
+ ray_node: Optional[NodeState] = None
243
+
244
+ # The instance manager instance state. It would be None when the ray_node is not
245
+ # None.
246
+ # It could be None iff:
247
+ # 1. There's a ray node, but the instance manager hasn't discovered the
248
+ # cloud instance that's running this ray process yet. This could happen since
249
+ # the instance manager only discovers instances periodically.
250
+ #
251
+ # 2. There was a ray node running on the cloud instance, which was already stopped
252
+ # and removed from the instance manager state. But the ray state is still lagging
253
+ # behind.
254
+ #
255
+ # 3. There is a ray node that's unmanaged by the instance manager.
256
+ #
257
+ im_instance: Optional[Instance] = None
258
+
259
+ # | cloud_instance_id | ray_node | im_instance |
260
+ # |-------------------|----------|-------------|
261
+ # | None | None | None | Not possible.
262
+ # | None | None | not None | OK. An instance hasn't had ray running on it yet. # noqa E501
263
+ # | None | Not None | None | OK. Possible if the ray node is not started by autoscaler. # noqa E501
264
+ # | None | Not None | not None | Not possible - no way to link im instance with ray node. # noqa E501
265
+ # | not None | None | None | Not possible since cloud instance id is either part of im state or ray node. # noqa E501
266
+ # | not None | None | not None | OK. e.g. An instance that's not running ray yet. # noqa E501
267
+ # | not None | Not None | None | OK. See scenario 1, 2, 3 above.
268
+ # | not None | Not None | not None | OK. An instance that's running ray.
269
+ def validate(self) -> Tuple[bool, str]:
270
+ """Validate the autoscaler instance state.
271
+
272
+ Returns:
273
+ A tuple of (valid, error_msg) where:
274
+ - valid is whether the state is valid
275
+ - error_msg is the error message for the validation results.
276
+ """
277
+
278
+ state_combinations = {
279
+ # (cloud_instance_id is None, ray_node is None, im_instance is None): (valid, error_msg) # noqa E501
280
+ (True, True, True): (False, "Not possible"),
281
+ (True, True, False): (True, ""),
282
+ (True, False, True): (
283
+ True,
284
+ "There's a ray node w/o cloud instance id, must be started not "
285
+ "by autoscaler",
286
+ ),
287
+ (True, False, False): (
288
+ False,
289
+ "Not possible - no way to link im instance with ray node",
290
+ ),
291
+ (False, True, True): (
292
+ False,
293
+ "Not possible since cloud instance id is either part of "
294
+ "im state or ray node",
295
+ ),
296
+ (False, True, False): (True, ""),
297
+ (False, False, True): (True, ""),
298
+ (False, False, False): (True, ""),
299
+ }
300
+
301
+ valid, error_msg = state_combinations[
302
+ (
303
+ self.cloud_instance_id is None,
304
+ self.ray_node is None,
305
+ self.im_instance is None,
306
+ )
307
+ ]
308
+ if not valid:
309
+ return valid, error_msg
310
+
311
+ if self.im_instance is not None and self.ray_node is None:
312
+ # We don't see a ray node, but tracking an im instance.
313
+ if self.cloud_instance_id is None:
314
+ if InstanceUtil.is_cloud_instance_allocated(self.im_instance.status):
315
+ return (
316
+ False,
317
+ "instance should be in a status where cloud instance "
318
+ "is not allocated.",
319
+ )
320
+ else:
321
+ if not InstanceUtil.is_cloud_instance_allocated(
322
+ self.im_instance.status
323
+ ):
324
+ return (
325
+ False,
326
+ "instance should be in a status where cloud instance is "
327
+ "allocated.",
328
+ )
329
+
330
+ if self.ray_node is not None:
331
+ if self.cloud_instance_id != self.ray_node.instance_id:
332
+ return False, "cloud instance id doesn't match."
333
+
334
+ if self.im_instance is not None and self.cloud_instance_id is not None:
335
+ if self.cloud_instance_id != self.im_instance.cloud_instance_id:
336
+ return False, "cloud instance id doesn't match."
337
+
338
+ return True, ""
339
+
340
+ def is_ray_running(self) -> bool:
341
+ """Whether the ray node is running."""
342
+ return self.ray_node is not None and self.ray_node.status in [
343
+ NodeStatus.RUNNING,
344
+ NodeStatus.IDLE,
345
+ ]
346
+
347
+ def is_ray_stop(self) -> bool:
348
+ """Whether the ray node is stopped."""
349
+ return self.ray_node is None or self.ray_node.status in [
350
+ NodeStatus.DEAD,
351
+ ]