diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84801d381d19794017ab6f7f5cabb93f753c0c43 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a80bfc78b8f22f90af70d69def117b5cbe14383d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20c990748ad577c364cf0adf53f9965ef427650b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6e1ffeee02c098dc1b5e4601e9e32e5eb44df0b2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py new file mode 100644 index 0000000000000000000000000000000000000000..464b170ddb646089f84a2bd5b3a053ea11e4d601 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py @@ -0,0 +1,116 @@ +import logging +import os +import stat + +from ray.autoscaler._private.aliyun.utils import AcsClient + +# instance status +PENDING = "Pending" +RUNNING = "Running" +STARTING = "Starting" +STOPPING = "Stopping" +STOPPED = "Stopped" + +logger = logging.getLogger(__name__) + + +def bootstrap_aliyun(config): + # print(config["provider"]) + # create vpc + _get_or_create_vpc(config) + # create security group id + _get_or_create_security_group(config) + # create vswitch + _get_or_create_vswitch(config) + # create key pair + _get_or_import_key_pair(config) + # print(config["provider"]) + return config + + +def _client(config): + return AcsClient( + access_key=config["provider"].get("access_key"), + access_key_secret=config["provider"].get("access_key_secret"), + region_id=config["provider"]["region"], + max_retries=1, + ) + + +def _get_or_create_security_group(config): + cli = _client(config) + security_groups = cli.describe_security_groups(vpc_id=config["provider"]["vpc_id"]) + if security_groups is not None and len(security_groups) > 0: + config["provider"]["security_group_id"] = security_groups[0]["SecurityGroupId"] + return config + + security_group_id = cli.create_security_group(vpc_id=config["provider"]["vpc_id"]) + + for rule in config["provider"].get("security_group_rule", {}): + cli.authorize_security_group( + security_group_id=security_group_id, + 
port_range=rule["port_range"], + source_cidr_ip=rule["source_cidr_ip"], + ip_protocol=rule["ip_protocol"], + ) + config["provider"]["security_group_id"] = security_group_id + return + + +def _get_or_create_vpc(config): + cli = _client(config) + vpcs = cli.describe_vpcs() + if vpcs is not None and len(vpcs) > 0: + config["provider"]["vpc_id"] = vpcs[0].get("VpcId") + return + + vpc_id = cli.create_vpc() + if vpc_id is not None: + config["provider"]["vpc_id"] = vpc_id + + +def _get_or_create_vswitch(config): + cli = _client(config) + vswitches = cli.describe_v_switches(vpc_id=config["provider"]["vpc_id"]) + if vswitches is not None and len(vswitches) > 0: + config["provider"]["v_switch_id"] = vswitches[0].get("VSwitchId") + return + + v_switch_id = cli.create_v_switch( + vpc_id=config["provider"]["vpc_id"], + zone_id=config["provider"]["zone_id"], + cidr_block=config["provider"]["cidr_block"], + ) + + if v_switch_id is not None: + config["provider"]["v_switch_id"] = v_switch_id + + +def _get_or_import_key_pair(config): + cli = _client(config) + key_name = config["provider"].get("key_name", "ray") + key_path = os.path.expanduser("~/.ssh/{}".format(key_name)) + keypairs = cli.describe_key_pairs(key_pair_name=key_name) + + if keypairs is not None and len(keypairs) > 0: + if "ssh_private_key" not in config["auth"]: + logger.info( + "{} keypair exists, use {} as local ssh key".format(key_name, key_path) + ) + config["auth"]["ssh_private_key"] = key_path + else: + if "ssh_private_key" not in config["auth"]: + # create new keypair + resp = cli.create_key_pair(key_pair_name=key_name) + if resp is not None: + with open(key_path, "w+") as f: + f.write(resp.get("PrivateKeyBody")) + os.chmod(key_path, stat.S_IRUSR) + config["auth"]["ssh_private_key"] = key_path + else: + public_key_file = config["auth"]["ssh_private_key"] + ".pub" + # create new keypair, from local file + with open(public_key_file) as f: + public_key = f.readline().strip("\n") + 
cli.import_key_pair(key_pair_name=key_name, public_key_body=public_key) + return diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..e9b65a07936f9a1aaf18a9763e2263d542738809 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py @@ -0,0 +1,324 @@ +import logging +import random +import threading +import time +from collections import defaultdict +from typing import Any, Dict, List, Optional + +from ray.autoscaler._private.aliyun.config import ( + PENDING, + RUNNING, + STOPPED, + STOPPING, + bootstrap_aliyun, +) +from ray.autoscaler._private.aliyun.utils import AcsClient +from ray.autoscaler._private.cli_logger import cli_logger +from ray.autoscaler._private.constants import BOTO_MAX_RETRIES +from ray.autoscaler._private.log_timer import LogTimer +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + TAG_RAY_CLUSTER_NAME, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) + +logger = logging.getLogger(__name__) + +TAG_BATCH_DELAY = 1 +STOPPING_NODE_DELAY = 1 + + +class AliyunNodeProvider(NodeProvider): + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) + self.acs = AcsClient( + access_key=provider_config["access_key"], + access_key_secret=provider_config["access_key_secret"], + region_id=provider_config["region"], + max_retries=BOTO_MAX_RETRIES, + ) + + # Try availability zones round-robin, starting from random offset + self.subnet_idx = random.randint(0, 100) + + # Tags that we believe to actually be on the node. + self.tag_cache = {} + # Tags that we will soon upload. 
+ self.tag_cache_pending = defaultdict(dict) + # Number of threads waiting for a batched tag update. + self.batch_thread_count = 0 + self.batch_update_done = threading.Event() + self.batch_update_done.set() + self.ready_for_new_batch = threading.Event() + self.ready_for_new_batch.set() + self.tag_cache_lock = threading.Lock() + self.count_lock = threading.Lock() + + # Cache of node objects from the last nodes() call. This avoids + # excessive DescribeInstances requests. + self.cached_nodes = {} + + def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]: + tags = [ + { + "Key": TAG_RAY_CLUSTER_NAME, + "Value": self.cluster_name, + }, + ] + for k, v in tag_filters.items(): + tags.append( + { + "Key": k, + "Value": v, + } + ) + + instances = self.acs.describe_instances(tags=tags) + non_terminated_instance = [] + for instance in instances: + if instance.get("Status") == RUNNING or instance.get("Status") == PENDING: + non_terminated_instance.append(instance.get("InstanceId")) + self.cached_nodes[instance.get("InstanceId")] = instance + return non_terminated_instance + + def is_running(self, node_id: str) -> bool: + instances = self.acs.describe_instances(instance_ids=[node_id]) + if instances is not None: + instance = instances[0] + return instance.get("Status") == "Running" + cli_logger.error("Invalid node id: %s", node_id) + return False + + def is_terminated(self, node_id: str) -> bool: + instances = self.acs.describe_instances(instance_ids=[node_id]) + if instances is not None: + assert len(instances) == 1 + instance = instances[0] + return instance.get("Status") == "Stopped" + cli_logger.error("Invalid node id: %s", node_id) + return False + + def node_tags(self, node_id: str) -> Dict[str, str]: + instances = self.acs.describe_instances(instance_ids=[node_id]) + if instances is not None: + assert len(instances) == 1 + instance = instances[0] + if instance.get("Tags") is not None: + node_tags = dict() + for tag in instance.get("Tags").get("Tag"): + 
node_tags[tag.get("TagKey")] = tag.get("TagValue") + return node_tags + return dict() + + def external_ip(self, node_id: str) -> str: + while True: + instances = self.acs.describe_instances(instance_ids=[node_id]) + if instances is not None: + assert len(instances) + instance = instances[0] + if ( + instance.get("PublicIpAddress") is not None + and instance.get("PublicIpAddress").get("IpAddress") is not None + ): + if len(instance.get("PublicIpAddress").get("IpAddress")) > 0: + return instance.get("PublicIpAddress").get("IpAddress")[0] + cli_logger.error("PublicIpAddress attribute is not exist. %s" % instance) + time.sleep(STOPPING_NODE_DELAY) + + def internal_ip(self, node_id: str) -> str: + while True: + instances = self.acs.describe_instances(instance_ids=[node_id]) + if instances is not None: + assert len(instances) == 1 + instance = instances[0] + if ( + instance.get("VpcAttributes") is not None + and instance.get("VpcAttributes").get("PrivateIpAddress") + is not None + and len( + instance.get("VpcAttributes") + .get("PrivateIpAddress") + .get("IpAddress") + ) + > 0 + ): + return ( + instance.get("VpcAttributes") + .get("PrivateIpAddress") + .get("IpAddress")[0] + ) + cli_logger.error("InnerIpAddress attribute is not exist. 
%s" % instance) + time.sleep(STOPPING_NODE_DELAY) + + def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None: + is_batching_thread = False + with self.tag_cache_lock: + if not self.tag_cache_pending: + is_batching_thread = True + # Wait for threads in the last batch to exit + self.ready_for_new_batch.wait() + self.ready_for_new_batch.clear() + self.batch_update_done.clear() + self.tag_cache_pending[node_id].update(tags) + + if is_batching_thread: + time.sleep(TAG_BATCH_DELAY) + with self.tag_cache_lock: + self._update_node_tags() + self.batch_update_done.set() + + with self.count_lock: + self.batch_thread_count += 1 + self.batch_update_done.wait() + + with self.count_lock: + self.batch_thread_count -= 1 + if self.batch_thread_count == 0: + self.ready_for_new_batch.set() + + def _update_node_tags(self): + batch_updates = defaultdict(list) + + for node_id, tags in self.tag_cache_pending.items(): + for x in tags.items(): + batch_updates[x].append(node_id) + self.tag_cache[node_id] = tags + + self.tag_cache_pending = defaultdict(dict) + + self._create_tags(batch_updates) + + def _create_tags(self, batch_updates): + + for (k, v), node_ids in batch_updates.items(): + m = "Set tag {}={} on {}".format(k, v, node_ids) + with LogTimer("AliyunNodeProvider: {}".format(m)): + if k == TAG_RAY_NODE_NAME: + k = "Name" + + self.acs.tag_resource(node_ids, [{"Key": k, "Value": v}]) + + def create_node( + self, node_config: Dict[str, Any], tags: Dict[str, str], count: int + ) -> Optional[Dict[str, Any]]: + filter_tags = [ + { + "Key": TAG_RAY_CLUSTER_NAME, + "Value": self.cluster_name, + }, + {"Key": TAG_RAY_NODE_KIND, "Value": tags[TAG_RAY_NODE_KIND]}, + {"Key": TAG_RAY_USER_NODE_TYPE, "Value": tags[TAG_RAY_USER_NODE_TYPE]}, + {"Key": TAG_RAY_LAUNCH_CONFIG, "Value": tags[TAG_RAY_LAUNCH_CONFIG]}, + {"Key": TAG_RAY_NODE_NAME, "Value": tags[TAG_RAY_NODE_NAME]}, + ] + + reused_nodes_dict = {} + if self.cache_stopped_nodes: + reuse_nodes_candidate = 
self.acs.describe_instances(tags=filter_tags) + if reuse_nodes_candidate: + with cli_logger.group("Stopping instances to reuse"): + reuse_node_ids = [] + for node in reuse_nodes_candidate: + node_id = node.get("InstanceId") + status = node.get("Status") + if status != STOPPING and status != STOPPED: + continue + if status == STOPPING: + # wait for node stopped + while ( + self.acs.describe_instances(instance_ids=[node_id])[ + 0 + ].get("Status") + == STOPPING + ): + logging.info("wait for %s stop" % node_id) + time.sleep(STOPPING_NODE_DELAY) + # logger.info("reuse %s" % node_id) + reuse_node_ids.append(node_id) + reused_nodes_dict[node.get("InstanceId")] = node + self.acs.start_instance(node_id) + self.tag_cache[node_id] = node.get("Tags") + self.set_node_tags(node_id, tags) + if len(reuse_node_ids) == count: + break + count -= len(reuse_node_ids) + + created_nodes_dict = {} + if count > 0: + filter_tags.append( + {"Key": TAG_RAY_NODE_STATUS, "Value": tags[TAG_RAY_NODE_STATUS]} + ) + instance_id_sets = self.acs.run_instances( + instance_type=node_config["InstanceType"], + image_id=node_config["ImageId"], + tags=filter_tags, + amount=count, + vswitch_id=self.provider_config["v_switch_id"], + security_group_id=self.provider_config["security_group_id"], + key_pair_name=self.provider_config["key_name"], + ) + instances = self.acs.describe_instances(instance_ids=instance_id_sets) + + if instances is not None: + for instance in instances: + created_nodes_dict[instance.get("InstanceId")] = instance + + all_created_nodes = reused_nodes_dict + all_created_nodes.update(created_nodes_dict) + return all_created_nodes + + def terminate_node(self, node_id: str) -> None: + logger.info("terminate node: %s" % node_id) + if self.cache_stopped_nodes: + logger.info( + "Stopping instance {} (to terminate instead, " + "set `cache_stopped_nodes: False` " + "under `provider` in the cluster configuration)" + ).format(node_id) + self.acs.stop_instance(node_id) + else: + 
self.acs.delete_instance(node_id) + + def terminate_nodes(self, node_ids: List[str]) -> None: + if not node_ids: + return + if self.cache_stopped_nodes: + logger.info( + "Stopping instances {} (to terminate instead, " + "set `cache_stopped_nodes: False` " + "under `provider` in the cluster configuration)".format(node_ids) + ) + + self.acs.stop_instances(node_ids) + else: + self.acs.delete_instances(node_ids) + + def _get_node(self, node_id): + """Refresh and get info for this node, updating the cache.""" + self.non_terminated_nodes({}) # Side effect: updates cache + + if node_id in self.cached_nodes: + return self.cached_nodes[node_id] + + # Node not in {pending, running} -- retry with a point query. This + # usually means the node was recently preempted or terminated. + matches = self.acs.describe_instances(instance_ids=[node_id]) + + assert len(matches) == 1, "Invalid instance id {}".format(node_id) + return matches[0] + + def _get_cached_node(self, node_id): + """Return node info from cache if possible, otherwise fetches it.""" + if node_id in self.cached_nodes: + return self.cached_nodes[node_id] + + return self._get_node(node_id) + + @staticmethod + def bootstrap_config(cluster_config): + return bootstrap_aliyun(cluster_config) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bb3c2710a7b73cd965f917d903c785a83bb1a3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py @@ -0,0 +1,461 @@ +import json +import logging + +from aliyunsdkcore import client +from aliyunsdkcore.acs_exception.exceptions import ClientException, ServerException +from aliyunsdkecs.request.v20140526.AllocatePublicIpAddressRequest import ( + AllocatePublicIpAddressRequest, +) +from aliyunsdkecs.request.v20140526.AuthorizeSecurityGroupRequest import ( + 
AuthorizeSecurityGroupRequest, +) +from aliyunsdkecs.request.v20140526.CreateInstanceRequest import CreateInstanceRequest +from aliyunsdkecs.request.v20140526.CreateKeyPairRequest import CreateKeyPairRequest +from aliyunsdkecs.request.v20140526.CreateSecurityGroupRequest import ( + CreateSecurityGroupRequest, +) +from aliyunsdkecs.request.v20140526.CreateVpcRequest import CreateVpcRequest +from aliyunsdkecs.request.v20140526.CreateVSwitchRequest import CreateVSwitchRequest +from aliyunsdkecs.request.v20140526.DeleteInstanceRequest import DeleteInstanceRequest +from aliyunsdkecs.request.v20140526.DeleteInstancesRequest import DeleteInstancesRequest +from aliyunsdkecs.request.v20140526.DeleteKeyPairsRequest import DeleteKeyPairsRequest +from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import ( + DescribeInstancesRequest, +) +from aliyunsdkecs.request.v20140526.DescribeKeyPairsRequest import ( + DescribeKeyPairsRequest, +) +from aliyunsdkecs.request.v20140526.DescribeSecurityGroupsRequest import ( + DescribeSecurityGroupsRequest, +) +from aliyunsdkecs.request.v20140526.DescribeVpcsRequest import DescribeVpcsRequest +from aliyunsdkecs.request.v20140526.DescribeVSwitchesRequest import ( + DescribeVSwitchesRequest, +) +from aliyunsdkecs.request.v20140526.ImportKeyPairRequest import ImportKeyPairRequest +from aliyunsdkecs.request.v20140526.RunInstancesRequest import RunInstancesRequest +from aliyunsdkecs.request.v20140526.StartInstanceRequest import StartInstanceRequest +from aliyunsdkecs.request.v20140526.StopInstanceRequest import StopInstanceRequest +from aliyunsdkecs.request.v20140526.StopInstancesRequest import StopInstancesRequest +from aliyunsdkecs.request.v20140526.TagResourcesRequest import TagResourcesRequest + + +class AcsClient: + """ + A wrapper around Aliyun SDK. We use this wrapper in aliyun node provider. + + Parameters: + access_key: The AccessKey ID of your aliyun account. + access_key_secret: The AccessKey secret of your aliyun account. 
+ region_id: A region is a geographic area where a data center resides. + Region_id is the ID of region (e.g., cn-hangzhou, + us-west-1, etc.) + max_retries: The maximum number of retries each connection. + """ + + def __init__(self, access_key, access_key_secret, region_id, max_retries): + self.cli = client.AcsClient( + ak=access_key, + secret=access_key_secret, + max_retry_time=max_retries, + region_id=region_id, + ) + + def describe_instances(self, tags=None, instance_ids=None): + """Query the details of one or more Elastic Compute Service (ECS) instances. + + :param tags: The tags of the instance. + :param instance_ids: The IDs of ECS instances + :return: ECS instance list + """ + request = DescribeInstancesRequest() + if tags is not None: + request.set_Tags(tags) + if instance_ids is not None: + request.set_InstanceIds(instance_ids) + response = self._send_request(request) + if response is not None: + instance_list = response.get("Instances").get("Instance") + return instance_list + return None + + def create_instance( + self, + instance_type, + image_id, + tags, + key_pair_name, + optimized="optimized", + instance_charge_type="PostPaid", + spot_strategy="SpotWithPriceLimit", + internet_charge_type="PayByTraffic", + internet_max_bandwidth_out=5, + ): + """Create a subscription or pay-as-you-go ECS instance. + + :param instance_type: The instance type of the ECS. + :param image_id: The ID of the image used to create the instance. + :param tags: The tags of the instance. + :param key_pair_name: The name of the key pair to be bound to + the instance. + :param optimized: Specifies whether the instance is I/O optimized + :param instance_charge_type: The billing method of the instance. + Default value: PostPaid. + :param spot_strategy: The preemption policy for the pay-as-you-go + instance. + :param internet_charge_type: The billing method for network usage. + Default value: PayByTraffic. + :param internet_max_bandwidth_out: The maximum inbound public + bandwidth. 
Unit: Mbit/s. + :return: The created instance ID. + """ + request = CreateInstanceRequest() + request.set_InstanceType(instance_type) + request.set_ImageId(image_id) + request.set_IoOptimized(optimized) + request.set_InstanceChargeType(instance_charge_type) + request.set_SpotStrategy(spot_strategy) + request.set_InternetChargeType(internet_charge_type) + request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out) + request.set_KeyPairName(key_pair_name) + request.set_Tags(tags) + + response = self._send_request(request) + if response is not None: + instance_id = response.get("InstanceId") + logging.info("instance %s created task submit successfully.", instance_id) + return instance_id + logging.error("instance created failed.") + return None + + def run_instances( + self, + instance_type, + image_id, + tags, + security_group_id, + vswitch_id, + key_pair_name, + amount=1, + optimized="optimized", + instance_charge_type="PostPaid", + spot_strategy="SpotWithPriceLimit", + internet_charge_type="PayByTraffic", + internet_max_bandwidth_out=1, + ): + """Create one or more pay-as-you-go or subscription + Elastic Compute Service (ECS) instances + + :param instance_type: The instance type of the ECS. + :param image_id: The ID of the image used to create the instance. + :param tags: The tags of the instance. + :param security_group_id: The ID of the security group to which to + assign the instance. Instances in the same + security group can communicate with + each other. + :param vswitch_id: The ID of the vSwitch to which to connect + the instance. + :param key_pair_name: The name of the key pair to be bound to + the instance. + :param amount: The number of instances that you want to create. + :param optimized: Specifies whether the instance is I/O optimized + :param instance_charge_type: The billing method of the instance. + Default value: PostPaid. + :param spot_strategy: The preemption policy for the pay-as-you-go + instance. 
+ :param internet_charge_type: The billing method for network usage. + Default value: PayByTraffic. + :param internet_max_bandwidth_out: The maximum inbound public + bandwidth. Unit: Mbit/s. + :return: The created instance IDs. + """ + request = RunInstancesRequest() + request.set_InstanceType(instance_type) + request.set_ImageId(image_id) + request.set_IoOptimized(optimized) + request.set_InstanceChargeType(instance_charge_type) + request.set_SpotStrategy(spot_strategy) + request.set_InternetChargeType(internet_charge_type) + request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out) + request.set_Tags(tags) + request.set_Amount(amount) + request.set_SecurityGroupId(security_group_id) + request.set_VSwitchId(vswitch_id) + request.set_KeyPairName(key_pair_name) + + response = self._send_request(request) + if response is not None: + instance_ids = response.get("InstanceIdSets").get("InstanceIdSet") + return instance_ids + logging.error("instance created failed.") + return None + + def create_security_group(self, vpc_id): + """Create a security group + + :param vpc_id: The ID of the VPC in which to create + the security group. + :return: The created security group ID. + """ + request = CreateSecurityGroupRequest() + request.set_VpcId(vpc_id) + response = self._send_request(request) + if response is not None: + security_group_id = response.get("SecurityGroupId") + return security_group_id + return None + + def describe_security_groups(self, vpc_id=None, tags=None): + """Query basic information of security groups. + + :param vpc_id: The ID of the VPC to which the security group belongs. + :param tags: The tags of the security group. + :return: Security group list. 
+ """ + request = DescribeSecurityGroupsRequest() + if vpc_id is not None: + request.set_VpcId(vpc_id) + if tags is not None: + request.set_Tags(tags) + response = self._send_request(request) + if response is not None: + security_groups = response.get("SecurityGroups").get("SecurityGroup") + return security_groups + logging.error("describe security group failed.") + return None + + def authorize_security_group( + self, ip_protocol, port_range, security_group_id, source_cidr_ip + ): + """Create an inbound security group rule. + + :param ip_protocol: The transport layer protocol. + :param port_range: The range of destination ports relevant to + the transport layer protocol. + :param security_group_id: The ID of the destination security group. + :param source_cidr_ip: The range of source IPv4 addresses. + CIDR blocks and IPv4 addresses are supported. + """ + request = AuthorizeSecurityGroupRequest() + request.set_IpProtocol(ip_protocol) + request.set_PortRange(port_range) + request.set_SecurityGroupId(security_group_id) + request.set_SourceCidrIp(source_cidr_ip) + self._send_request(request) + + def create_v_switch(self, vpc_id, zone_id, cidr_block): + """Create vSwitches to divide the VPC into one or more subnets + + :param vpc_id: The ID of the VPC to which the VSwitch belongs. + :param zone_id: The ID of the zone to which + the target VSwitch belongs. + :param cidr_block: The CIDR block of the VSwitch. + :return: + """ + request = CreateVSwitchRequest() + request.set_ZoneId(zone_id) + request.set_VpcId(vpc_id) + request.set_CidrBlock(cidr_block) + response = self._send_request(request) + if response is not None: + return response.get("VSwitchId") + else: + logging.error("create_v_switch vpc_id %s failed.", vpc_id) + return None + + def create_vpc(self): + """Creates a virtual private cloud (VPC). + + :return: The created VPC ID. 
+ """ + request = CreateVpcRequest() + response = self._send_request(request) + if response is not None: + return response.get("VpcId") + return None + + def describe_vpcs(self): + """Queries one or more VPCs in a region. + + :return: VPC list. + """ + request = DescribeVpcsRequest() + response = self._send_request(request) + if response is not None: + return response.get("Vpcs").get("Vpc") + return None + + def tag_resource(self, resource_ids, tags, resource_type="instance"): + """Create and bind tags to specified ECS resources. + + :param resource_ids: The IDs of N resources. + :param tags: The tags of the resource. + :param resource_type: The type of the resource. + """ + request = TagResourcesRequest() + request.set_Tags(tags) + request.set_ResourceType(resource_type) + request.set_ResourceIds(resource_ids) + response = self._send_request(request) + if response is not None: + logging.info("instance %s create tag successfully.", resource_ids) + else: + logging.error("instance %s create tag failed.", resource_ids) + + def start_instance(self, instance_id): + """Start an ECS instance. + + :param instance_id: The Ecs instance ID. + """ + request = StartInstanceRequest() + request.set_InstanceId(instance_id) + response = self._send_request(request) + + if response is not None: + logging.info("instance %s start successfully.", instance_id) + else: + logging.error("instance %s start failed.", instance_id) + + def stop_instance(self, instance_id, force_stop=False): + """Stop an ECS instance that is in the Running state. + + :param instance_id: The Ecs instance ID. + :param force_stop: Specifies whether to forcibly stop the instance. 
+ :return: + """ + request = StopInstanceRequest() + request.set_InstanceId(instance_id) + request.set_ForceStop(force_stop) + logging.info("Stop %s command submit successfully.", instance_id) + self._send_request(request) + + def stop_instances(self, instance_ids, stopped_mode="StopCharging"): + """Stop one or more ECS instances that are in the Running state. + + :param instance_ids: The IDs of instances. + :param stopped_mode: Specifies whether billing for the instance + continues after the instance is stopped. + """ + request = StopInstancesRequest() + request.set_InstanceIds(instance_ids) + request.set_StoppedMode(stopped_mode) + response = self._send_request(request) + if response is None: + logging.error("stop_instances failed") + + def delete_instance(self, instance_id): + """Release a pay-as-you-go instance or + an expired subscription instance. + + :param instance_id: The ID of the instance that you want to release. + """ + request = DeleteInstanceRequest() + request.set_InstanceId(instance_id) + request.set_Force(True) + logging.info("Delete %s command submit successfully", instance_id) + self._send_request(request) + + def delete_instances(self, instance_ids): + """Release one or more pay-as-you-go instances or + expired subscription instances. + + :param instance_ids: The IDs of instances that you want to release. + """ + request = DeleteInstancesRequest() + request.set_Force(True) + request.set_InstanceIds(instance_ids) + self._send_request(request) + + def allocate_public_address(self, instance_id): + """Assign a public IP address to an ECS instance. + + :param instance_id: The ID of the instance to which you want to + assign a public IP address. + :return: The assigned ip. + """ + request = AllocatePublicIpAddressRequest() + request.set_InstanceId(instance_id) + response = self._send_request(request) + if response is not None: + return response.get("IpAddress") + + def create_key_pair(self, key_pair_name): + """Create an SSH key pair. 
+ + :param key_pair_name: The name of the key pair. + :return: The created keypair data. + """ + request = CreateKeyPairRequest() + request.set_KeyPairName(key_pair_name) + response = self._send_request(request) + if response is not None: + logging.info("Create Key Pair %s Successfully", response.get("KeyPairId")) + return response + else: + logging.error("Create Key Pair Failed") + return None + + def import_key_pair(self, key_pair_name, public_key_body): + """Import the public key of an RSA-encrypted key pair + that is generated by a third-party tool. + + :param key_pair_name: The name of the key pair. + :param public_key_body: The public key of the key pair. + """ + request = ImportKeyPairRequest() + request.set_KeyPairName(key_pair_name) + request.set_PublicKeyBody(public_key_body) + self._send_request(request) + + def delete_key_pairs(self, key_pair_names): + """Delete one or more SSH key pairs. + + :param key_pair_names: The name of the key pair. + :return: + """ + request = DeleteKeyPairsRequest() + request.set_KeyPairNames(key_pair_names) + self._send_request(request) + + def describe_key_pairs(self, key_pair_name=None): + """Query one or more key pairs. + + :param key_pair_name: The name of the key pair. + :return: + """ + request = DescribeKeyPairsRequest() + if key_pair_name is not None: + request.set_KeyPairName(key_pair_name) + response = self._send_request(request) + if response is not None: + return response.get("KeyPairs").get("KeyPair") + else: + return None + + def describe_v_switches(self, vpc_id=None): + """Queries one or more VSwitches. + + :param vpc_id: The ID of the VPC to which the VSwitch belongs. + :return: VSwitch list. 
+ """ + request = DescribeVSwitchesRequest() + if vpc_id is not None: + request.set_VpcId(vpc_id) + response = self._send_request(request) + if response is not None: + return response.get("VSwitches").get("VSwitch") + else: + logging.error("Describe VSwitches Failed.") + return None + + def _send_request(self, request): + """send open api request""" + request.set_accept_format("json") + try: + response_str = self.cli.do_action_with_exception(request) + response_detail = json.loads(response_str) + return response_detail + except (ClientException, ServerException) as e: + logging.error(request.get_action_name()) + logging.error(e) + return None diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dcabd79b1f7482030329bb5df8fbab04ba232793 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..029e71f673fb06234dc24fc9c39a7da13cc2452d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a90d8b83c57fbab99d6ee4e9169e3da7c9893950 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09b4bd530a8756c5cfcb6222b6a80ada9961fbba Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b9ef23500c17a0027b8867070ddebd5ddf36a49 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1ad3fd2da69c1db2177fdf6de956b7794880da8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf01e550443370b8dd76d1a22bc736cd02bffee --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py @@ -0,0 +1,536 @@ +import datetime +import json +import logging +import os +from abc import ABC, abstractmethod +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple + +import requests + +from ray.autoscaler._private.constants import WORKER_LIVENESS_CHECK_KEY +from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType +from ray.autoscaler.batching_node_provider import ( + BatchingNodeProvider, + NodeData, + ScaleRequest, +) +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_WORKER, + STATUS_UP_TO_DATE, + STATUS_UPDATE_FAILED, + TAG_RAY_USER_NODE_TYPE, +) + +# Key for KubeRay label that identifies a Ray pod as head or worker. +KUBERAY_LABEL_KEY_KIND = "ray.io/node-type" +# Key for KubeRay label that identifies the worker group (autoscaler node type) of a +# Ray pod. +KUBERAY_LABEL_KEY_TYPE = "ray.io/group" + +# These should be synced with: +# https://github.com/ray-project/kuberay/blob/f2d94ffe213dd8f69481b09c474047cb899fa73b/ray-operator/apis/ray/v1/raycluster_types.go#L165-L171 # noqa +# Kind label value indicating the pod is the head. +KUBERAY_KIND_HEAD = "head" +# Kind label value indicating the pod is the worker. 
+KUBERAY_KIND_WORKER = "worker" + +# KubeRay CRD version +KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1") + +KUBERAY_REQUEST_TIMEOUT_S = int(os.getenv("KUBERAY_REQUEST_TIMEOUT_S", 60)) + +RAY_HEAD_POD_NAME = os.getenv("RAY_HEAD_POD_NAME") + +# https://kubernetes.io/docs/tasks/run-application/access-api-from-pod +# While running in a Pod, your container can create an HTTPS URL for the +# Kubernetes API server by fetching the KUBERNETES_SERVICE_HOST and +# KUBERNETES_SERVICE_PORT_HTTPS environment variables. +KUBERNETES_SERVICE_HOST = os.getenv( + "KUBERNETES_SERVICE_HOST", "https://kubernetes.default" +) +KUBERNETES_SERVICE_PORT = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443") +KUBERNETES_HOST = f"{KUBERNETES_SERVICE_HOST}:{KUBERNETES_SERVICE_PORT}" +# Key for GKE label that identifies which multi-host replica a pod belongs to +REPLICA_INDEX_KEY = "replicaIndex" + +TOKEN_REFRESH_PERIOD = datetime.timedelta(minutes=1) + +# Design: + +# Each modification the autoscaler wants to make is posted to the API server goal state +# (e.g. if the autoscaler wants to scale up, it increases the number of +# replicas of the worker group it wants to scale, if it wants to scale down +# it decreases the number of replicas and adds the exact pods that should be +# terminated to the scaleStrategy). + +# KubeRayNodeProvider inherits from BatchingNodeProvider. +# Thus, the autoscaler's create and terminate requests are batched into a single +# Scale Request object which is submitted at the end of autoscaler update. +# KubeRay node provider converts the ScaleRequest into a RayCluster CR patch +# and applies the patch in the submit_scale_request method. + +# To reduce potential for race conditions, KubeRayNodeProvider +# aborts the autoscaler update if the operator has not yet processed workersToDelete - +# see KubeRayNodeProvider.safe_to_scale(). +# Once it is confirmed that workersToDelete have been cleaned up, KubeRayNodeProvider +# clears the workersToDelete list. 
+ + +# Note: Log handlers set up in autoscaling monitor entrypoint. +logger = logging.getLogger(__name__) + + +def node_data_from_pod(pod: Dict[str, Any]) -> NodeData: + """Converts a Ray pod extracted from K8s into Ray NodeData. + NodeData is processed by BatchingNodeProvider. + """ + kind, type = kind_and_type(pod) + status = status_tag(pod) + ip = pod_ip(pod) + replica_index = _replica_index_label(pod) + return NodeData( + kind=kind, type=type, replica_index=replica_index, status=status, ip=ip + ) + + +def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]: + """Determine Ray node kind (head or workers) and node type (worker group name) + from a Ray pod's labels. + """ + labels = pod["metadata"]["labels"] + kind = ( + NODE_KIND_HEAD + if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD + else NODE_KIND_WORKER + ) + type = labels[KUBERAY_LABEL_KEY_TYPE] + return kind, type + + +def _replica_index_label(pod: Dict[str, Any]) -> Optional[str]: + """Returns the replicaIndex label for a Pod in a multi-host TPU worker group. + The replicaIndex label is set by the GKE TPU Ray webhook and is of + the form {$WORKER_GROUP_NAME-$REPLICA_INDEX} where $REPLICA_INDEX + is an integer from 0 to Replicas-1. + """ + labels = pod["metadata"]["labels"] + return labels.get(REPLICA_INDEX_KEY, None) + + +def pod_ip(pod: Dict[str, Any]) -> NodeIP: + return pod["status"].get("podIP", "IP not yet assigned") + + +def status_tag(pod: Dict[str, Any]) -> NodeStatus: + """Convert pod state to Ray autoscaler node status. + + See the doc string of the class + batching_node_provider.NodeData for the semantics of node status. 
+ """ + if ( + "containerStatuses" not in pod["status"] + or not pod["status"]["containerStatuses"] + ): + return "pending" + + state = pod["status"]["containerStatuses"][0]["state"] + + if "pending" in state: + return "pending" + if "running" in state: + return STATUS_UP_TO_DATE + if "waiting" in state: + return "waiting" + if "terminated" in state: + return STATUS_UPDATE_FAILED + raise ValueError("Unexpected container state.") + + +def worker_delete_patch(group_index: str, workers_to_delete: List[NodeID]): + path = f"/spec/workerGroupSpecs/{group_index}/scaleStrategy" + value = {"workersToDelete": workers_to_delete} + return replace_patch(path, value) + + +def worker_replica_patch(group_index: str, target_replicas: int): + path = f"/spec/workerGroupSpecs/{group_index}/replicas" + value = target_replicas + return replace_patch(path, value) + + +def replace_patch(path: str, value: Any) -> Dict[str, Any]: + return {"op": "replace", "path": path, "value": value} + + +def load_k8s_secrets() -> Tuple[Dict[str, str], str]: + """ + Loads secrets needed to access K8s resources. + + Returns: + headers: Headers with K8s access token + verify: Path to certificate + """ + with open("/var/run/secrets/kubernetes.io/serviceaccount/token") as secret: + token = secret.read() + + headers = { + "Authorization": "Bearer " + token, + } + verify = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + return headers, verify + + +def url_from_resource( + namespace: str, + path: str, + kuberay_crd_version: str = KUBERAY_CRD_VER, + kubernetes_host: str = KUBERNETES_HOST, +) -> str: + """Convert resource path to REST URL for Kubernetes API server. + + Args: + namespace: The K8s namespace of the resource + path: The part of the resource path that starts with the resource type. + Supported resource types are "pods" and "rayclusters". + kuberay_crd_version: The API version of the KubeRay CRD. + Looks like "v1alpha1", "v1". + kubernetes_host: The host of the Kubernetes API server. 
+ Uses $KUBERNETES_SERVICE_HOST and + $KUBERNETES_SERVICE_PORT to construct the kubernetes_host if not provided. + + When set by Kubernetes, + $KUBERNETES_SERVICE_HOST could be an IP address. That's why the https + scheme is added here. + + Defaults to "https://kubernetes.default:443". + """ + if kubernetes_host.startswith("http://"): + raise ValueError("Kubernetes host must be accessed over HTTPS.") + if not kubernetes_host.startswith("https://"): + kubernetes_host = "https://" + kubernetes_host + if path.startswith("pods"): + api_group = "/api/v1" + elif path.startswith("rayclusters"): + api_group = "/apis/ray.io/" + kuberay_crd_version + else: + raise NotImplementedError("Tried to access unknown entity at {}".format(path)) + return kubernetes_host + api_group + "/namespaces/" + namespace + "/" + path + + +def _worker_group_index(raycluster: Dict[str, Any], group_name: str) -> int: + """Extract worker group index from RayCluster.""" + group_names = [ + spec["groupName"] for spec in raycluster["spec"].get("workerGroupSpecs", []) + ] + return group_names.index(group_name) + + +def _worker_group_max_replicas( + raycluster: Dict[str, Any], group_index: int +) -> Optional[int]: + """Extract the maxReplicas of a worker group. + + If maxReplicas is unset, return None, to be interpreted as "no constraint". + At time of writing, it should be impossible for maxReplicas to be unset, but it's + better to handle this anyway. + """ + return raycluster["spec"]["workerGroupSpecs"][group_index].get("maxReplicas") + + +def _worker_group_replicas(raycluster: Dict[str, Any], group_index: int): + # 1 is the default replicas value used by the KubeRay operator + return raycluster["spec"]["workerGroupSpecs"][group_index].get("replicas", 1) + + +class IKubernetesHttpApiClient(ABC): + """ + An interface for a Kubernetes HTTP API client. + + This interface could be used to mock the Kubernetes API client in tests. 
+ """ + + @abstractmethod + def get(self, path: str) -> Dict[str, Any]: + """Wrapper for REST GET of resource with proper headers.""" + pass + + @abstractmethod + def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]: + """Wrapper for REST PATCH of resource with proper headers.""" + pass + + +class KubernetesHttpApiClient(IKubernetesHttpApiClient): + def __init__(self, namespace: str, kuberay_crd_version: str = KUBERAY_CRD_VER): + self._kuberay_crd_version = kuberay_crd_version + self._namespace = namespace + self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD + self._headers, self._verify = None, None + + def _get_refreshed_headers_and_verify(self): + if (datetime.datetime.now() >= self._token_expires_at) or ( + self._headers is None or self._verify is None + ): + logger.info("Refreshing K8s API client token and certs.") + self._headers, self._verify = load_k8s_secrets() + self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD + return self._headers, self._verify + else: + return self._headers, self._verify + + def get(self, path: str) -> Dict[str, Any]: + """Wrapper for REST GET of resource with proper headers. + + Args: + path: The part of the resource path that starts with the resource type. + + Returns: + The JSON response of the GET request. + + Raises: + HTTPError: If the GET request fails. + """ + url = url_from_resource( + namespace=self._namespace, + path=path, + kuberay_crd_version=self._kuberay_crd_version, + ) + + headers, verify = self._get_refreshed_headers_and_verify() + result = requests.get( + url, + headers=headers, + timeout=KUBERAY_REQUEST_TIMEOUT_S, + verify=verify, + ) + if not result.status_code == 200: + result.raise_for_status() + return result.json() + + def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]: + """Wrapper for REST PATCH of resource with proper headers + + Args: + path: The part of the resource path that starts with the resource type. 
+ payload: The JSON patch payload. + + Returns: + The JSON response of the PATCH request. + + Raises: + HTTPError: If the PATCH request fails. + """ + url = url_from_resource( + namespace=self._namespace, + path=path, + kuberay_crd_version=self._kuberay_crd_version, + ) + headers, verify = self._get_refreshed_headers_and_verify() + result = requests.patch( + url, + json.dumps(payload), + headers={**headers, "Content-type": "application/json-patch+json"}, + verify=verify, + ) + if not result.status_code == 200: + result.raise_for_status() + return result.json() + + +class KubeRayNodeProvider(BatchingNodeProvider): # type: ignore + def __init__( + self, + provider_config: Dict[str, Any], + cluster_name: str, + ): + logger.info("Creating KubeRayNodeProvider.") + self.namespace = provider_config["namespace"] + self.cluster_name = cluster_name + + self.k8s_api_client = KubernetesHttpApiClient(self.namespace) + + assert ( + provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False + ), f"To use KubeRayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`." + BatchingNodeProvider.__init__(self, provider_config, cluster_name) + + def get_node_data(self) -> Dict[NodeID, NodeData]: + """Queries K8s for pods in the RayCluster. Converts that pod data into a + map of pod name to Ray NodeData, as required by BatchingNodeProvider. + """ + # Store the raycluster CR + self._raycluster = self._get(f"rayclusters/{self.cluster_name}") + + # Get the pods resource version. + # Specifying a resource version in list requests is important for scalability: + # https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list + resource_version = self._get_pods_resource_version() + if resource_version: + logger.info( + f"Listing pods for RayCluster {self.cluster_name}" + f" in namespace {self.namespace}" + f" at pods resource version >= {resource_version}." + ) + + # Filter pods by cluster_name. 
+ label_selector = requests.utils.quote(f"ray.io/cluster={self.cluster_name}") + + resource_path = f"pods?labelSelector={label_selector}" + if resource_version: + resource_path += ( + f"&resourceVersion={resource_version}" + + "&resourceVersionMatch=NotOlderThan" + ) + + pod_list = self._get(resource_path) + fetched_resource_version = pod_list["metadata"]["resourceVersion"] + logger.info( + f"Fetched pod data at resource version" f" {fetched_resource_version}." + ) + + # Extract node data from the pod list. + node_data_dict = {} + for pod in pod_list["items"]: + # Kubernetes sets metadata.deletionTimestamp immediately after admitting a + # request to delete an object. Full removal of the object may take some time + # after the deletion timestamp is set. See link for details: + # https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion + if "deletionTimestamp" in pod["metadata"]: + # Ignore pods marked for termination. + continue + pod_name = pod["metadata"]["name"] + node_data_dict[pod_name] = node_data_from_pod(pod) + return node_data_dict + + def submit_scale_request(self, scale_request: ScaleRequest): + """Converts the scale request generated by BatchingNodeProvider into + a patch that modifies the RayCluster CR's replicas and/or workersToDelete + fields. Then submits the patch to the K8s API server. + """ + # Transform the scale request into a patch payload. + patch_payload = self._scale_request_to_patch_payload( + scale_request, self._raycluster + ) + + # Submit the patch to K8s. + logger.info( + "Autoscaler is submitting the following patch to RayCluster " + f"{self.cluster_name} in namespace {self.namespace}." + ) + logger.info(patch_payload) + self._submit_raycluster_patch(patch_payload) + + def safe_to_scale(self) -> bool: + """Returns False iff non_terminated_nodes contains any pods in the RayCluster's + workersToDelete lists. 
+ + Explanation: + If there are any workersToDelete which are non-terminated, + we should wait for the operator to do its job and delete those + pods. Therefore, we back off the autoscaler update. + + If, on the other hand, all of the workersToDelete have already been cleaned up, + then we patch away the workersToDelete lists and return True. + In the future, we may consider having the operator clean up workersToDelete + on it own: + https://github.com/ray-project/kuberay/issues/733 + + Note (Dmitri): + It is stylistically bad that this function has a side effect. + """ + # Get the list of nodes. + node_set = set(self.node_data_dict.keys()) + worker_groups = self._raycluster["spec"].get("workerGroupSpecs", []) + + # Accumulates the indices of worker groups with non-empty workersToDelete + non_empty_worker_group_indices = [] + + for group_index, worker_group in enumerate(worker_groups): + workersToDelete = worker_group.get("scaleStrategy", {}).get( + "workersToDelete", [] + ) + if workersToDelete: + non_empty_worker_group_indices.append(group_index) + for worker in workersToDelete: + if worker in node_set: + # The operator hasn't removed this worker yet. Abort + # the autoscaler update. + logger.warning(f"Waiting for operator to remove worker {worker}.") + return False + + # All required workersToDelete have been removed. + # Clean up the workersToDelete field. + patch_payload = [] + for group_index in non_empty_worker_group_indices: + patch = worker_delete_patch(group_index, workers_to_delete=[]) + patch_payload.append(patch) + if patch_payload: + logger.info("Cleaning up workers to delete.") + logger.info(f"Submitting patch {patch_payload}.") + self._submit_raycluster_patch(patch_payload) + + # It's safe to proceed with the autoscaler update. + return True + + def _get_pods_resource_version(self) -> str: + """ + Extract a recent pods resource version by reading the head pod's + metadata.resourceVersion of the response. 
+ """ + if not RAY_HEAD_POD_NAME: + return None + pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}") + return pod_resp["metadata"]["resourceVersion"] + + def _scale_request_to_patch_payload( + self, scale_request: ScaleRequest, raycluster: Dict[str, Any] + ) -> List[Dict[str, Any]]: + """Converts autoscaler scale request into a RayCluster CR patch payload.""" + patch_payload = [] + # Collect patches for replica counts. + for node_type, target_replicas in scale_request.desired_num_workers.items(): + group_index = _worker_group_index(raycluster, node_type) + group_max_replicas = _worker_group_max_replicas(raycluster, group_index) + # Cap the replica count to maxReplicas. + if group_max_replicas is not None and group_max_replicas < target_replicas: + logger.warning( + "Autoscaler attempted to create " + + "more than maxReplicas pods of type {}.".format(node_type) + ) + target_replicas = group_max_replicas + # Check if we need to change the target count. + if target_replicas == _worker_group_replicas(raycluster, group_index): + # No patch required. + continue + # Need to patch replica count. Format the patch and add it to the payload. + patch = worker_replica_patch(group_index, target_replicas) + patch_payload.append(patch) + + # Maps node_type to nodes to delete for that group. 
+ deletion_groups = defaultdict(list) + for worker in scale_request.workers_to_delete: + node_type = self.node_tags(worker)[TAG_RAY_USER_NODE_TYPE] + deletion_groups[node_type].append(worker) + + for node_type, workers_to_delete in deletion_groups.items(): + group_index = _worker_group_index(raycluster, node_type) + patch = worker_delete_patch(group_index, workers_to_delete) + patch_payload.append(patch) + + return patch_payload + + def _submit_raycluster_patch(self, patch_payload: List[Dict[str, Any]]): + """Submits a patch to modify a RayCluster CR.""" + path = "rayclusters/{}".format(self.cluster_name) + self._patch(path, patch_payload) + + def _get(self, path: str) -> Dict[str, Any]: + """Wrapper for REST GET of resource with proper headers.""" + return self.k8s_api_client.get(path) + + def _patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]: + """Wrapper for REST PATCH of resource with proper headers.""" + return self.k8s_api_client.patch(path, payload) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..105c41bde3b2d899ce240ae0f9ed9af468ab6f48 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py @@ -0,0 +1,119 @@ +import logging +import os +import subprocess +import time + +import ray +from ray._private import ray_constants +from ray._private.ray_logging import setup_component_logger +from ray._private.services import get_node_ip_address +from ray._private.utils import try_to_create_directory +from ray._raylet import GcsClient +from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer +from ray.autoscaler._private.monitor import Monitor +from ray.autoscaler.v2.instance_manager.config import KubeRayConfigReader +from ray.autoscaler.v2.utils import 
is_autoscaler_v2 + +logger = logging.getLogger(__name__) + +BACKOFF_S = 5 + + +def _get_log_dir() -> str: + return os.path.join( + ray._private.utils.get_ray_temp_dir(), + ray._private.ray_constants.SESSION_LATEST, + "logs", + ) + + +def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str): + """Wait until the Ray head container is ready. Then start the autoscaler.""" + head_ip = get_node_ip_address() + ray_address = f"{head_ip}:6379" + while True: + try: + # Autoscaler Ray version might not exactly match GCS version, so skip the + # version check when checking GCS status. + subprocess.check_call( + [ + "ray", + "health-check", + "--address", + ray_address, + "--skip-version-check", + ] + ) + logger.info("The Ray head is ready. Starting the autoscaler.") + break + except subprocess.CalledProcessError: + logger.warning( + f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds." + ) + time.sleep(BACKOFF_S) + + # The Ray head container sets up the log directory. Thus, we set up logging + # only after the Ray head is ready. + _setup_logging() + + # autoscaling_config_producer reads the RayCluster CR from K8s and uses the CR + # to output an autoscaling config. + autoscaling_config_producer = AutoscalingConfigProducer( + cluster_name, cluster_namespace + ) + + gcs_client = GcsClient(ray_address) + if is_autoscaler_v2(fetch_from_server=True, gcs_client=gcs_client): + from ray.autoscaler.v2.monitor import AutoscalerMonitor as MonitorV2 + + MonitorV2( + address=gcs_client.address, + config_reader=KubeRayConfigReader(autoscaling_config_producer), + log_dir=_get_log_dir(), + monitor_ip=head_ip, + ).run() + else: + Monitor( + address=gcs_client.address, + # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`. + # In this case, it's a callable. + autoscaling_config=autoscaling_config_producer, + monitor_ip=head_ip, + # Let the autoscaler process exit after it hits 5 exceptions. 
+ # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.) + # Kubernetes will then restart the autoscaler container. + retry_on_failure=False, + ).run() + + +def _setup_logging() -> None: + """Log to autoscaler log file + (typically, /tmp/ray/session_latest/logs/monitor.*) + + Also log to pod stdout (logs viewable with `kubectl logs -c autoscaler`). + """ + log_dir = _get_log_dir() + # The director should already exist, but try (safely) to create it just in case. + try_to_create_directory(log_dir) + + # Write logs at info level to monitor.log. + setup_component_logger( + logging_level=ray_constants.LOGGER_LEVEL, + logging_format=ray_constants.LOGGER_FORMAT, + log_dir=log_dir, + filename=ray_constants.MONITOR_LOG_FILE_NAME, # monitor.log + max_bytes=ray_constants.LOGGING_ROTATE_BYTES, + backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + ) + + # For the autoscaler, the root logger _also_ needs to write to stderr, not just + # ray_constants.MONITOR_LOG_FILE_NAME. + level = logging.getLevelName(ray_constants.LOGGER_LEVEL.upper()) + stderr_handler = logging._StderrHandler() + stderr_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT)) + stderr_handler.setLevel(level) + logging.root.setLevel(level) + logging.root.addHandler(stderr_handler) + + # The stdout handler was set up in the Ray CLI entry point. + # See ray.scripts.scripts::cli(). 
diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1b045b422b66aae2e0f5c400c4347cae8fac8a6e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py @@ -0,0 +1,111 @@ +# Source: +# https://github.com/kubernetes-client/python/blob/master/kubernetes/utils/quantity.py +from decimal import Decimal, InvalidOperation +from functools import reduce +from typing import Optional + +# Mapping used to get generation for TPU-{accelerator}-head resource +# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run +gke_tpu_accelerator_to_generation = { + "tpu-v4-podslice": "v4", + "tpu-v5-lite-device": "v5e", + "tpu-v5-lite-podslice": "v5e", + "tpu-v5p-slice": "v5p", + "tpu-v6e-slice": "v6e", +} + + +def parse_quantity(quantity): + """ + Parse kubernetes canonical form quantity like 200Mi to a decimal number. + Supported SI suffixes: + base1024: Ki | Mi | Gi | Ti | Pi | Ei + base1000: n | u | m | "" | k | M | G | T | P | E + + See + https://github.com/kubernetes/apimachinery/blob/master/pkg/api/resource/quantity.go + + Input: + quantity: string. 
kubernetes canonical form quantity + + Returns: + Decimal + + Raises: + ValueError on invalid or unknown input + """ + if isinstance(quantity, (int, float, Decimal)): + return Decimal(quantity) + + exponents = { + "n": -3, + "u": -2, + "m": -1, + "K": 1, + "k": 1, + "M": 2, + "G": 3, + "T": 4, + "P": 5, + "E": 6, + } + + quantity = str(quantity) + number = quantity + suffix = None + if len(quantity) >= 2 and quantity[-1] == "i": + if quantity[-2] in exponents: + number = quantity[:-2] + suffix = quantity[-2:] + elif len(quantity) >= 1 and quantity[-1] in exponents: + number = quantity[:-1] + suffix = quantity[-1:] + + try: + number = Decimal(number) + except InvalidOperation: + raise ValueError("Invalid number format: {}".format(number)) + + if suffix is None: + return number + + if suffix.endswith("i"): + base = 1024 + elif len(suffix) == 1: + base = 1000 + else: + raise ValueError("{} has unknown suffix".format(quantity)) + + # handle SI inconsistency + if suffix == "ki": + raise ValueError("{} has unknown suffix".format(quantity)) + + if suffix[0] not in exponents: + raise ValueError("{} has unknown suffix".format(quantity)) + + exponent = Decimal(exponents[suffix[0]]) + return number * (base**exponent) + + +def tpu_node_selectors_to_type(topology: str, accelerator: str) -> Optional[str]: + """Convert Kubernetes gke-tpu nodeSelectors to TPU accelerator_type + for a kuberay TPU worker group. + Args: + topology: value of the cloud.google.com/gke-tpu-topology Kubernetes + nodeSelector, describes the physical topology of the TPU podslice. + accelerator: value of the cloud.google.com/gke-tpu-accelerator nodeSelector, + the name of the TPU accelerator, e.g. tpu-v4-podslice + Returns: + A string, accelerator_type, e.g. "v4-8". + """ + if topology and accelerator: + generation = gke_tpu_accelerator_to_generation[accelerator] + # Reduce e.g. 
"2x2x2" to 8 + chip_dimensions = [int(chip_count) for chip_count in topology.split("x")] + num_chips = reduce(lambda x, y: x * y, chip_dimensions) + default_num_cores_per_chip = 1 + if generation == "v4" or generation == "v5p": + default_num_cores_per_chip = 2 + num_cores = num_chips * default_num_cores_per_chip + return f"{generation}-{num_cores}" + return None diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..368b6ed77e232c50292c5b6080a1eae85fd59989 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..636fb05c0135eaf6f8652b34c3d9e33387eed44b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..692501bb760f28857d59d44d18202a483f356b7a --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py @@ -0,0 +1,80 @@ +from typing import List, Tuple + +from ray.autoscaler._private.util import format_readonly_node_type +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + STATUS_UP_TO_DATE, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) + + +class ReadOnlyNodeProvider(NodeProvider): + """A node provider that merely reports the current cluster state. + + This is used for laptop mode / manual cluster setup modes, in order to + provide status reporting in the same way for users.""" + + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + self.nodes = {} + + def is_readonly(self): + return True + + def _set_nodes(self, nodes: List[Tuple[str, str]]): + """Update the set of nodes in the cluster. + + Args: + nodes: List of (node_id, node_manager_address) tuples. + """ + new_nodes = {} + for node_id, node_manager_address in nodes: + # We make up a fake node type for each node (since each node + # could have its own unique configuration). 
+ new_nodes[node_id] = { + # Keep prefix in sync with node config gen in monitor.py + "node_type": format_readonly_node_type(node_id), + "ip": node_manager_address, + } + self.nodes = new_nodes + + def non_terminated_nodes(self, tag_filters): + return list(self.nodes.keys()) + + def is_running(self, node_id): + return node_id in self.nodes + + def is_terminated(self, node_id): + return node_id not in self.nodes + + def node_tags(self, node_id): + tags = { + TAG_RAY_NODE_KIND: NODE_KIND_HEAD, + TAG_RAY_USER_NODE_TYPE: self.nodes[node_id]["node_type"], + TAG_RAY_NODE_NAME: node_id, + TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, + } + return tags + + def external_ip(self, node_id): + return node_id + + def internal_ip(self, node_id): + return node_id + + def set_node_tags(self, node_id, tags): + raise AssertionError("Readonly node provider cannot be updated") + + def create_node(self, node_config, tags, count): + raise AssertionError("Readonly node provider cannot be updated") + + def terminate_node(self, node_id): + raise AssertionError("Readonly node provider cannot be updated") + + @staticmethod + def bootstrap_config(cluster_config): + return cluster_config diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a915ecdb4c5d4154a813d239853e4edc9f72dab Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml new file mode 100644 index 0000000000000000000000000000000000000000..98a503a9fbf979c398f872f801726a31fa66774c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml @@ -0,0 +1,15 @@ +# Prometheus config file + +# my global config +global: + scrape_interval: 10s + evaluation_interval: 10s + scrape_timeout: 10s + +# use ray file-based service discovery file as scrape target. +scrape_configs: +- job_name: 'ray' + file_sd_configs: + - files: + - '/tmp/ray/prom_metrics_service_discovery.json' + refresh_interval: 1m diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh new file mode 100644 index 0000000000000000000000000000000000000000..4494d5ece79081fc4b545b1ec4e9427703e96603 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +MAX_ATTEMPTS=120 +DELAY_SECONDS=10 +RAY_PROM_METRICS_FILE_PATH="/tmp/ray/prom_metrics_service_discovery.json" +CLUSTER_NAME=$1 +while [ $MAX_ATTEMPTS -gt 0 ]; do + if [ -f $RAY_PROM_METRICS_FILE_PATH ]; then + echo "Ray Prometheus metrics service discovery file found at: $RAY_PROM_METRICS_FILE_PATH." + echo "Restarting cloudwatch agent.This may take a few minutes..." + sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -m ec2 -a stop + echo "Cloudwatch agent stopped, starting cloudwatch agent..." + sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:AmazonCloudWatch-ray_agent_config_$CLUSTER_NAME" + echo "Cloudwatch agent successfully restarted!" + exit 0 + else + echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Will check again in $DELAY_SECONDS seconds..." 
+ sleep $DELAY_SECONDS + MAX_ATTEMPTS=$((MAX_ATTEMPTS-1)) + fi +done +echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Ray system metrics will not be available in CloudWatch." +exit 1 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fedf41d7df49b2883f6d3822553b8ef87eb14364 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml @@ -0,0 +1,144 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The maximum number of workers nodes to launch in addition to the head +# node. +max_workers: 2 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. +# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: {} + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. + # Nodes will be launched in the first listed availability zone and will + # be tried in the subsequent availability zones if launching fails. + availability_zone: us-west-2a,us-west-2b + # Whether to allow node reuse. If set to False, nodes will be terminated + # instead of stopped. + cache_stopped_nodes: True # If not present, the default is True. + +# How Ray will authenticate with newly launched nodes. 
+auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. +available_node_types: + ray.head.default: + # The node type's CPU and GPU resources are auto-detected based on AWS instance type. + # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler. + # You can also set custom resources. + # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set + # resources: {"CPU": 1, "GPU": 1, "custom": 5} + resources: {} + # Provider-specific config for this node type, e.g. instance type. By default + # Ray will auto-configure unspecified fields such as SubnetId and KeyName. + # For more documentation on available fields, see: + # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances + node_config: + InstanceType: m5.large + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 256 + # Additional options in the boto docs. + ray.worker.default: + # The minimum number of nodes of this type to launch. + # This number should be >= 0. + min_workers: 0 + # The node type's CPU and GPU resources are auto-detected based on AWS instance type. + # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler. + # You can also set custom resources. 
+ # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set + # resources: {"CPU": 1, "GPU": 1, "custom": 5} + resources: {} + # Provider-specific config for this node type, e.g. instance type. By default + # Ray will auto-configure unspecified fields such as SubnetId and KeyName. + # For more documentation on available fields, see: + # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances + node_config: + InstanceType: m5.large + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + # Additional options can be found in the boto docs, e.g. + # SpotOptions: + # MaxPrice: MAX_HOURLY_PRICE + # Additional options in the boto docs. + +# Specify the node type of the head node (as configured above). +head_node_type: ray.head.default + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. 
For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: + - >- + (stat $HOME/anaconda3/envs/tensorflow2_p38/ &> /dev/null && + echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_p38/bin:$PATH"' >> ~/.bashrc) || true + - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c63d2db24d4d33213043ba34139e3d539739bff1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..592a0f02e68140c9a2e14c520f52c77ccf10e882 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml @@ -0,0 +1,152 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The maximum number of workers nodes to launch in addition to the head +# node. +max_workers: 2 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. +# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty object means disabled. +docker: {} + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. 
+provider: + type: azure + # https://azure.microsoft.com/en-us/global-infrastructure/locations + location: westus2 + resource_group: ray-cluster + # set subscription id otherwise the default from az cli will be used + # subscription_id: 00000000-0000-0000-0000-000000000000 + # set unique subnet mask or a random mask will be used + # subnet_mask: 10.0.0.0/16 + # set unique id for resources in this cluster + # if not set a default id will be generated based on the resource group and cluster name + # unique_id: RAY1 + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu + # you must specify paths to matching private and public key pair files + # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair + ssh_private_key: ~/.ssh/id_rsa + # changes to this should match what is specified in file_mounts + ssh_public_key: ~/.ssh/id_rsa.pub + +# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file +# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines +# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs +# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. +available_node_types: + ray.head.default: + resources: {"CPU": 2} + # Provider-specific config, e.g. instance type. 
+ node_config: + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: latest + + ray.worker.default: + # The minimum number of nodes of this type to launch. + # This number should be >= 0. + min_workers: 0 + # The resources provided by this node type. + resources: {"CPU": 2} + # Provider-specific config, e.g. instance type. + node_config: + azure_arm_parameters: + vmSize: Standard_D2s_v3 + # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage + imagePublisher: microsoft-dsvm + imageOffer: ubuntu-1804 + imageSku: 1804-gen2 + imageVersion: latest + # comment lines below to not use Spot instances + priority: Spot + # set a maximum price for spot instances if desired + # billingProfile: + # maxPrice: -1 + +# Specify the node type of the head node (as configured above). +head_node_type: ray.head.default + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", + "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub" +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! 
+cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: + # get rid of annoying Ubuntu message + - touch ~/.sudo_as_admin_successful + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create an AMI that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). + - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true + # - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true + - (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true + - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl" + # Consider uncommenting these if you also want to run apt-get commands during setup + # - sudo pkill -9 apt-get || true + # - sudo pkill -9 dpkg || true + # - sudo dpkg --configure -a + +# Custom commands that will be run on the head node after common setup. 
+head_setup_commands: + - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4 + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dcad853f6a118f03a4e1c4198f11f3be281c5f9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py @@ -0,0 +1,29 @@ +from ray.autoscaler.sdk.sdk import ( + bootstrap_config, + configure_logging, + create_or_update_cluster, + fillout_defaults, + get_docker_host_mount_location, + get_head_node_ip, + get_worker_node_ips, + register_callback_handler, + request_resources, + rsync, + run_on_cluster, + teardown_cluster, +) + +__all__ = [ + "create_or_update_cluster", + "teardown_cluster", + "run_on_cluster", + "rsync", + "get_head_node_ip", + "get_worker_node_ips", + "request_resources", + "configure_logging", + "bootstrap_config", + "fillout_defaults", + "register_callback_handler", + "get_docker_host_mount_location", +] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..f75639380d167eaed65de3151320de632cfe6e2a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82bca71a3e8b9af6e7ce3dac12bd81c46c62a1da Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb07e947016d003067f76b16effbdae9f66d05c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py @@ -0,0 +1,343 @@ +"""IMPORTANT: this is an experimental interface and not currently stable.""" + +import json +import os +import tempfile +from contextlib import contextmanager +from typing import Any, Callable, Dict, Iterator, List, Optional, Union + +from ray.autoscaler._private import commands +from ray.autoscaler._private.cli_logger import cli_logger +from ray.autoscaler._private.event_system import CreateClusterEvent # noqa: F401 +from ray.autoscaler._private.event_system import global_event_system # noqa: F401 +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +def create_or_update_cluster( + cluster_config: Union[dict, str], + *, + no_restart: bool = False, + restart_only: bool = False, + no_config_cache: bool = False +) -> Dict[str, Any]: + """Create or updates an autoscaling Ray cluster from a config json. + + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. 
+ no_restart: Whether to skip restarting Ray services during the + update. This avoids interrupting running jobs and can be used to + dynamically adjust autoscaler configuration. + restart_only: Whether to skip running setup commands and only + restart Ray. This cannot be used with 'no-restart'. + no_config_cache: Whether to disable the config cache and fully + resolve all environment settings from the Cloud provider again. + """ + with _as_config_file(cluster_config) as config_file: + return commands.create_or_update_cluster( + config_file=config_file, + override_min_workers=None, + override_max_workers=None, + no_restart=no_restart, + restart_only=restart_only, + yes=True, + override_cluster_name=None, + no_config_cache=no_config_cache, + redirect_command_output=None, + use_login_shells=True, + ) + + +@DeveloperAPI +def teardown_cluster( + cluster_config: Union[dict, str], + workers_only: bool = False, + keep_min_workers: bool = False, +) -> None: + """Destroys all nodes of a Ray cluster described by a config json. + + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. + workers_only: Whether to keep the head node running and only + teardown worker nodes. + keep_min_workers: Whether to keep min_workers (as specified + in the YAML) still running. + """ + with _as_config_file(cluster_config) as config_file: + return commands.teardown_cluster( + config_file=config_file, + yes=True, + workers_only=workers_only, + override_cluster_name=None, + keep_min_workers=keep_min_workers, + ) + + +@DeveloperAPI +def run_on_cluster( + cluster_config: Union[dict, str], + *, + cmd: Optional[str] = None, + run_env: str = "auto", + tmux: bool = False, + stop: bool = False, + no_config_cache: bool = False, + port_forward: Optional[commands.Port_forward] = None, + with_output: bool = False +) -> Optional[str]: + """Runs a command on the specified cluster. 
+ + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. + cmd: the command to run, or None for a no-op command. + run_env: whether to run the command on the host or in a + container. Select between "auto", "host" and "docker". + tmux: whether to run in a tmux session + stop: whether to stop the cluster after command run + no_config_cache: Whether to disable the config cache and fully + resolve all environment settings from the Cloud provider again. + port_forward ( (int,int) or list[(int,int)]): port(s) to forward. + with_output: Whether to capture command output. + + Returns: + The output of the command as a string. + """ + with _as_config_file(cluster_config) as config_file: + return commands.exec_cluster( + config_file, + cmd=cmd, + run_env=run_env, + screen=False, + tmux=tmux, + stop=stop, + start=False, + override_cluster_name=None, + no_config_cache=no_config_cache, + port_forward=port_forward, + with_output=with_output, + ) + + +@DeveloperAPI +def rsync( + cluster_config: Union[dict, str], + *, + source: Optional[str], + target: Optional[str], + down: bool, + ip_address: Optional[str] = None, + use_internal_ip: bool = False, + no_config_cache: bool = False, + should_bootstrap: bool = True +): + """Rsyncs files to or from the cluster. + + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. + source: rsync source argument. + target: rsync target argument. + down: whether we're syncing remote -> local. + ip_address: Address of node. + use_internal_ip: Whether the provided ip_address is + public or private. + no_config_cache: Whether to disable the config cache and fully + resolve all environment settings from the Cloud provider again. + should_bootstrap: whether to bootstrap cluster config before syncing + + Raises: + RuntimeError if the cluster head node is not found. 
+ """ + with _as_config_file(cluster_config) as config_file: + return commands.rsync( + config_file=config_file, + source=source, + target=target, + override_cluster_name=None, + down=down, + ip_address=ip_address, + use_internal_ip=use_internal_ip, + no_config_cache=no_config_cache, + all_nodes=False, + should_bootstrap=should_bootstrap, + ) + + +@DeveloperAPI +def get_head_node_ip(cluster_config: Union[dict, str]) -> str: + """Returns head node IP for given configuration file if exists. + + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. + + Returns: + The ip address of the cluster head node. + + Raises: + RuntimeError if the cluster is not found. + """ + with _as_config_file(cluster_config) as config_file: + return commands.get_head_node_ip(config_file) + + +@DeveloperAPI +def get_worker_node_ips(cluster_config: Union[dict, str]) -> List[str]: + """Returns worker node IPs for given configuration file. + + Args: + cluster_config (Union[str, dict]): Either the config dict of the + cluster, or a path pointing to a file containing the config. + + Returns: + List of worker node ip addresses. + + Raises: + RuntimeError if the cluster is not found. + """ + with _as_config_file(cluster_config) as config_file: + return commands.get_worker_node_ips(config_file) + + +@DeveloperAPI +def request_resources( + num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None +) -> None: + """Command the autoscaler to scale to accommodate the specified requests. + + The cluster will immediately attempt to scale to accommodate the requested + resources, bypassing normal upscaling speed constraints. This takes into + account existing resource usage. + + For example, suppose you call ``request_resources(num_cpus=100)`` and + there are 45 currently running tasks, each requiring 1 CPU. Then, enough + nodes will be added so up to 100 tasks can run concurrently. 
It does + **not** add enough nodes so that 145 tasks can run. + + This call is only a hint to the autoscaler. The actual resulting cluster + size may be slightly larger or smaller than expected depending on the + internal bin packing algorithm and max worker count restrictions. + + Args: + num_cpus: Scale the cluster to ensure this number of CPUs are + available. This request is persistent until another call to + request_resources() is made to override. + bundles (List[ResourceDict]): Scale the cluster to ensure this set of + resource shapes can fit. This request is persistent until another + call to request_resources() is made to override. + + Examples: + >>> from ray.autoscaler.sdk import request_resources + >>> # Request 1000 CPUs. + >>> request_resources(num_cpus=1000) # doctest: +SKIP + >>> # Request 64 CPUs and also fit a 1-GPU/4-CPU task. + >>> request_resources( # doctest: +SKIP + ... num_cpus=64, bundles=[{"GPU": 1, "CPU": 4}]) + >>> # Same as requesting num_cpus=3. + >>> request_resources( # doctest: +SKIP + ... bundles=[{"CPU": 1}, {"CPU": 1}, {"CPU": 1}]) + """ + if num_cpus is not None and not isinstance(num_cpus, int): + raise TypeError("num_cpus should be of type int.") + if bundles is not None: + if isinstance(bundles, List): + for bundle in bundles: + if isinstance(bundle, Dict): + for key in bundle.keys(): + if not (isinstance(key, str) and isinstance(bundle[key], int)): + raise TypeError( + "each bundle key should be str and value as int." + ) + else: + raise TypeError("each bundle should be a Dict.") + else: + raise TypeError("bundles should be of type List") + + return commands.request_resources(num_cpus, bundles) + + +@DeveloperAPI +def configure_logging( + log_style: Optional[str] = None, + color_mode: Optional[str] = None, + verbosity: Optional[int] = None, +): + """Configures logging for cluster command calls. + + Args: + log_style: If 'pretty', outputs with formatting and color. + If 'record', outputs record-style without formatting. 
+ 'auto' defaults to 'pretty', and disables pretty logging + if stdin is *not* a TTY. Defaults to "auto". + color_mode (str): + Can be "true", "false", or "auto". + + Enables or disables `colorful`. + + If `color_mode` is "auto", is set to `not stdout.isatty()` + vebosity (int): + Output verbosity (0, 1, 2, 3). + + Low verbosity will disable `verbose` and `very_verbose` messages. + + """ + cli_logger.configure( + log_style=log_style, color_mode=color_mode, verbosity=verbosity + ) + + +@contextmanager +@DeveloperAPI +def _as_config_file(cluster_config: Union[dict, str]) -> Iterator[str]: + if isinstance(cluster_config, dict): + tmp = tempfile.NamedTemporaryFile("w", prefix="autoscaler-sdk-tmp-") + tmp.write(json.dumps(cluster_config)) + tmp.flush() + cluster_config = tmp.name + if not os.path.exists(cluster_config): + raise ValueError("Cluster config not found {}".format(cluster_config)) + yield cluster_config + + +@DeveloperAPI +def bootstrap_config( + cluster_config: Dict[str, Any], no_config_cache: bool = False +) -> Dict[str, Any]: + """Validate and add provider-specific fields to the config. For example, + IAM/authentication may be added here.""" + return commands._bootstrap_config(cluster_config, no_config_cache) + + +@DeveloperAPI +def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]: + """Fillout default values for a cluster_config based on the provider.""" + from ray.autoscaler._private.util import fillout_defaults + + return fillout_defaults(config) + + +@DeveloperAPI +def register_callback_handler( + event_name: str, + callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]], +) -> None: + """Registers a callback handler for autoscaler events. + + Args: + event_name: Event that callback should be called on. See + CreateClusterEvent for details on the events available to be + registered against. + callback: Callable object that is invoked + when specified event occurs. 
+ """ + global_event_system.add_callback_handler(event_name, callback) + + +@DeveloperAPI +def get_docker_host_mount_location(cluster_name: str) -> str: + """Return host path that Docker mounts attach to.""" + docker_mount_prefix = "/tmp/ray_tmp_mount/{cluster_name}" + return docker_mount_prefix.format(cluster_name=cluster_name) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b7e59f072800a65b4784cb5647067fe15fe3db --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py @@ -0,0 +1,201 @@ +import logging +from queue import Queue +from typing import List, Optional + +from ray._raylet import GcsClient +from ray.autoscaler._private.providers import _get_node_provider +from ray.autoscaler.v2.event_logger import AutoscalerEventLogger +from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import ( + KubeRayProvider, +) +from ray.autoscaler.v2.instance_manager.cloud_providers.read_only.cloud_provider import ( # noqa + ReadOnlyProvider, +) +from ray.autoscaler.v2.instance_manager.config import ( + AutoscalingConfig, + IConfigReader, + Provider, +) +from ray.autoscaler.v2.instance_manager.instance_manager import ( + InstanceManager, + InstanceUpdatedSubscriber, +) +from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage +from ray.autoscaler.v2.instance_manager.node_provider import ( + ICloudInstanceProvider, + NodeProviderAdapter, +) +from ray.autoscaler.v2.instance_manager.reconciler import Reconciler +from ray.autoscaler.v2.instance_manager.storage import InMemoryStorage +from 
class Autoscaler:
    """Top-level driver of the v2 autoscaler.

    Wires together the config reader, the cloud instance provider, the
    instance manager (with its update subscribers) and the resource demand
    scheduler, and exposes a single `update_autoscaling_state` entry point
    that reconciles desired vs. actual cluster state.
    """

    def __init__(
        self,
        session_name: str,
        config_reader: IConfigReader,
        gcs_client: GcsClient,
        event_logger: Optional[AutoscalerEventLogger] = None,
        metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
    ) -> None:
        """
        Args:
            session_name: The name of the ray session.
            config_reader: The config reader.
            gcs_client: The GCS client.
            event_logger: The event logger for emitting cluster events.
            metrics_reporter: The metrics reporter for emitting cluster metrics.
        """

        self._config_reader = config_reader

        # Read the cached config snapshot once for initialization; it is
        # refreshed from the source on every `update_autoscaling_state` call.
        config = config_reader.get_cached_autoscaling_config()
        logger.info(f"Using Autoscaling Config: \n{config.dump()}")

        self._gcs_client = gcs_client
        self._cloud_instance_provider = None
        self._instance_manager = None
        # Error queues drained each reconcile iteration. The stop-errors queue
        # is fed by the RayStopper subscriber; the install-errors queue is
        # currently never fed here (RayInstaller is not supported yet — see
        # _init_instance_manager).
        self._ray_stop_errors_queue = Queue()
        self._ray_install_errors_queue = Queue()
        self._event_logger = event_logger
        self._metrics_reporter = metrics_reporter

        # Order matters: the instance manager's subscribers need the cloud
        # provider, so the provider must be initialized first.
        self._init_cloud_instance_provider(config, config_reader)
        self._init_instance_manager(
            session_name=session_name,
            config=config,
            cloud_provider=self._cloud_instance_provider,
            gcs_client=self._gcs_client,
        )
        self._scheduler = ResourceDemandScheduler(self._event_logger)

    def _init_cloud_instance_provider(
        self, config: AutoscalingConfig, config_reader: IConfigReader
    ):
        """
        Initialize the cloud provider, and its dependencies (the v1 node provider)

        Args:
            config: The autoscaling config.
            config_reader: The config reader.

        """
        provider_config = config.get_provider_config()
        # NOTE(review): the kuberay branch keys off the raw provider-config
        # "type" string while the read-only branch keys off config.provider —
        # presumably equivalent dispatches; confirm before unifying.
        if provider_config["type"] == "kuberay":
            provider_config["head_node_type"] = config.get_head_node_type()
            self._cloud_instance_provider = KubeRayProvider(
                config.get_config("cluster_name"),
                provider_config,
            )
        elif config.provider == Provider.READ_ONLY:
            # The read-only provider observes the cluster through GCS rather
            # than a cloud API, so it needs the GCS address.
            provider_config["gcs_address"] = self._gcs_client.address
            self._cloud_instance_provider = ReadOnlyProvider(
                provider_config=provider_config,
            )
        else:
            # All other provider types are served by wrapping the v1 node
            # provider behind the v2 ICloudInstanceProvider interface.
            node_provider_v1 = _get_node_provider(
                provider_config,
                config.get_config("cluster_name"),
            )

            self._cloud_instance_provider = NodeProviderAdapter(
                v1_provider=node_provider_v1,
                config_reader=config_reader,
            )

    def _init_instance_manager(
        self,
        session_name: str,
        cloud_provider: ICloudInstanceProvider,
        gcs_client: GcsClient,
        config: AutoscalingConfig,
    ):
        """
        Initialize the instance manager, and its dependencies.
        """

        instance_storage = InstanceStorage(
            cluster_id=session_name,
            storage=InMemoryStorage(),
        )
        # Subscribers react to instance status updates: CloudInstanceUpdater
        # launches/terminates cloud instances, RayStopper stops ray processes
        # and reports failures through the stop-errors queue.
        subscribers: List[InstanceUpdatedSubscriber] = []
        subscribers.append(CloudInstanceUpdater(cloud_provider=cloud_provider))
        subscribers.append(
            RayStopper(gcs_client=gcs_client, error_queue=self._ray_stop_errors_queue)
        )
        if not config.disable_node_updaters():
            # Supporting ray installer is only needed for providers that doesn't
            # install or manage ray (e.g. AWS, GCP). These providers will be
            # supported in the future.
            raise NotImplementedError(
                "RayInstaller is not supported yet in current "
                "release of the Autoscaler V2. Therefore, providers "
                "that update nodes (with `disable_node_updaters` set to True) "
                "are not supported yet. Only KubeRay is supported for now which sets "
                "disable_node_updaters to True in provider's config."
            )

        self._instance_manager = InstanceManager(
            instance_storage=instance_storage,
            instance_status_update_subscribers=subscribers,
        )

    def update_autoscaling_state(
        self,
    ) -> Optional[AutoscalingState]:
        """
        Update the autoscaling state of the cluster by reconciling the current
        state of the cluster resources, the cloud providers as well as instance
        update subscribers with the desired state.

        Returns:
            AutoscalingState: The new autoscaling state of the cluster or None if
            the state is not updated.

        Raises:
            No exception.
        """

        try:
            # Drain asynchronously reported errors accumulated since the last
            # iteration so the reconciler can act on them.
            ray_stop_errors = []
            while not self._ray_stop_errors_queue.empty():
                ray_stop_errors.append(self._ray_stop_errors_queue.get())

            ray_install_errors = []
            while not self._ray_install_errors_queue.empty():
                ray_install_errors.append(self._ray_install_errors_queue.get())

            # Get the current state of the ray cluster resources.
            ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)

            # Refresh the config from the source
            self._config_reader.refresh_cached_autoscaling_config()
            autoscaling_config = self._config_reader.get_cached_autoscaling_config()

            return Reconciler.reconcile(
                instance_manager=self._instance_manager,
                scheduler=self._scheduler,
                cloud_provider=self._cloud_instance_provider,
                ray_cluster_resource_state=ray_cluster_resource_state,
                non_terminated_cloud_instances=(
                    self._cloud_instance_provider.get_non_terminated()
                ),
                cloud_provider_errors=self._cloud_instance_provider.poll_errors(),
                ray_install_errors=ray_install_errors,
                ray_stop_errors=ray_stop_errors,
                autoscaling_config=autoscaling_config,
            )
        except Exception as e:
            # Best-effort: the autoscaler loop must never crash the caller;
            # log and report no state update for this iteration.
            logger.exception(e)
            return None
class AutoscalerEventLogger:
    """
    Logs events related to the autoscaler.

    # TODO:
    - Add more logging for other events.
    - Rate limit the events if too spammy.
    """

    def __init__(self, logger: EventLoggerAdapter):
        """
        Args:
            logger: The event logger adapter that cluster events are
                emitted through.
        """
        self._logger = logger

    def log_cluster_scheduling_update(
        self,
        node_type_configs: Dict[NodeType, NodeTypeConfig],
        cluster_shape: Dict[NodeType, int],
        launch_requests: Optional[List[LaunchRequest]] = None,
        terminate_requests: Optional[List[TerminationRequest]] = None,
        infeasible_requests: Optional[List[ResourceRequest]] = None,
        infeasible_gang_requests: Optional[List[GangResourceRequest]] = None,
        infeasible_cluster_resource_constraints: Optional[
            List[ClusterResourceConstraint]
        ] = None,
    ) -> None:
        """
        Log any update of the cluster scheduling state.

        Args:
            node_type_configs: Per node-type configs; used to compute the
                cluster-wide resource totals when the shape changes.
            cluster_shape: Target number of nodes per node type.
            launch_requests: Node launches decided by the scheduler.
            terminate_requests: Node terminations decided by the scheduler.
            infeasible_requests: Resource requests no node type can satisfy.
            infeasible_gang_requests: Gang (placement group) requests no node
                type can satisfy.
            infeasible_cluster_resource_constraints: Cluster constraints (from
                `request_resources()`) that cannot be satisfied.
        """

        # Log any launch events, aggregated per instance type.
        if launch_requests:
            launch_type_count = defaultdict(int)
            for req in launch_requests:
                launch_type_count[req.instance_type] += req.count

            for instance_type, count in launch_type_count.items():
                log_str = f"Adding {count} node(s) of type {instance_type}."
                self._logger.info(log_str)
                logger.info(log_str)

        # Log any terminate events, aggregated per (cause, instance type).
        if terminate_requests:
            termination_by_causes_and_type = defaultdict(int)
            for req in terminate_requests:
                termination_by_causes_and_type[(req.cause, req.instance_type)] += 1

            cause_reason_map = {
                TerminationRequest.Cause.OUTDATED: "outdated",
                TerminationRequest.Cause.MAX_NUM_NODES: "max number of worker nodes reached",  # noqa
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE: "max number of worker nodes per type reached",  # noqa
                TerminationRequest.Cause.IDLE: "idle",
            }

            for (cause, instance_type), count in (
                termination_by_causes_and_type.items()
            ):
                log_str = f"Removing {count} nodes of type {instance_type} ({cause_reason_map[cause]})."  # noqa
                self._logger.info(log_str)
                logger.info(log_str)

        # Report the new cluster-wide resource totals when the shape changed.
        if launch_requests or terminate_requests:
            total_resources = defaultdict(float)

            for node_type, count in cluster_shape.items():
                node_config = node_type_configs[node_type]
                for resource_name, resource_quantity in node_config.resources.items():
                    total_resources[resource_name] += resource_quantity * count

            num_cpus = total_resources.get("CPU", 0)
            log_str = f"Resized to {int(num_cpus)} CPUs"

            if "GPU" in total_resources:
                log_str += f", {int(total_resources['GPU'])} GPUs"
            if "TPU" in total_resources:
                log_str += f", {int(total_resources['TPU'])} TPUs"

            self._logger.info(f"{log_str}.")
            self._logger.debug(f"Current cluster shape: {dict(cluster_shape)}.")

        # Log any infeasible resource requests.
        if infeasible_requests:
            requests_by_count = ResourceRequestUtil.group_by_count(infeasible_requests)
            log_str = "No available node types can fulfill resource requests "
            log_str += ", ".join(
                f"{ResourceRequestUtil.to_resource_map(rc.request)}*{rc.count}"
                for rc in requests_by_count
            )
            log_str += (
                ". Add suitable node types to this cluster to resolve this issue."
            )
            self._logger.warning(log_str)

        if infeasible_gang_requests:
            # One log line per placement group request.
            for gang_request in infeasible_gang_requests:
                log_str = (
                    "No available node types can fulfill "
                    "placement group requests (detail={details}): ".format(
                        details=gang_request.details
                    )
                )
                requests_by_count = ResourceRequestUtil.group_by_count(
                    gang_request.requests
                )
                log_str += ", ".join(
                    f"{ResourceRequestUtil.to_resource_map(rc.request)}*{rc.count}"
                    for rc in requests_by_count
                )
                log_str += (
                    ". Add suitable node types to this cluster to resolve this issue."
                )
                self._logger.warning(log_str)

        if infeasible_cluster_resource_constraints:
            # We will only have max 1 cluster resource constraint for now since it's
            # from `request_resources()` sdk, where the most recent call would
            # override the previous one.
            for infeasible_constraint in infeasible_cluster_resource_constraints:
                log_str = "No available node types can fulfill cluster constraint: "
                log_str += ", ".join(
                    f"{ResourceRequestUtil.to_resource_map(rc.request)}*{rc.count}"
                    for rc in infeasible_constraint.resource_requests
                )
                log_str += (
                    ". Add suitable node types to this cluster to resolve this issue."
                )
                self._logger.warning(log_str)
class InstanceUtil:
    """
    A helper class to group updates and operations on an Instance object defined
    in instance_manager.proto
    """

    # Memoized reachability sets: maps an instance status to the set of
    # statuses reachable from it via valid transitions (computed lazily by
    # _compute_reachable).
    _reachable_from: Optional[
        Dict["Instance.InstanceStatus", Set["Instance.InstanceStatus"]]
    ] = None

    @staticmethod
    def new_instance(
        instance_id: str,
        instance_type: str,
        status: Instance.InstanceStatus,
        details: str = "",
    ) -> Instance:
        """
        Returns a new instance with the given status.

        Args:
            instance_id: The instance id.
            instance_type: The instance type.
            status: The status of the new instance.
            details: The details of the status transition.
        """
        instance = Instance()
        instance.version = 0  # it will be populated by the underlying storage.
        instance.instance_id = instance_id
        instance.instance_type = instance_type
        instance.status = status
        InstanceUtil._record_status_transition(instance, status, details)
        return instance

    @staticmethod
    def random_instance_id() -> str:
        """
        Returns a random instance id.
        """
        return str(uuid.uuid4())

    @staticmethod
    def is_cloud_instance_allocated(instance_status: Instance.InstanceStatus) -> bool:
        """
        Returns True if the instance is in a status where there could exist
        a cloud instance allocated by the cloud provider.
        """
        assert instance_status != Instance.UNKNOWN
        return instance_status in {
            Instance.ALLOCATED,
            Instance.RAY_INSTALLING,
            Instance.RAY_RUNNING,
            Instance.RAY_STOPPING,
            Instance.RAY_STOP_REQUESTED,
            Instance.RAY_STOPPED,
            Instance.TERMINATING,
            Instance.RAY_INSTALL_FAILED,
            Instance.TERMINATION_FAILED,
        }

    @staticmethod
    def is_ray_running(instance_status: Instance.InstanceStatus) -> bool:
        """
        Returns True if the instance is in a status where the ray process is
        running on the cloud instance.
        i.e. RAY_RUNNING, RAY_STOP_REQUESTED, RAY_STOPPING
        """
        assert instance_status != Instance.UNKNOWN

        # Anything reachable from RAY_STOPPING has already begun (or finished)
        # stopping, so ray is not considered running there.
        if instance_status in InstanceUtil.get_reachable_statuses(
            Instance.RAY_STOPPING
        ):
            return False

        if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
            return True

        return False

    @staticmethod
    def is_ray_pending(instance_status: Instance.InstanceStatus) -> bool:
        """
        Returns True if the instance is in a status where the ray process is
        pending to be started on the cloud instance.

        """
        assert instance_status != Instance.UNKNOWN
        # Not gonna be in a RAY_RUNNING status.
        if Instance.RAY_RUNNING not in InstanceUtil.get_reachable_statuses(
            instance_status
        ):
            return False

        # Already running ray.
        if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
            return False

        return True

    @staticmethod
    def is_ray_running_reachable(instance_status: Instance.InstanceStatus) -> bool:
        """
        Returns True if the instance is in a status where it may transition
        to RAY_RUNNING status.
        """
        # Fix: this method was missing @staticmethod, unlike every sibling;
        # without it an instance-bound call would have received the status as
        # `self` and dropped the real argument.
        return Instance.RAY_RUNNING in InstanceUtil.get_reachable_statuses(
            instance_status
        )

    @staticmethod
    def set_status(
        instance: Instance,
        new_instance_status: Instance.InstanceStatus,
        details: str = "",
    ) -> bool:
        """Transitions the instance to the new state.

        Args:
            instance: The instance to update.
            new_instance_status: The new status to transition to.
            details: The details of the transition.

        Returns:
            True if the status transition is successful, False otherwise.
        """
        if (
            new_instance_status
            not in InstanceUtil.get_valid_transitions()[instance.status]
        ):
            return False
        instance.status = new_instance_status
        InstanceUtil._record_status_transition(instance, new_instance_status, details)
        return True

    @staticmethod
    def _record_status_transition(
        instance: Instance, status: Instance.InstanceStatus, details: str
    ):
        """Records the status transition.

        Args:
            instance: The instance to update.
            status: The new status to transition to.
            details: The details of the transition.
        """
        now_ns = time.time_ns()
        instance.status_history.append(
            Instance.StatusHistory(
                instance_status=status,
                timestamp_ns=now_ns,
                details=details,
            )
        )

    @staticmethod
    def has_timeout(instance: Instance, timeout_s: int) -> bool:
        """
        Returns True if the instance has been in the current status for more
        than timeout_s seconds.

        Args:
            instance: The instance to check.
            timeout_s: The timeout in seconds.

        Returns:
            True if the time since the latest transition into the current
            status exceeds timeout_s seconds.
        """
        cur_status = instance.status

        status_times_ns = InstanceUtil.get_status_transition_times_ns(
            instance, select_instance_status=cur_status
        )
        assert len(status_times_ns) >= 1, (
            f"instance {instance.instance_id} has {len(status_times_ns)} "
            f"{Instance.InstanceStatus.Name(cur_status)} status"
        )
        # Time out relative to the most recent transition into the status.
        latest_status_time_ns = max(status_times_ns)
        return time.time_ns() - latest_status_time_ns > (timeout_s * 1e9)

    @staticmethod
    def get_valid_transitions() -> Dict[
        "Instance.InstanceStatus", Set["Instance.InstanceStatus"]
    ]:
        """Returns the valid status transition graph of the state machine."""
        return {
            # This is the initial status of a new instance.
            Instance.QUEUED: {
                # A launch request was made to the node provider.
                Instance.REQUESTED,
            },
            # When in this status, a launch request to the node provider is made.
            Instance.REQUESTED: {
                # The cloud instance first appears in the provider's list of
                # running cloud instances.
                Instance.ALLOCATED,
                # Retry the allocation, become queueing again.
                Instance.QUEUED,
                # Allocation timed out or the launch request failed immediately.
                Instance.ALLOCATION_FAILED,
            },
            # The cloud instance is allocated and running (present in the node
            # provider's list of running cloud instances).
            Instance.ALLOCATED: {
                # Ray needs to be installed and launched by the autoscaler;
                # skipped for providers that manage ray themselves.
                Instance.RAY_INSTALLING,
                # Ray is already installed on the cloud instance; it could be
                # in any valid ray status.
                Instance.RAY_RUNNING,
                Instance.RAY_STOPPING,
                Instance.RAY_STOPPED,
                # Instance is requested to be stopped, e.g. instance leaked: no
                # matching Instance with the same type is found in the
                # autoscaler's state.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray process is being installed and started on the cloud instance.
            # Skipped for providers that manage ray installation/launch.
            Instance.RAY_INSTALLING: {
                # Ray installed and launched successfully, reported by the ray
                # cluster.
                Instance.RAY_RUNNING,
                # Ray failed to be installed and started on the cloud instance.
                Instance.RAY_INSTALL_FAILED,
                # The ray node is reported as stopped by the ray cluster: the
                # ray process may stop so quickly after start that the
                # RAY_RUNNING transition is skipped.
                Instance.RAY_STOPPED,
                # The cloud instance is being terminated (no longer needed).
                Instance.TERMINATING,
                # cloud instance somehow failed during the installation process.
                Instance.TERMINATED,
            },
            # Ray is installed and running; a ray node must be present in the
            # ray cluster.
            Instance.RAY_RUNNING: {
                # Ray is requested to be stopped.
                Instance.RAY_STOP_REQUESTED,
                # Ray is stopping (currently draining), e.g. idle termination.
                Instance.RAY_STOPPING,
                # Ray is already stopped, as reported by the ray cluster.
                Instance.RAY_STOPPED,
                # The cloud instance is being terminated (no longer needed).
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray should be stopped on the cloud instance; the RayStopper
            # subscriber listens for this status and stops the ray process.
            Instance.RAY_STOP_REQUESTED: {
                # Ray is stopping on the cloud instance.
                Instance.RAY_STOPPING,
                # Ray stopped already.
                Instance.RAY_STOPPED,
                # Ray stop request failed (e.g. idle node no longer idle),
                # ray is still running.
                Instance.RAY_RUNNING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray stop was requested but the node is not yet in the dead ray
            # node list reported by the ray cluster.
            Instance.RAY_STOPPING: {
                # The ray node is present in the dead ray node list.
                Instance.RAY_STOPPED,
                # The cloud instance is being terminated (no longer needed).
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray is stopped and the node is in the dead ray node list.
            Instance.RAY_STOPPED: {
                # The cloud instance is being terminated (no longer needed).
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # The cloud instance is requested to be stopped to the node provider.
            Instance.TERMINATING: {
                # The cloud instance no longer appears in the list of running
                # cloud instances from the node provider.
                Instance.TERMINATED,
                # When the cloud instance failed to be terminated.
                Instance.TERMINATION_FAILED,
            },
            # The cloud instance failed to be terminated; keep retrying.
            Instance.TERMINATION_FAILED: {
                # Retry the termination, become terminating again.
                Instance.TERMINATING,
            },
            # The cloud instance disappeared from the provider's running list.
            # Since an Instance maps 1:1 to a cloud instance, this is terminal.
            Instance.TERMINATED: set(),  # Terminal state.
            # The cloud instance failed to be allocated by the node provider.
            Instance.ALLOCATION_FAILED: set(),  # Terminal state.
            Instance.RAY_INSTALL_FAILED: {
                # Autoscaler requests to shutdown the instance when ray install
                # failed.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Initial state before the instance is created. Should never be used.
            Instance.UNKNOWN: set(),
        }

    @staticmethod
    def get_status_transitions(
        instance: Instance,
        select_instance_status: Optional["Instance.InstanceStatus"] = None,
    ) -> List["Instance.StatusHistory"]:
        """
        Returns the status history of the instance.

        Args:
            instance: The instance.
            select_instance_status: The go-to status to search for, i.e. select
                only status history when the instance transitions into the
                status. If None, returns all status updates.
        """
        history = []
        for status_update in instance.status_history:
            if (
                select_instance_status
                and status_update.instance_status != select_instance_status
            ):
                continue
            history.append(status_update)
        return history

    @staticmethod
    def get_last_status_transition(
        instance: Instance,
        select_instance_status: Optional["Instance.InstanceStatus"] = None,
    ) -> Optional["Instance.StatusHistory"]:
        """
        Returns the last status transition of the instance.

        Args:
            instance: The instance.
            select_instance_status: The status to search for. If None, returns
                the last status update of any status.
        """
        history = InstanceUtil.get_status_transitions(instance, select_instance_status)
        # Stable sort: among equal timestamps, the most recently recorded
        # entry wins.
        history.sort(key=lambda x: x.timestamp_ns)
        if history:
            return history[-1]
        return None

    @staticmethod
    def get_status_transition_times_ns(
        instance: Instance,
        select_instance_status: Optional["Instance.InstanceStatus"] = None,
    ) -> List[int]:
        """
        Returns a list of timestamps of the instance status update.

        Args:
            instance: The instance.
            select_instance_status: The status to search for. If None, returns
                all status update timestamps.

        Returns:
            The list of timestamps of the instance status updates.
        """
        return [
            e.timestamp_ns
            for e in InstanceUtil.get_status_transitions(
                instance, select_instance_status
            )
        ]

    @classmethod
    def get_reachable_statuses(
        cls,
        instance_status: Instance.InstanceStatus,
    ) -> Set["Instance.InstanceStatus"]:
        """
        Returns the set of instance status that is reachable from the given
        instance status following the status transitions.
        This method is memoized.

        Args:
            instance_status: The instance status to start from.

        Returns:
            The set of instance status that is reachable from the given
            instance status.
        """
        if cls._reachable_from is None:
            cls._compute_reachable()
        return cls._reachable_from[instance_status]

    @staticmethod
    def get_log_str_for_update(instance: Instance, update: InstanceUpdateEvent) -> str:
        """Returns a log string for the given instance update."""
        if update.upsert:
            return (
                f"New instance "
                f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
                f"{instance.instance_id}, type={instance.instance_type}, "
                f"cloud_instance_id={instance.cloud_instance_id}, "
                f"ray_id={instance.node_id}): {update.details}"
            )
        return (
            f"Update instance "
            f"{Instance.InstanceStatus.Name(instance.status)}->"
            f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
            f"{instance.instance_id}, type={instance.instance_type}, "
            f"cloud_instance_id={instance.cloud_instance_id}, "
            f"ray_id={instance.node_id}): {update.details}"
        )

    @classmethod
    def _compute_reachable(cls):
        """
        Computes and memorize the from status sets for each status machine with
        a DFS search.
        """
        valid_transitions = cls.get_valid_transitions()

        def dfs(graph, start, visited):
            """
            Regular DFS algorithm to find all reachable nodes from a given node.
            """
            for next_node in graph[start]:
                if next_node not in visited:
                    # We delay adding the visited set here so we could capture
                    # the self loop.
                    visited.add(next_node)
                    dfs(graph, next_node, visited)
            return visited

        # Initialize the graphs
        cls._reachable_from = {}
        for status in Instance.InstanceStatus.values():
            # All nodes reachable from 'start'
            visited = set()
            cls._reachable_from[status] = dfs(valid_transitions, status, visited)
class Provider(Enum):
    """Node provider backends recognized by the autoscaler v2 config.

    Integer values are explicit and stable; do not reorder or renumber, as
    they may be persisted or compared across versions.
    """

    UNKNOWN = 0
    ALIYUN = 1
    AWS = 2
    AZURE = 3
    GCP = 4
    KUBERAY = 5
    LOCAL = 6
    # Observe-only mode backed by GCS rather than a cloud API — presumably
    # maps to ReadOnlyProvider; confirm against the Autoscaler dispatch.
    READ_ONLY = 7
class IConfigReader(ABC):
    """Interface for reading the autoscaling config.

    Implementations back the config by different sources, e.g. a YAML file,
    an in-memory dict, or a remote config service (such as KubeRay's).
    Callers work with a cached snapshot and refresh it explicitly:

        reader = FileConfigReader("path/to/config.yaml")
        # Get the recently cached config.
        config = reader.get_cached_autoscaling_config()
        ...
        # Refresh the cached config from the source, then re-read it.
        reader.refresh_cached_autoscaling_config()
        config = reader.get_cached_autoscaling_config()
    """

    @abstractmethod
    def get_cached_autoscaling_config(self) -> "AutoscalingConfig":
        """Return the most recently read autoscaling config."""
        ...

    @abstractmethod
    def refresh_cached_autoscaling_config(self):
        """Re-read the config from its source, updating the cached copy."""
        ...
@dataclass
class NodeTypeConfig:
    """Per-node-type settings derived from the `available_node_types`
    section of the autoscaling config.
    """

    # Node type name.
    name: NodeType
    # Lower bound on the number of worker nodes of this type.
    min_worker_nodes: int
    # Upper bound on the number of worker nodes of this type.
    max_worker_nodes: int
    # Idle timeout in seconds for worker nodes of this type, if any.
    idle_timeout_s: Optional[float] = None
    # Total resources available on one node of this type.
    resources: Dict[str, float] = field(default_factory=dict)
    # Labels attached to nodes of this type.
    labels: Dict[str, str] = field(default_factory=dict)
    # Hash of the node's launch config (auth config + node config), used to
    # detect config changes between launches.
    launch_config_hash: str = ""

    def __post_init__(self):
        # Bounds must satisfy 0 <= min <= max.
        assert 0 <= self.min_worker_nodes <= self.max_worker_nodes
+ """ + self._sync_continuously = False + self.update_configs(configs, skip_content_hash) + + def update_configs(self, configs: Dict[str, Any], skip_content_hash: bool) -> None: + self._configs = prepare_config(configs) + validate_config(self._configs) + if skip_content_hash: + return + self._calculate_hashes() + self._sync_continuously = self._configs.get( + "generate_file_mounts_contents_hash", True + ) + + def _calculate_hashes(self) -> None: + logger.info("Calculating hashes for file mounts and ray commands.") + self._runtime_hash, self._file_mounts_contents_hash = hash_runtime_conf( + self._configs.get("file_mounts", {}), + self._configs.get("cluster_synced_files", []), + [ + self._configs.get("worker_setup_commands", []), + self._configs.get("worker_start_ray_commands", []), + ], + generate_file_mounts_contents_hash=self._configs.get( + "generate_file_mounts_contents_hash", True + ), + ) + + def get_cloud_node_config(self, ray_node_type: NodeType) -> Dict[str, Any]: + return copy.deepcopy( + self.get_node_type_specific_config(ray_node_type, "node_config") or {} + ) + + def get_docker_config(self, ray_node_type: NodeType) -> Dict[str, Any]: + """ + Return the docker config for the specified node type. + If it's a head node, the image will be chosen in the following order: + 1. Node specific docker image. + 2. The 'docker' config's 'head_image' field. + 3. The 'docker' config's 'image' field. + If it's a worker node, the image will be chosen in the following order: + 1. Node specific docker image. + 2. The 'docker' config's 'worker_image' field. + 3. The 'docker' config's 'image' field. + """ + # TODO(rickyx): It's unfortunate we have multiple fields in ray-schema.json + # that can specify docker images. We should consolidate them. 
+ docker_config = copy.deepcopy(self._configs.get("docker", {})) + node_specific_docker_config = self._configs["available_node_types"][ + ray_node_type + ].get("docker", {}) + # Override the global docker config with node specific docker config. + docker_config.update(node_specific_docker_config) + + if self._configs.get("head_node_type") == ray_node_type: + if "head_image" in docker_config: + logger.info( + "Overwriting image={} by head_image({}) for head node docker.".format( # noqa: E501 + docker_config["image"], docker_config["head_image"] + ) + ) + docker_config["image"] = docker_config["head_image"] + else: + if "worker_image" in docker_config: + logger.info( + "Overwriting image={} by worker_image({}) for worker node docker.".format( # noqa: E501 + docker_config["image"], docker_config["worker_image"] + ) + ) + docker_config["image"] = docker_config["worker_image"] + + # These fields should be merged. + docker_config.pop("head_image", None) + docker_config.pop("worker_image", None) + return docker_config + + def get_worker_start_ray_commands(self) -> List[str]: + return self._configs.get("worker_start_ray_commands", []) + + def get_head_setup_commands(self) -> List[str]: + return self._configs.get("head_setup_commands", []) + + def get_head_start_ray_commands(self) -> List[str]: + return self._configs.get("head_start_ray_commands", []) + + def get_worker_setup_commands(self, ray_node_type: NodeType) -> List[str]: + """ + Return the worker setup commands for the specified node type. + + If the node type specific worker setup commands are not specified, + return the global worker setup commands. + """ + worker_setup_command = self.get_node_type_specific_config( + ray_node_type, "worker_setup_commands" + ) + if worker_setup_command is None: + # Return global worker setup commands if node type specific + # worker setup commands are not specified. 
+ logger.info( + "Using global worker setup commands for {}".format(ray_node_type) + ) + return self._configs.get("worker_setup_commands", []) + return worker_setup_command + + def get_initialization_commands(self, ray_node_type: NodeType) -> List[str]: + """ + Return the initialization commands for the specified node type. + + If the node type specific initialization commands are not specified, + return the global initialization commands. + """ + initialization_command = self.get_node_type_specific_config( + ray_node_type, "initialization_commands" + ) + if initialization_command is None: + logger.info( + "Using global initialization commands for {}".format(ray_node_type) + ) + return self._configs.get("initialization_commands", []) + return initialization_command + + def get_node_type_specific_config( + self, ray_node_type: NodeType, config_name: str + ) -> Optional[Any]: + node_specific_config = self._configs["available_node_types"].get( + ray_node_type, {} + ) + return node_specific_config.get(config_name, None) + + def get_node_resources(self, ray_node_type: NodeType) -> Dict[str, float]: + return copy.deepcopy( + self.get_node_type_specific_config(ray_node_type, "resources") or {} + ) + + def get_node_labels(self, ray_node_type: NodeType) -> Dict[str, str]: + return copy.deepcopy( + self.get_node_type_specific_config(ray_node_type, "labels") or {} + ) + + def get_config(self, config_name, default=None) -> Any: + return self._configs.get(config_name, default) + + def get_provider_instance_type(self, ray_node_type: NodeType) -> str: + provider = self.provider + node_config = self.get_node_type_specific_config(ray_node_type, "node_config") + if provider in [Provider.AWS, Provider.ALIYUN]: + return node_config.get("InstanceType", "") + elif provider == Provider.AZURE: + return node_config.get("azure_arm_parameters", {}).get("vmSize", "") + elif provider == Provider.GCP: + return node_config.get("machineType", "") + elif provider in [Provider.KUBERAY, 
Provider.LOCAL, Provider.UNKNOWN]: + return "" + else: + raise ValueError(f"Unknown provider {provider}") + + def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]: + """ + Returns the node type configs from the `available_node_types` field. + + Returns: + Dict[NodeType, NodeTypeConfig]: The node type configs. + """ + available_node_types = self._configs.get("available_node_types", {}) + if not available_node_types: + return None + node_type_configs = {} + auth_config = self._configs.get("auth", {}) + head_node_type = self.get_head_node_type() + assert head_node_type + for node_type, node_config in available_node_types.items(): + launch_config_hash = hash_launch_conf( + node_config.get("node_config", {}), auth_config + ) + max_workers_nodes = node_config.get("max_workers", 0) + if head_node_type == node_type: + max_workers_nodes += 1 + + node_type_configs[node_type] = NodeTypeConfig( + name=node_type, + min_worker_nodes=node_config.get("min_workers", 0), + max_worker_nodes=max_workers_nodes, + idle_timeout_s=node_config.get("idle_timeout_s", None), + resources=node_config.get("resources", {}), + labels=node_config.get("labels", {}), + launch_config_hash=launch_config_hash, + ) + return node_type_configs + + def get_head_node_type(self) -> NodeType: + """ + Returns the head node type. + + If there is only one node type, return the only node type as the head + node type. + If there are multiple node types, return the head node type specified + in the config. 
+ """ + available_node_types = self._configs.get("available_node_types", {}) + if len(available_node_types) == 1: + return list(available_node_types.keys())[0] + return self._configs.get("head_node_type") + + def get_max_num_worker_nodes(self) -> Optional[int]: + return self.get_config("max_workers", None) + + def get_max_num_nodes(self) -> Optional[int]: + max_num_workers = self.get_max_num_worker_nodes() + if max_num_workers is not None: + return max_num_workers + 1 # For head node + return None + + def get_raw_config_mutable(self) -> Dict[str, Any]: + return self._configs + + def get_upscaling_speed(self) -> float: + return self.get_config("upscaling_speed", DEFAULT_UPSCALING_SPEED) + + def get_max_concurrent_launches(self) -> int: + return AUTOSCALER_MAX_CONCURRENT_LAUNCHES + + def disable_node_updaters(self) -> bool: + provider_config = self._configs.get("provider", {}) + return provider_config.get(DISABLE_NODE_UPDATERS_KEY, True) + + def get_idle_timeout_s(self) -> Optional[float]: + """ + Returns the idle timeout in seconds if present in config, otherwise None. + """ + idle_timeout_s = self.get_config("idle_timeout_minutes", None) + return idle_timeout_s * 60 if idle_timeout_s is not None else None + + def disable_launch_config_check(self) -> bool: + provider_config = self.get_provider_config() + return provider_config.get(DISABLE_LAUNCH_CONFIG_CHECK_KEY, True) + + def get_instance_reconcile_config(self) -> InstanceReconcileConfig: + # TODO(rickyx): we need a way to customize these configs, + # either extending the current ray-schema.json, or just use another + # schema validation paths. 
+ return InstanceReconcileConfig() + + def get_provider_config(self) -> Dict[str, Any]: + return self._configs.get("provider", {}) + + def dump(self) -> str: + return yaml.safe_dump(self._configs) + + @property + def provider(self) -> Provider: + provider_str = self._configs.get("provider", {}).get("type", "") + if provider_str == "local": + return Provider.LOCAL + elif provider_str == "aws": + return Provider.AWS + elif provider_str == "azure": + return Provider.AZURE + elif provider_str == "gcp": + return Provider.GCP + elif provider_str == "aliyun": + return Provider.ALIYUN + elif provider_str == "kuberay": + return Provider.KUBERAY + elif provider_str == "readonly": + return Provider.READ_ONLY + else: + return Provider.UNKNOWN + + @property + def runtime_hash(self) -> str: + return self._runtime_hash + + @property + def file_mounts_contents_hash(self) -> str: + return self._file_mounts_contents_hash + + +class FileConfigReader(IConfigReader): + """A class that reads cluster config from a yaml file.""" + + def __init__(self, config_file: str, skip_content_hash: bool = True) -> None: + """ + Args: + config_file: The path to the config file. + skip_content_hash: Whether to skip file mounts/ray command + hash calculation. Default to True. + """ + self._config_file_path = Path(config_file).resolve() + self._skip_content_hash = skip_content_hash + self._cached_config = self._read() + + def _read(self) -> AutoscalingConfig: + with open(self._config_file_path) as f: + config = yaml.safe_load(f.read()) + return AutoscalingConfig(config, skip_content_hash=self._skip_content_hash) + + def get_cached_autoscaling_config(self) -> AutoscalingConfig: + """ + Returns: + AutoscalingConfig: The autoscaling config. 
+ """ + + return self._cached_config + + def refresh_cached_autoscaling_config(self): + self._cached_config = self._read() + + +class KubeRayConfigReader(IConfigReader): + """A class that reads cluster config from a K8s RayCluster CR.""" + + def __init__(self, config_producer: AutoscalingConfigProducer): + self._config_producer = config_producer + self._cached_config = self._generate_configs_from_k8s() + + def _generate_configs_from_k8s(self) -> AutoscalingConfig: + return AutoscalingConfig(self._config_producer()) + + def get_cached_autoscaling_config(self) -> AutoscalingConfig: + """ + Returns: + AutoscalingConfig: The autoscaling config. + """ + return self._cached_config + + def refresh_cached_autoscaling_config(self): + """ + Reads the configs from the K8s RayCluster CR. + + This reads from the K8s API server every time to pick up changes. + """ + self._cached_config = self._generate_configs_from_k8s() + + +class ReadOnlyProviderConfigReader(IConfigReader): + """A class that reads cluster config for a read-only provider. + + This is used for laptop mode / manual cluster setup modes, in order to + provide status reporting in the same way for users.""" + + def __init__(self, gcs_address: str): + self._configs = BASE_READONLY_CONFIG + self._gcs_client = GcsClient(address=gcs_address) + + def refresh_cached_autoscaling_config(self) -> AutoscalingConfig: + # Update the config with node types from GCS. + ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client) + + # Format each node type's config from the running nodes. 
+ available_node_types = {} + + head_node_type = None + for node_state in ray_cluster_resource_state.node_states: + node_type = format_readonly_node_type(binary_to_hex(node_state.node_id)) + if is_head_node(node_state): + head_node_type = node_type + + available_node_types[node_type] = { + "resources": dict(node_state.total_resources), + "min_workers": 0, + "max_workers": 0 if is_head_node(node_state) else 1, + "node_config": {}, + } + if available_node_types: + self._configs["available_node_types"].update(available_node_types) + self._configs["max_workers"] = len(available_node_types) + assert head_node_type, "Head node type should be found." + self._configs["head_node_type"] = head_node_type + + # Don't idle terminated nodes in read-only mode. + self._configs.pop("idle_timeout_minutes", None) + + def get_cached_autoscaling_config(self) -> AutoscalingConfig: + return AutoscalingConfig(self._configs, skip_content_hash=True) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..6a1f6e207408efac43bb4f78ced3ca24898ebf17 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py @@ -0,0 +1,270 @@ +import logging +from abc import ABC, abstractmethod +from typing import List, Optional + +from ray.autoscaler.v2.instance_manager.common import InstanceUtil +from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage +from ray.core.generated.instance_manager_pb2 import ( + GetInstanceManagerStateReply, + GetInstanceManagerStateRequest, + Instance, + InstanceUpdateEvent, + NodeKind, + StatusCode, + UpdateInstanceManagerStateReply, + UpdateInstanceManagerStateRequest, +) + +logger = logging.getLogger(__name__) + + +class InstanceUpdatedSubscriber(ABC): + """Subscribers to instance status changes.""" + 
+ @abstractmethod + def notify(self, events: List[InstanceUpdateEvent]) -> None: + pass + + +class InstanceManager: + """ + See `InstanceManagerService` in instance_manager.proto + + This handles updates to an instance, or inserts a new instance if + it's an insert update. We should only be inserting new instances + of the below statuses: + 1. ALLOCATED: For unmanaged instance not initialized by InstanceManager, + e.g. head node + 2. QUEUED: For new instance being queued to launch. + 3. TERMINATING: For leaked cloud instance that needs to be terminated. + + For full status transitions, see: + https://docs.google.com/document/d/1NzQjA8Mh-oMc-QxXOa529oneWCoA8sDiVoNkBqqDb4U/edit#heading=h.k9a1sp4qpqj4 + + Not thread safe, should be used as a singleton. + """ + + def __init__( + self, + instance_storage: InstanceStorage, + instance_status_update_subscribers: Optional[List[InstanceUpdatedSubscriber]], + ): + self._instance_storage = instance_storage + self._status_update_subscribers = instance_status_update_subscribers or [] + + def update_instance_manager_state( + self, request: UpdateInstanceManagerStateRequest + ) -> UpdateInstanceManagerStateReply: + """ + Updates the instance manager state. + + If there's any failure, no updates would be made and the reply + would contain the latest version of the instance manager state, + and the error info. + + Args: + request: The request to update the instance manager state. + + Returns: + The reply to the request. 
+ """ + + # Handle updates + ids_to_updates = {update.instance_id: update for update in request.updates} + to_update_instances, version = self._instance_storage.get_instances( + instance_ids=ids_to_updates.keys() + ) + + if request.expected_version >= 0 and request.expected_version != version: + err_str = ( + f"Version mismatch: expected: {request.expected_version}, " + f"actual: {version}" + ) + logger.warning(err_str) + return self._get_update_im_state_reply( + StatusCode.VERSION_MISMATCH, + version, + err_str, + ) + + # Handle instances states update. + to_upsert_instances = [] + for instance_id, update in ids_to_updates.items(): + if instance_id in to_update_instances: + instance = self._update_instance( + to_update_instances[instance_id], update + ) + else: + instance = self._create_instance(update) + + to_upsert_instances.append(instance) + + # Updates the instance storage. + result = self._instance_storage.batch_upsert_instances( + updates=to_upsert_instances, + expected_storage_version=version, + ) + + if not result.success: + if result.version != version: + err_str = ( + f"Version mismatch: expected: {version}, actual: {result.version}" + ) + logger.warning(err_str) + return self._get_update_im_state_reply( + StatusCode.VERSION_MISMATCH, result.version, err_str + ) + else: + err_str = "Failed to update instance storage." + logger.error(err_str) + return self._get_update_im_state_reply( + StatusCode.UNKNOWN_ERRORS, result.version, err_str + ) + + # Successful updates. + for subscriber in self._status_update_subscribers: + subscriber.notify(request.updates) + + return self._get_update_im_state_reply(StatusCode.OK, result.version) + + def get_instance_manager_state( + self, request: GetInstanceManagerStateRequest + ) -> GetInstanceManagerStateReply: + """ + Gets the instance manager state. + + Args: + request: The request to get the instance manager state. + + Returns: + The reply to the request. 
+ """ + reply = GetInstanceManagerStateReply() + instances, version = self._instance_storage.get_instances() + reply.state.instances.extend(instances.values()) + reply.state.version = version + reply.status.code = StatusCode.OK + + return reply + + ######################################### + # Private methods + ######################################### + + @staticmethod + def _get_update_im_state_reply( + status_code: StatusCode, version: int, error_message: str = "" + ) -> UpdateInstanceManagerStateReply: + """ + Returns a UpdateInstanceManagerStateReply with the given status code and + version. + + Args: + status_code: The status code. + version: The version. + error_message: The error message if any. + + Returns: + The reply. + """ + reply = UpdateInstanceManagerStateReply() + reply.status.code = status_code + reply.version = version + if error_message: + reply.status.message = error_message + return reply + + @staticmethod + def _apply_update(instance: Instance, update: InstanceUpdateEvent): + """ + Apply status specific update to the instance. + + Args: + instance: The instance to update. + update: The update to apply. 
+ """ + if update.new_instance_status == Instance.ALLOCATED: + assert ( + update.cloud_instance_id + ), "ALLOCATED update must have cloud_instance_id" + assert update.node_kind in [ + NodeKind.WORKER, + NodeKind.HEAD, + ], "ALLOCATED update must have node_kind as WORKER or HEAD" + assert update.instance_type, "ALLOCATED update must have instance_type" + assert ( + update.cloud_instance_id + ), "ALLOCATED update must have cloud_instance_id" + instance.cloud_instance_id = update.cloud_instance_id + instance.node_kind = update.node_kind + instance.instance_type = update.instance_type + elif update.new_instance_status == Instance.RAY_RUNNING: + assert update.ray_node_id, "RAY_RUNNING update must have ray_node_id" + instance.node_id = update.ray_node_id + elif update.new_instance_status == Instance.REQUESTED: + assert ( + update.launch_request_id + ), "REQUESTED update must have launch_request_id" + assert update.instance_type, "REQUESTED update must have instance_type" + instance.launch_request_id = update.launch_request_id + instance.instance_type = update.instance_type + elif update.new_instance_status == Instance.TERMINATING: + assert ( + update.cloud_instance_id + ), "TERMINATING update must have cloud instance id" + + @staticmethod + def _create_instance(update: InstanceUpdateEvent) -> Instance: + """ + Create a new instance from the given update. + """ + + assert update.upsert, "upsert must be true for creating new instance." + + assert update.new_instance_status in [ + # For unmanaged instance not initialized by InstanceManager, + # e.g. head node + Instance.ALLOCATED, + # For new instance being queued to launch. + Instance.QUEUED, + # For leaked cloud instance that needs to be terminated. + Instance.TERMINATING, + ], ( + "Invalid status for new instance, must be one of " + "[ALLOCATED, QUEUED, TERMINATING]" + ) + + # Create a new instance first for common fields. 
+ instance = InstanceUtil.new_instance( + instance_id=update.instance_id, + instance_type=update.instance_type, + status=update.new_instance_status, + details=update.details, + ) + + # Apply the status specific updates. + logger.info(InstanceUtil.get_log_str_for_update(instance, update)) + InstanceManager._apply_update(instance, update) + return instance + + @staticmethod + def _update_instance(instance: Instance, update: InstanceUpdateEvent) -> Instance: + """ + Update the instance with the given update. + + Args: + instance: The instance to update. + update: The update to apply. + + Returns: + The updated instance. + """ + logger.info(InstanceUtil.get_log_str_for_update(instance, update)) + assert InstanceUtil.set_status(instance, update.new_instance_status), ( + "Invalid status transition from " + f"{Instance.InstanceStatus.Name(instance.status)} to " + f"{Instance.InstanceStatus.Name(update.new_instance_status)}" + ) + InstanceManager._apply_update(instance, update) + + return instance diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py new file mode 100644 index 0000000000000000000000000000000000000000..645cd1eb904a83478f01803fd652ad82b4cb1b39 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py @@ -0,0 +1,151 @@ +import copy +import logging +from typing import Dict, List, Optional, Set, Tuple + +from ray.autoscaler.v2.instance_manager.storage import Storage, StoreStatus +from ray.core.generated.instance_manager_pb2 import Instance + +logger = logging.getLogger(__name__) + + +class InstanceStorage: + """Instance storage stores the states of instances in the storage.""" + + def __init__( + self, + cluster_id: str, + storage: Storage, + ) -> None: + self._storage = storage + self._cluster_id = cluster_id + self._table_name = f"instance_table@{cluster_id}" + + def 
batch_upsert_instances( + self, + updates: List[Instance], + expected_storage_version: Optional[int] = None, + ) -> StoreStatus: + """Upsert instances into the storage. If the instance already exists, + it will be updated. Otherwise, it will be inserted. If the + expected_storage_version is specified, the update will fail if the + current storage version does not match the expected version. + + Note the version of the upserted instances will be set to the current + storage version. + + Args: + updates: A list of instances to be upserted. + expected_storage_version: The expected storage version. + + Returns: + StoreStatus: A tuple of (success, storage_version). + """ + mutations = {} + version = self._storage.get_version() + # handle version mismatch + if expected_storage_version and expected_storage_version != version: + return StoreStatus(False, version) + + for instance in updates: + instance = copy.deepcopy(instance) + # the instance version is set to 0, it will be + # populated by the storage entry's verion on read + instance.version = 0 + mutations[instance.instance_id] = instance.SerializeToString() + + result, version = self._storage.batch_update( + self._table_name, mutations, {}, expected_storage_version + ) + + return StoreStatus(result, version) + + def upsert_instance( + self, + instance: Instance, + expected_instance_version: Optional[int] = None, + expected_storage_verison: Optional[int] = None, + ) -> StoreStatus: + """Upsert an instance in the storage. + If the expected_instance_version is specified, the update will fail + if the current instance version does not match the expected version. + Similarly, if the expected_storage_version is + specified, the update will fail if the current storage version does not + match the expected version. + + Note the version of the upserted instances will be set to the current + storage version. + + Args: + instance: The instance to be updated. + expected_instance_version: The expected instance version. 
+ expected_storage_version: The expected storage version. + + Returns: + StoreStatus: A tuple of (success, storage_version). + """ + instance = copy.deepcopy(instance) + # the instance version is set to 0, it will be + # populated by the storage entry's verion on read + instance.version = 0 + result, version = self._storage.update( + self._table_name, + key=instance.instance_id, + value=instance.SerializeToString(), + expected_entry_version=expected_instance_version, + expected_storage_version=expected_storage_verison, + insert_only=False, + ) + + return StoreStatus(result, version) + + def get_instances( + self, + instance_ids: List[str] = None, + status_filter: Set[int] = None, + ) -> Tuple[Dict[str, Instance], int]: + """Get instances from the storage. + + Args: + instance_ids: A list of instance ids to be retrieved. If empty, all + instances will be retrieved. + status_filter: Only instances with the specified status will be returned. + + Returns: + Tuple[Dict[str, Instance], int]: A tuple of (instances, version). + The instances is a dictionary of (instance_id, instance) pairs. + """ + instance_ids = instance_ids or [] + status_filter = status_filter or set() + pairs, version = self._storage.get(self._table_name, instance_ids) + instances = {} + for instance_id, (instance_data, entry_version) in pairs.items(): + instance = Instance() + instance.ParseFromString(instance_data) + instance.version = entry_version + if status_filter and instance.status not in status_filter: + continue + instances[instance_id] = instance + return instances, version + + def batch_delete_instances( + self, instance_ids: List[str], expected_storage_version: Optional[int] = None + ) -> StoreStatus: + """Delete instances from the storage. If the expected_version is + specified, the update will fail if the current storage version does not + match the expected version. + + Args: + to_delete: A list of instances to be deleted. + expected_version: The expected storage version. 
+ + Returns: + StoreStatus: A tuple of (success, storage_version). + """ + version = self._storage.get_version() + if expected_storage_version and expected_storage_version != version: + return StoreStatus(False, version) + + result = self._storage.batch_update( + self._table_name, {}, instance_ids, expected_storage_version + ) + return result diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..1358fcda5a6c4862087416c4ba19727f18bd5850 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py @@ -0,0 +1,522 @@ +import logging +import math +import time +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from queue import Queue +from typing import Any, Dict, List, Optional + +from ray.autoscaler._private.constants import ( + AUTOSCALER_MAX_CONCURRENT_LAUNCHES, + AUTOSCALER_MAX_LAUNCH_BATCH, +) +from ray.autoscaler._private.util import hash_launch_conf +from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1 +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_UNMANAGED, + NODE_KIND_WORKER, + STATUS_UNINITIALIZED, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_LAUNCH_REQUEST, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) +from ray.autoscaler.v2.instance_manager.config import IConfigReader +from ray.autoscaler.v2.schema import NodeType +from ray.core.generated.instance_manager_pb2 import NodeKind + +logger = logging.getLogger(__name__) + +# Type Alias. This is a **unique identifier** for a cloud instance in the cluster. 
+# The provider should guarantee that this id is unique across the cluster, +# such that: +# - When a cloud instance is created and running, no other cloud instance in the +# cluster has the same id. +# - When a cloud instance is terminated, no other cloud instance in the cluster will +# be assigned the same id later. +CloudInstanceId = str + + +@dataclass +class CloudInstance: + """ + A class that represents a cloud instance in the cluster, with necessary metadata + of the cloud instance. + """ + + # The cloud instance id. + cloud_instance_id: CloudInstanceId + # The node type of the cloud instance. + node_type: NodeType + # The node kind, i.e head or worker. + node_kind: NodeKind + # If the cloud instance is already running. + is_running: bool + # Update request id from which the cloud instance is launched. + # This could be None if the cloud instance couldn't be associated with requests + # by the cloud provider: e.g. cloud provider doesn't support per-instance + # extra metadata. + # This is fine for now since the reconciler should be able to know how + # to handle cloud instances w/o request ids. + # TODO: make this a required field. + request_id: Optional[str] = None + + +class CloudInstanceProviderError(Exception): + """ + An base error class that represents an error that happened in the cloud instance + provider. + """ + + # The timestamp of the error occurred in nanoseconds. + timestamp_ns: int + + def __init__(self, msg, timestamp_ns) -> None: + super().__init__(msg) + self.timestamp_ns = timestamp_ns + + +class LaunchNodeError(CloudInstanceProviderError): + # The node type that failed to launch. + node_type: NodeType + # Number of nodes that failed to launch. + count: int + # A unique id that identifies from which update request the error originates. 
+ request_id: str + + def __init__( + self, + node_type: NodeType, + count: int, + request_id: str, + timestamp_ns: int, + details: str = "", + cause: Optional[Exception] = None, + ) -> None: + msg = ( + f"Failed to launch {count} nodes of type {node_type} with " + f"request id {request_id}: {details}" + ) + super().__init__(msg, timestamp_ns=timestamp_ns) + self.node_type = node_type + self.count = count + self.request_id = request_id + if cause: + self.__cause__ = cause + + def __repr__(self) -> str: + return ( + f"LaunchNodeError(node_type={self.node_type}, count={self.count}, " + f"request_id={self.request_id}): {self.__cause__}" + ) + + +class TerminateNodeError(CloudInstanceProviderError): + # The cloud instance id of the node that failed to terminate. + cloud_instance_id: CloudInstanceId + # A unique id that identifies from which update request the error originates. + request_id: str + + def __init__( + self, + cloud_instance_id: CloudInstanceId, + request_id: str, + timestamp_ns: int, + details: str = "", + cause: Optional[Exception] = None, + ) -> None: + msg = ( + f"Failed to terminate node {cloud_instance_id} with " + f"request id {request_id}: {details}" + ) + super().__init__(msg, timestamp_ns=timestamp_ns) + self.cloud_instance_id = cloud_instance_id + self.request_id = request_id + if cause: + self.__cause__ = cause + + def __repr__(self) -> str: + return ( + f"TerminateNodeError(cloud_instance_id={self.cloud_instance_id}, " + f"request_id={self.request_id}): {self.__cause__}" + ) + + +class ICloudInstanceProvider(ABC): + """ + The interface for a cloud instance provider. + + This interface is a minimal interface that should be implemented by the + various cloud instance providers (e.g. AWS, and etc). + + The cloud instance provider is responsible for managing the cloud instances in the + cluster. It provides the following main functionalities: + - Launch new cloud instances. + - Terminate existing running instances. 
+ - Get the non-terminated cloud instances in the cluster. + - Poll the errors that happened for the updates to the cloud instance provider. + + Below properties of the cloud instance provider are assumed with this interface: + + 1. Eventually consistent + The cloud instance provider is expected to be eventually consistent with the + cluster state. For example, when a cloud instance is request to be terminated + or launched, the provider may not immediately reflect the change in its state. + However, the provider is expected to eventually reflect the change in its state. + + 2. Asynchronous + The provider could also be asynchronous, where the termination/launch + request may not immediately return the result of the request. + + 3. Unique cloud instance ids + Cloud instance ids are expected to be unique across the cluster. + + 4. Idempotent updates + For the update APIs (e.g. ensure_min_nodes, terminate), the provider may use the + request ids to provide idempotency. + + Usage: + ``` + provider: ICloudInstanceProvider = ... + + # Update the cluster with a desired shape. + provider.launch( + shape={ + "worker_nodes": 10, + "ray_head": 1, + }, + request_id="1", + ) + + # Get the non-terminated nodes of the cloud instance provider. + running = provider.get_non_terminated() + + # Poll the errors + errors = provider.poll_errors() + + # Terminate nodes. + provider.terminate( + ids=["cloud_instance_id_1", "cloud_instance_id_2"], + request_id="2", + ) + + # Process the state of the provider. + ... + ``` + """ + + @abstractmethod + def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]: + """Get the non-terminated cloud instances in the cluster. + + Returns: + A dictionary of the non-terminated cloud instances in the cluster. + The key is the cloud instance id, and the value is the cloud instance. + """ + pass + + @abstractmethod + def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None: + """ + Terminate the cloud instances asynchronously. 
+ + This method is expected to be idempotent, i.e. if the same request id is used + to terminate the same cloud instances, this should be a no-op if + the cloud instances are already terminated or being terminated. + + Args: + ids: the cloud instance ids to terminate. + request_id: a unique id that identifies the request. + """ + pass + + @abstractmethod + def launch( + self, + shape: Dict[NodeType, int], + request_id: str, + ) -> None: + """Launch the cloud instances asynchronously. + + Args: + shape: A map from node type to number of nodes to launch. + request_id: a unique id that identifies the update request. + """ + pass + + @abstractmethod + def poll_errors(self) -> List[CloudInstanceProviderError]: + """ + Poll the errors that happened since the last poll. + + This method would also clear the errors that happened since the last poll. + + Returns: + The errors that happened since the last poll. + """ + pass + + +@dataclass(frozen=True) +class CloudInstanceLaunchRequest: + """ + The arguments to launch a node. + """ + + # The node type to launch. + node_type: NodeType + # Number of nodes to launch. + count: int + # A unique id that identifies the request. + request_id: str + + +@dataclass(frozen=True) +class CloudInstanceTerminateRequest: + """ + The arguments to terminate a node. + """ + + # The cloud instance id of the node to terminate. + cloud_instance_id: CloudInstanceId + # A unique id that identifies the request. + request_id: str + + +class NodeProviderAdapter(ICloudInstanceProvider): + """ + Warps a NodeProviderV1 to a ICloudInstanceProvider. + + TODO(rickyx): + The current adapter right now consists of two sets of APIs: + - v1: the old APIs that are used by the autoscaler, where + we forward the calls to the NodeProviderV1. + - v2: the new APIs that are used by the autoscaler v2, this is + defined in the ICloudInstanceProvider interface. + + We should eventually remove the v1 APIs and only use the v2 APIs. 
+ It's currently left as a TODO since changing the v1 APIs would + requires a lot of changes in the cluster launcher codebase. + """ + + def __init__( + self, + v1_provider: NodeProviderV1, + config_reader: IConfigReader, + max_launch_batch_per_type: int = AUTOSCALER_MAX_LAUNCH_BATCH, + max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES, + ) -> None: + """ + Args: + v1_provider: The v1 node provider to wrap. + config_reader: The config reader to read the autoscaling config. + max_launch_batch_per_type: The maximum number of nodes to launch per + node type in a single batch. + max_concurrent_launches: The maximum number of concurrent launches. + """ + + super().__init__() + self._v1_provider = v1_provider + self._config_reader = config_reader + # Executor to async launching and terminating nodes. + self._main_executor = ThreadPoolExecutor( + max_workers=1, thread_name_prefix="ray::NodeProviderAdapter" + ) + + # v1 legacy rate limiting on the node provider launch calls. + self._max_launch_batch_per_type = max_launch_batch_per_type + max_batches = math.ceil( + max_concurrent_launches / float(max_launch_batch_per_type) + ) + self._node_launcher_executors = ThreadPoolExecutor( + max_workers=max_batches, + thread_name_prefix="ray::NodeLauncherPool", + ) + + # Queue to retrieve new errors occur in the multi-thread executors + # temporarily. + self._errors_queue = Queue() + + def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]: + nodes = {} + + cloud_instance_ids = self._v1_non_terminated_nodes({}) + # Filter out nodes that are not running. + # This is efficient since the provider is expected to cache the + # running status of the nodes. + for cloud_instance_id in cloud_instance_ids: + node_tags = self._v1_node_tags(cloud_instance_id) + node_kind_tag = node_tags.get(TAG_RAY_NODE_KIND, NODE_KIND_UNMANAGED) + if node_kind_tag == NODE_KIND_UNMANAGED: + # Filter out unmanaged nodes. 
+ continue + elif node_kind_tag == NODE_KIND_WORKER: + node_kind = NodeKind.WORKER + elif node_kind_tag == NODE_KIND_HEAD: + node_kind = NodeKind.HEAD + else: + raise ValueError(f"Invalid node kind: {node_kind_tag}") + + nodes[cloud_instance_id] = CloudInstance( + cloud_instance_id=cloud_instance_id, + node_type=node_tags.get(TAG_RAY_USER_NODE_TYPE, ""), + is_running=self._v1_is_running(cloud_instance_id), + request_id=node_tags.get(TAG_RAY_LAUNCH_REQUEST, ""), + node_kind=node_kind, + ) + + return nodes + + def poll_errors(self) -> List[CloudInstanceProviderError]: + errors = [] + while not self._errors_queue.empty(): + errors.append(self._errors_queue.get_nowait()) + return errors + + def launch( + self, + shape: Dict[NodeType, int], + request_id: str, + ) -> None: + self._main_executor.submit(self._do_launch, shape, request_id) + + def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None: + self._main_executor.submit(self._do_terminate, ids, request_id) + + ########################################### + # Private APIs + ########################################### + + def _do_launch( + self, + shape: Dict[NodeType, int], + request_id: str, + ) -> None: + """ + Launch the cloud instances by calling into the v1 base node provider. + + Args: + shape: The requested to launch node type and number of nodes. + request_id: The request id that identifies the request. + """ + for node_type, count in shape.items(): + # Keep submitting the launch requests to the launch pool in batches. + while count > 0: + to_launch = min(count, self._max_launch_batch_per_type) + self._node_launcher_executors.submit( + self._launch_nodes_by_type, + node_type, + to_launch, + request_id, + ) + count -= to_launch + + def _do_terminate(self, ids: List[CloudInstanceId], request_id: str) -> None: + """ + Terminate the cloud instances by calling into the v1 base node provider. + + If errors happen during the termination, the errors will be put into the + errors queue. 
+ + Args: + ids: The cloud instance ids to terminate. + request_id: The request id that identifies the request. + """ + + try: + self._v1_terminate_nodes(ids) + except Exception as e: + for id in ids: + error = TerminateNodeError(id, request_id, int(time.time_ns())) + error.__cause__ = e + self._errors_queue.put(error) + + def _launch_nodes_by_type( + self, + node_type: NodeType, + count: int, + request_id: str, + ) -> None: + """ + Launch nodes of the given node type. + + Args: + node_type: The node type to launch. + count: Number of nodes to launch. + request_id: A unique id that identifies the request. + + Raises: + ValueError: If the node type is invalid. + LaunchNodeError: If the launch failed and raised by the underlying provider. + """ + # Check node type is valid. + try: + config = self._config_reader.get_cached_autoscaling_config() + launch_config = config.get_cloud_node_config(node_type) + resources = config.get_node_resources(node_type) + labels = config.get_node_labels(node_type) + + # This is to be compatible with the v1 node launcher. + # See more in https://github.com/ray-project/ray/blob/6f5a189bc463e52c51a70f8aea41fb2950b443e8/python/ray/autoscaler/_private/node_launcher.py#L78-L85 # noqa + # TODO: this should be synced with what's stored in the IM, it should + # probably be made as a metadata field in the cloud instance. This is + # another incompatibility with KubeRay. 
+ launch_hash = hash_launch_conf(launch_config, config.get_config("auth", {})) + node_tags = { + TAG_RAY_NODE_NAME: "ray-{}-worker".format( + config.get_config("cluster_name", "") + ), + TAG_RAY_NODE_KIND: NODE_KIND_WORKER, + TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED, + TAG_RAY_LAUNCH_CONFIG: launch_hash, + TAG_RAY_LAUNCH_REQUEST: request_id, + TAG_RAY_USER_NODE_TYPE: node_type, + } + + logger.info("Launching {} nodes of type {}.".format(count, node_type)) + self._v1_provider.create_node_with_resources_and_labels( + launch_config, node_tags, count, resources, labels + ) + logger.info("Launched {} nodes of type {}.".format(count, node_type)) + except Exception as e: + error = LaunchNodeError(node_type, count, request_id, int(time.time_ns())) + error.__cause__ = e + self._errors_queue.put(error) + + ########################################### + # V1 Legacy APIs + ########################################### + """ + Below are the necessary legacy APIs from the V1 node provider. + These are needed as of now to provide the needed features + for V2 node provider. + The goal is to eventually remove these APIs and only use the + V2 APIs by modifying the individual node provider to inherit + from ICloudInstanceProvider. 
+ """ + + def _v1_terminate_nodes( + self, ids: List[CloudInstanceId] + ) -> Optional[Dict[str, Any]]: + return self._v1_provider.terminate_nodes(ids) + + def _v1_non_terminated_nodes( + self, tag_filters: Dict[str, str] + ) -> List[CloudInstanceId]: + return self._v1_provider.non_terminated_nodes(tag_filters) + + def _v1_is_running(self, node_id: CloudInstanceId) -> bool: + return self._v1_provider.is_running(node_id) + + def _v1_post_process(self) -> None: + self._v1_provider.post_process() + + def _v1_node_tags(self, node_id: CloudInstanceId) -> Dict[str, str]: + return self._v1_provider.node_tags(node_id) + + def _v1_safe_to_scale(self) -> bool: + return self._v1_provider.safe_to_scale() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py new file mode 100644 index 0000000000000000000000000000000000000000..0356b252eadd01c8e435d1b26864ceef08a9793d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py @@ -0,0 +1,99 @@ +import dataclasses +import logging +import subprocess + +from ray.autoscaler._private.updater import NodeUpdater +from ray.autoscaler._private.util import with_envs, with_head_node_ip +from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1 +from ray.autoscaler.v2.instance_manager.config import AutoscalingConfig +from ray.core.generated.instance_manager_pb2 import Instance + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(frozen=True) +class RayInstallError: + # Instance manager's instance id. + im_instance_id: str + # Error details. + details: str + + +class RayInstaller(object): + """ + RayInstaller is responsible for installing ray on the target instance. 
+ """ + + def __init__( + self, + provider: NodeProviderV1, + config: AutoscalingConfig, + process_runner=subprocess, + ) -> None: + self._provider = provider + self._config = config + self._process_runner = process_runner + + def install_ray(self, instance: Instance, head_node_ip: str) -> bool: + """ + Install ray on the target instance synchronously. + TODO:(rickyx): This runs in another thread, and errors are silently + ignored. We should propagate the error to the main thread. + """ + setup_commands = self._config.get_worker_setup_commands(instance.instance_type) + ray_start_commands = self._config.get_worker_start_ray_commands() + docker_config = self._config.get_docker_config(instance.instance_type) + + logger.info( + f"Creating new (spawn_updater) updater thread for node" + f" {instance.cloud_instance_id}." + ) + provider_instance_type_name = self._config.get_provider_instance_type( + instance.instance_type + ) + updater = NodeUpdater( + node_id=instance.instance_id, + provider_config=self._config.get_config("provider"), + provider=self._provider, + auth_config=self._config.get_config("auth"), + cluster_name=self._config.get_config("cluster_name"), + file_mounts=self._config.get_config("file_mounts"), + initialization_commands=with_head_node_ip( + self._config.get_initialization_commands(instance.instance_type), + head_node_ip, + ), + setup_commands=with_head_node_ip(setup_commands, head_node_ip), + # This will prepend envs to the begin of the ray start commands, e.g. + # `RAY_HEAD_IP= \ + # RAY_CLOUD_INSTANCE_ID= \ + # ray start --head ...` + # See src/ray/common/constants.h for ENV name definitions. 
+ ray_start_commands=with_envs( + ray_start_commands, + { + "RAY_HEAD_IP": head_node_ip, + "RAY_CLOUD_INSTANCE_ID": instance.instance_id, + "RAY_NODE_TYPE_NAME": instance.instance_type, + "RAY_CLOUD_INSTANCE_TYPE_NAME": provider_instance_type_name, + }, + ), + runtime_hash=self._config.runtime_hash, + file_mounts_contents_hash=self._config.file_mounts_contents_hash, + is_head_node=False, + cluster_synced_files=self._config.get_config("cluster_synced_files"), + rsync_options={ + "rsync_exclude": self._config.get_config("rsync_exclude"), + "rsync_filter": self._config.get_config("rsync_filter"), + }, + use_internal_ip=True, + docker_config=docker_config, + node_resources=self._config.get_node_resources(instance.instance_type), + node_labels=self._config.get_node_labels(instance.instance_type), + process_runner=self._process_runner, + ) + try: + updater.run() + except Exception: + # Errors has already been handled. + return False + return True diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py new file mode 100644 index 0000000000000000000000000000000000000000..e13ebe7c07257fe6e92d4e1e0d782d194b7bdfeb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py @@ -0,0 +1,1565 @@ +import logging +import math +import time +import uuid +from collections import defaultdict +from typing import Dict, List, Optional, Set, Tuple + +from ray._private.utils import binary_to_hex +from ray.autoscaler.v2.instance_manager.common import InstanceUtil +from ray.autoscaler.v2.instance_manager.config import ( + AutoscalingConfig, + InstanceReconcileConfig, + Provider, +) +from ray.autoscaler.v2.instance_manager.instance_manager import InstanceManager +from ray.autoscaler.v2.instance_manager.node_provider import ( + CloudInstance, + CloudInstanceId, + CloudInstanceProviderError, + ICloudInstanceProvider, + 
LaunchNodeError, + TerminateNodeError, +) +from ray.autoscaler.v2.instance_manager.ray_installer import RayInstallError +from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError +from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter +from ray.autoscaler.v2.scheduler import IResourceScheduler, SchedulingRequest +from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType +from ray.autoscaler.v2.sdk import is_head_node +from ray.core.generated.autoscaler_pb2 import ( + AutoscalingState, + ClusterResourceState, + FailedInstanceRequest, + NodeState, + NodeStatus, + PendingInstance, + PendingInstanceRequest, +) +from ray.core.generated.instance_manager_pb2 import GetInstanceManagerStateRequest +from ray.core.generated.instance_manager_pb2 import Instance as IMInstance +from ray.core.generated.instance_manager_pb2 import ( + InstanceUpdateEvent as IMInstanceUpdateEvent, +) +from ray.core.generated.instance_manager_pb2 import ( + NodeKind, + StatusCode, + UpdateInstanceManagerStateRequest, +) + +logger = logging.getLogger(__name__) + + +class Reconciler: + """ + A singleton class that reconciles the instance states of the instance manager + for autoscaler. + + """ + + @staticmethod + def reconcile( + instance_manager: InstanceManager, + scheduler: IResourceScheduler, + cloud_provider: ICloudInstanceProvider, + ray_cluster_resource_state: ClusterResourceState, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + autoscaling_config: AutoscalingConfig, + cloud_provider_errors: Optional[List[CloudInstanceProviderError]] = None, + ray_install_errors: Optional[List[RayInstallError]] = None, + ray_stop_errors: Optional[List[RayStopError]] = None, + metrics_reporter: Optional[AutoscalerMetricsReporter] = None, + _logger: Optional[logging.Logger] = None, + ) -> AutoscalingState: + """ + The reconcile method computes InstanceUpdateEvents for the instance manager + by: + + 1. 
Reconciling the instance manager's instances with external states like + the cloud provider's, the ray cluster's states, the ray installer's results. + It performs "passive" status transitions for the instances (where the status + transition should only be reflecting the external states of the cloud provider + and the ray cluster, and should not be actively changing them) + + 2. Stepping the instances to the active states by computing instance status + transitions that are needed and updating the instance manager's state. + These transitions should be "active" where the transitions have side effects + (through InstanceStatusSubscriber) to the cloud provider and the ray cluster. + + Args: + instance_manager: The instance manager to reconcile. + ray_cluster_resource_state: The ray cluster's resource state. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + cloud_provider_errors: The errors from the cloud provider. + ray_install_errors: The errors from RayInstaller. + ray_stop_errors: The errors from RayStopper. + metrics_reporter: The metric reporter to report the autoscaler metrics. + _logger: The logger (for testing). 
+ + """ + cloud_provider_errors = cloud_provider_errors or [] + ray_install_errors = ray_install_errors or [] + ray_stop_errors = ray_stop_errors or [] + + autoscaling_state = AutoscalingState() + autoscaling_state.last_seen_cluster_resource_state_version = ( + ray_cluster_resource_state.cluster_resource_state_version + ) + Reconciler._sync_from( + instance_manager=instance_manager, + ray_nodes=ray_cluster_resource_state.node_states, + non_terminated_cloud_instances=non_terminated_cloud_instances, + cloud_provider_errors=cloud_provider_errors, + ray_install_errors=ray_install_errors, + ray_stop_errors=ray_stop_errors, + autoscaling_config=autoscaling_config, + ) + + Reconciler._step_next( + autoscaling_state=autoscaling_state, + instance_manager=instance_manager, + scheduler=scheduler, + cloud_provider=cloud_provider, + ray_cluster_resource_state=ray_cluster_resource_state, + non_terminated_cloud_instances=non_terminated_cloud_instances, + autoscaling_config=autoscaling_config, + _logger=_logger, + ) + + Reconciler._report_metrics( + instance_manager=instance_manager, + autoscaling_config=autoscaling_config, + metrics_reporter=metrics_reporter, + ) + + return autoscaling_state + + @staticmethod + def _sync_from( + instance_manager: InstanceManager, + ray_nodes: List[NodeState], + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + cloud_provider_errors: List[CloudInstanceProviderError], + ray_install_errors: List[RayInstallError], + ray_stop_errors: List[RayStopError], + autoscaling_config: AutoscalingConfig, + ): + """ + Reconcile the instance states of the instance manager from external states like + the cloud provider's, the ray cluster's states, the ray installer's results, + etc. + + For each instance, we try to figure out if we need to transition the instance + status to a new status, and if so, what the new status should be. 
+ + These transitions should be purely "passive", meaning they should only be + reflecting the external states of the cloud provider and the ray cluster, + and should not be actively changing the states of the cloud provider or the ray + cluster. + + More specifically, we will reconcile status transitions for: + 1. QUEUED/REQUESTED -> ALLOCATED: + When a instance with launch request id (indicating a previous launch + request was made) could be assigned to an unassigned cloud instance + of the same instance type. + 2. REQUESTED -> ALLOCATION_FAILED: + When there's an error from the cloud provider for launch failure so + that the instance becomes ALLOCATION_FAILED. + 3. * -> RAY_RUNNING: + When a ray node on a cloud instance joins the ray cluster, we will + transition the instance to RAY_RUNNING. + 4. * -> TERMINATED: + When the cloud instance is already terminated, we will transition the + instance to TERMINATED. + 5. TERMINATING -> TERMINATION_FAILED: + When there's an error from the cloud provider for termination failure. + 6. * -> RAY_STOPPED: + When ray was stopped on the cloud instance, we will transition the + instance to RAY_STOPPED. + 7. * -> RAY_INSTALL_FAILED: + When there's an error from RayInstaller. + 8. RAY_STOP_REQUESTED -> RAY_RUNNING: + When requested to stop ray, but failed to stop/drain the ray node + (e.g. idle termination drain rejected by the node). + + Args: + instance_manager: The instance manager to reconcile. + ray_nodes: The ray cluster's states of ray nodes. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + cloud_provider_errors: The errors from the cloud provider. + ray_install_errors: The errors from RayInstaller. + ray_stop_errors: The errors from RayStopper. + + """ + + # Handle 1 & 2 for cloud instance allocation. 
+ Reconciler._handle_cloud_instance_allocation( + instance_manager, + non_terminated_cloud_instances, + cloud_provider_errors, + ) + Reconciler._handle_cloud_instance_terminated( + instance_manager, non_terminated_cloud_instances + ) + + Reconciler._handle_cloud_instance_termination_errors( + instance_manager, cloud_provider_errors + ) + + Reconciler._handle_extra_cloud_instances( + instance_manager, non_terminated_cloud_instances, ray_nodes + ) + + Reconciler._handle_ray_status_transition( + instance_manager, ray_nodes, autoscaling_config + ) + + Reconciler._handle_ray_install_failed(instance_manager, ray_install_errors) + + Reconciler._handle_ray_stop_failed(instance_manager, ray_stop_errors, ray_nodes) + + @staticmethod + def _step_next( + autoscaling_state: AutoscalingState, + instance_manager: InstanceManager, + scheduler: IResourceScheduler, + cloud_provider: ICloudInstanceProvider, + ray_cluster_resource_state: ClusterResourceState, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + autoscaling_config: AutoscalingConfig, + _logger: Optional[logging.Logger] = None, + ): + """ + Step the reconciler to the next state by computing instance status transitions + that are needed and updating the instance manager's state. + + Specifically, we will: + 1. Shut down leak cloud instances + Leaked cloud instances that are not managed by the instance manager. + 2. Terminating instances with ray stopped or ray install failure. + 3. Scale down the cluster: + (* -> RAY_STOP_REQUESTED/TERMINATING) + b. Extra cloud due to max nodes config. + c. Cloud instances with outdated configs. + 4. Scale up the cluster: + (new QUEUED) + Create new instances based on the IResourceScheduler's decision for + scaling up. + 5. Request cloud provider to launch new instances. + (QUEUED -> REQUESTED) + 6. Install ray + (ALLOCATED -> RAY_INSTALLING) + When ray could be installed and launched. + 7. Handle any stuck instances with timeouts. 
+ + Args: + instance_manager: The instance manager to reconcile. + scheduler: The resource scheduler to make scaling decisions. + ray_cluster_resource_state: The ray cluster's resource state. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + autoscaling_config: The autoscaling config. + _logger: The logger (for testing). + + """ + + Reconciler._handle_stuck_instances( + instance_manager=instance_manager, + reconcile_config=autoscaling_config.get_instance_reconcile_config(), + _logger=_logger or logger, + ) + + Reconciler._scale_cluster( + autoscaling_state=autoscaling_state, + instance_manager=instance_manager, + ray_state=ray_cluster_resource_state, + scheduler=scheduler, + autoscaling_config=autoscaling_config, + ) + + Reconciler._handle_instances_launch( + instance_manager=instance_manager, autoscaling_config=autoscaling_config + ) + + Reconciler._terminate_instances(instance_manager=instance_manager) + if not autoscaling_config.disable_node_updaters(): + Reconciler._install_ray( + instance_manager=instance_manager, + non_terminated_cloud_instances=non_terminated_cloud_instances, + ) + + Reconciler._fill_autoscaling_state( + instance_manager=instance_manager, autoscaling_state=autoscaling_state + ) + + ####################################################### + # Utility methods for reconciling instance states. + ####################################################### + + @staticmethod + def _handle_cloud_instance_allocation( + instance_manager: InstanceManager, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + cloud_provider_errors: List[CloudInstanceProviderError], + ): + im_instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + + # Compute intermediate states. 
+ + instances_with_launch_requests: List[IMInstance] = [] + for instance in im_instances: + if instance.status != IMInstance.REQUESTED: + continue + + assert ( + instance.launch_request_id + ), "Instance in REQUESTED status should have launch_request_id set." + instances_with_launch_requests.append(instance) + + assigned_cloud_instance_ids: Set[CloudInstanceId] = { + instance.cloud_instance_id for instance in im_instances + } + launch_errors: Dict[str, LaunchNodeError] = { + error.request_id: error + for error in cloud_provider_errors + if isinstance(error, LaunchNodeError) + } + unassigned_cloud_instances_by_type: Dict[ + str, List[CloudInstance] + ] = defaultdict(list) + + for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items(): + if cloud_instance_id not in assigned_cloud_instance_ids: + unassigned_cloud_instances_by_type[cloud_instance.node_type].append( + cloud_instance + ) + + # Sort the request instance by the increasing request time. + instances_with_launch_requests.sort( + key=lambda instance: InstanceUtil.get_status_transition_times_ns( + instance, IMInstance.REQUESTED + ) + ) + + # For each instance, try to allocate or fail the allocation. + for instance in instances_with_launch_requests: + # Try allocate or fail with errors. + update_event = Reconciler._try_resolve_pending_allocation( + instance, unassigned_cloud_instances_by_type, launch_errors + ) + if not update_event: + continue + + updates[instance.instance_id] = update_event + + # Update the instance manager for the events. + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _try_resolve_pending_allocation( + im_instance: IMInstance, + unassigned_cloud_instances_by_type: Dict[str, List[CloudInstance]], + launch_errors: Dict[str, LaunchNodeError], + ) -> Optional[IMInstanceUpdateEvent]: + """ + Allocate, or fail the cloud instance allocation for the instance. + + Args: + im_instance: The instance to allocate or fail. 
+ unassigned_cloud_instances_by_type: The unassigned cloud instances by type. + launch_errors: The launch errors from the cloud provider. + + Returns: + Instance update to ALLOCATED: if there's a matching unassigned cloud + instance with the same type. + Instance update to ALLOCATION_FAILED: if the instance allocation failed + with errors. + None: if there's no update. + + """ + unassigned_cloud_instance = None + + # Try to allocate an unassigned cloud instance. + # TODO(rickyx): We could also look at the launch request id + # on the cloud node and the im instance later once all node providers + # support request id. For now, we only look at the instance type. + if len(unassigned_cloud_instances_by_type.get(im_instance.instance_type, [])): + unassigned_cloud_instance = unassigned_cloud_instances_by_type[ + im_instance.instance_type + ].pop() + + if unassigned_cloud_instance: + return IMInstanceUpdateEvent( + instance_id=im_instance.instance_id, + new_instance_status=IMInstance.ALLOCATED, + cloud_instance_id=unassigned_cloud_instance.cloud_instance_id, + node_kind=unassigned_cloud_instance.node_kind, + instance_type=unassigned_cloud_instance.node_type, + details=( + "allocated unassigned cloud instance " + f"{unassigned_cloud_instance.cloud_instance_id}" + ), + ) + + # If there's a launch error, transition to ALLOCATION_FAILED. + launch_error = launch_errors.get(im_instance.launch_request_id) + if launch_error and launch_error.node_type == im_instance.instance_type: + return IMInstanceUpdateEvent( + instance_id=im_instance.instance_id, + new_instance_status=IMInstance.ALLOCATION_FAILED, + details=f"launch failed with {str(launch_error)}", + ) + # No update. + return None + + @staticmethod + def _handle_ray_stop_failed( + instance_manager: InstanceManager, + ray_stop_errors: List[RayStopError], + ray_nodes: List[NodeState], + ): + """ + The instance requested to stop ray, but failed to stop/drain the ray node. + E.g. 
connection errors, idle termination drain rejected by the node. + + We will transition the instance back to RAY_RUNNING. + + Args: + instance_manager: The instance manager to reconcile. + ray_stop_errors: The errors from RayStopper. + + """ + instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + + ray_stop_errors_by_instance_id = { + error.im_instance_id: error for error in ray_stop_errors + } + + ray_nodes_by_ray_node_id = {binary_to_hex(n.node_id): n for n in ray_nodes} + + ray_stop_requested_instances = { + instance.instance_id: instance + for instance in instances + if instance.status == IMInstance.RAY_STOP_REQUESTED + } + + for instance_id, instance in ray_stop_requested_instances.items(): + stop_error = ray_stop_errors_by_instance_id.get(instance_id) + if not stop_error: + continue + + assert instance.node_id + ray_node = ray_nodes_by_ray_node_id.get(instance.node_id) + assert ray_node is not None and ray_node.status in [ + NodeStatus.RUNNING, + NodeStatus.IDLE, + ], ( + "There should be a running ray node for instance with ray stop " + "requested failed." + ) + + updates[instance_id] = IMInstanceUpdateEvent( + instance_id=instance_id, + new_instance_status=IMInstance.RAY_RUNNING, + details="failed to stop/drain ray", + ray_node_id=instance.node_id, + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _handle_ray_install_failed( + instance_manager: InstanceManager, ray_install_errors: List[RayInstallError] + ): + + instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + + # Get all instances with RAY_INSTALLING status. + instances_with_ray_installing = { + instance.instance_id: instance + for instance in instances + if instance.status == IMInstance.RAY_INSTALLING + } + + install_errors = {error.im_instance_id: error for error in ray_install_errors} + + # For each instance with RAY_INSTALLING status, check if there's any + # install error. 
+ for instance_id, instance in instances_with_ray_installing.items(): + install_error = install_errors.get(instance_id) + if install_error: + updates[instance_id] = IMInstanceUpdateEvent( + instance_id=instance_id, + new_instance_status=IMInstance.RAY_INSTALL_FAILED, + details=( + f"failed to install ray with errors: {install_error.details}" + ), + ) + + # Update the instance manager for the events. + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _handle_cloud_instance_terminated( + instance_manager: InstanceManager, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + ): + """ + For any IM (instance manager) instance with a cloud node id, if the mapped + cloud instance is no longer running, transition the instance to TERMINATED. + + Args: + instance_manager: The instance manager to reconcile. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + """ + updates = {} + instances, version = Reconciler._get_im_instances(instance_manager) + + non_terminated_instances_with_cloud_instance_assigned = { + instance.cloud_instance_id: instance + for instance in instances + if instance.cloud_instance_id and instance.status != IMInstance.TERMINATED + } + + for ( + cloud_instance_id, + instance, + ) in non_terminated_instances_with_cloud_instance_assigned.items(): + if cloud_instance_id in non_terminated_cloud_instances.keys(): + # The cloud instance is still running. + continue + + # The cloud instance is terminated. 
+ updates[instance.instance_id] = IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.TERMINATED, + details=f"cloud instance {cloud_instance_id} no longer found", + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _handle_cloud_instance_termination_errors( + instance_manager: InstanceManager, + cloud_provider_errors: List[CloudInstanceProviderError], + ): + """ + If any TERMINATING instances have termination errors, transition the instance to + TERMINATION_FAILED. + + We will retry the termination for the TERMINATION_FAILED instances in the next + reconciler step. + + Args: + instance_manager: The instance manager to reconcile. + cloud_provider_errors: The errors from the cloud provider. + + """ + instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + + termination_errors = { + error.cloud_instance_id: error + for error in cloud_provider_errors + if isinstance(error, TerminateNodeError) + } + + terminating_instances_by_cloud_instance_id = { + instance.cloud_instance_id: instance + for instance in instances + if instance.status == IMInstance.TERMINATING + } + + for cloud_instance_id, failure in termination_errors.items(): + instance = terminating_instances_by_cloud_instance_id.get(cloud_instance_id) + if not instance: + # The instance is no longer in TERMINATING status. 
+ continue + + updates[instance.instance_id] = IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.TERMINATION_FAILED, + details=f"termination failed: {str(failure)}", + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _get_im_instances( + instance_manager: InstanceManager, + ) -> Tuple[List[IMInstance], int]: + reply = instance_manager.get_instance_manager_state( + request=GetInstanceManagerStateRequest() + ) + assert reply.status.code == StatusCode.OK + im_state = reply.state + return im_state.instances, im_state.version + + @staticmethod + def _update_instance_manager( + instance_manager: InstanceManager, + version: int, + updates: Dict[str, IMInstanceUpdateEvent], + ) -> None: + if not updates: + return + + updates = list(updates.values()) or [] + + reply = instance_manager.update_instance_manager_state( + request=UpdateInstanceManagerStateRequest( + expected_version=version, + updates=updates, + ) + ) + # TODO: While it's possible that a version mismatch + # happens, or some other failures could happen. But given + # the current implementation: + # 1. There's only 1 writer (the reconciler) for updating the instance + # manager states, so there shouldn't be version mismatch. + # 2. Any failures in one reconciler step should be caught at a higher + # level and be retried in the next reconciler step. If the IM + # fails to be updated, we don't have sufficient info to handle it + # here. + assert ( + reply.status.code == StatusCode.OK + ), f"Failed to update instance manager: {reply}" + + @staticmethod + def _handle_ray_status_transition( + instance_manager: InstanceManager, + ray_nodes: List[NodeState], + autoscaling_config: AutoscalingConfig, + ): + """ + Handle the ray status transition for the instance manager. + + If a new ray node running on the instance, transition it to RAY_RUNNING. + If a ray node stopped, transition it to RAY_STOPPED. 
+ If a ray node is draining, transition it to RAY_STOPPING. + + Args: + instance_manager: The instance manager to reconcile. + ray_nodes: The ray cluster's states of ray nodes. + """ + instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + + im_instances_by_cloud_instance_id = { + i.cloud_instance_id: i for i in instances if i.cloud_instance_id + } + ray_nodes_by_cloud_instance_id = {} + for n in ray_nodes: + if n.instance_id: + ray_nodes_by_cloud_instance_id[n.instance_id] = n + else: + if autoscaling_config.provider == Provider.READ_ONLY: + # We will use the node id as the cloud instance id for read-only + # provider. + ray_nodes_by_cloud_instance_id[binary_to_hex(n.node_id)] = n + else: + # This should only happen to a ray node that's not managed by us. + logger.warning( + f"Ray node {binary_to_hex(n.node_id)} has no instance id. " + "This only happens to a ray node not managed by autoscaler. " + "If not, please file a bug at " + "https://github.com/ray-project/ray" + ) + + for cloud_instance_id, ray_node in ray_nodes_by_cloud_instance_id.items(): + assert cloud_instance_id in im_instances_by_cloud_instance_id, ( + f"Ray node {binary_to_hex(ray_node.node_id)} has no matching " + f"instance with cloud instance id={cloud_instance_id}. We should " + "not see a ray node with cloud instance id not found in IM since " + "we have reconciled all cloud instances, and ray nodes by now." 
+ ) + + im_instance = im_instances_by_cloud_instance_id[cloud_instance_id] + reconciled_im_status = Reconciler._reconciled_im_status_from_ray_status( + ray_node.status, im_instance.status + ) + + if reconciled_im_status != im_instance.status: + updates[im_instance.instance_id] = IMInstanceUpdateEvent( + instance_id=im_instance.instance_id, + new_instance_status=reconciled_im_status, + details=( + f"ray node {binary_to_hex(ray_node.node_id)} is " + f"{NodeStatus.Name(ray_node.status)}" + ), + ray_node_id=binary_to_hex(ray_node.node_id), + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _reconciled_im_status_from_ray_status( + ray_status: NodeStatus, cur_im_status: IMInstance.InstanceStatus + ) -> "IMInstance.InstanceStatus": + """ + Reconcile the instance status from the ray node status. + Args: + ray_status: the current ray node status. + cur_im_status: the current IM instance status. + Returns: + The reconciled IM instance status + + Raises: + ValueError: If the ray status is unknown. + """ + reconciled_im_status = None + if ray_status in [NodeStatus.RUNNING, NodeStatus.IDLE]: + reconciled_im_status = IMInstance.RAY_RUNNING + elif ray_status == NodeStatus.DEAD: + reconciled_im_status = IMInstance.RAY_STOPPED + elif ray_status == NodeStatus.DRAINING: + reconciled_im_status = IMInstance.RAY_STOPPING + else: + raise ValueError(f"Unknown ray status: {ray_status}") + + if ( + cur_im_status == reconciled_im_status + or cur_im_status + in InstanceUtil.get_reachable_statuses(reconciled_im_status) + ): + # No need to reconcile if the instance is already in the reconciled status + # or has already transitioned beyond it. 
+ return cur_im_status + + return reconciled_im_status + + @staticmethod + def _handle_instances_launch( + instance_manager: InstanceManager, autoscaling_config: AutoscalingConfig + ): + + instances, version = Reconciler._get_im_instances(instance_manager) + + queued_instances = [] + requested_instances = [] + allocated_instances = [] + + for instance in instances: + if instance.status == IMInstance.QUEUED: + queued_instances.append(instance) + elif instance.status == IMInstance.REQUESTED: + requested_instances.append(instance) + elif instance.cloud_instance_id: + allocated_instances.append(instance) + + if not queued_instances: + # No QUEUED instances + return + + to_launch = Reconciler._compute_to_launch( + queued_instances, + requested_instances, + allocated_instances, + autoscaling_config.get_upscaling_speed(), + autoscaling_config.get_max_concurrent_launches(), + ) + + # Transition the instances to REQUESTED for instance launcher to + # launch them. + updates = {} + new_launch_request_id = str(uuid.uuid4()) + for instance_type, instances in to_launch.items(): + for instance in instances: + # Reuse launch request id for any QUEUED instances that have been + # requested before due to retry. 
+ launch_request_id = ( + new_launch_request_id + if len(instance.launch_request_id) == 0 + else instance.launch_request_id + ) + updates[instance.instance_id] = IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.REQUESTED, + launch_request_id=launch_request_id, + instance_type=instance_type, + details=( + f"requested to launch {instance_type} with request id " + f"{launch_request_id}" + ), + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _compute_to_launch( + queued_instances: List[IMInstance], + requested_instances: List[IMInstance], + allocated_instances: List[IMInstance], + upscaling_speed: float, + max_concurrent_launches: int, + ) -> Dict[NodeType, List[IMInstance]]: + def _group_by_type(instances): + instances_by_type = defaultdict(list) + for instance in instances: + instances_by_type[instance.instance_type].append(instance) + return instances_by_type + + # Sort the instances by the time they were queued. 
+ def _sort_by_earliest_queued(instance: IMInstance) -> List[int]: + queue_times = InstanceUtil.get_status_transition_times_ns( + instance, IMInstance.QUEUED + ) + return sorted(queue_times) + + queued_instances_by_type = _group_by_type(queued_instances) + requested_instances_by_type = _group_by_type(requested_instances) + allocated_instances_by_type = _group_by_type(allocated_instances) + + total_num_requested_to_launch = len(requested_instances) + all_to_launch: Dict[NodeType : List[IMInstance]] = defaultdict(list) + + for ( + instance_type, + queued_instances_for_type, + ) in queued_instances_by_type.items(): + requested_instances_for_type = requested_instances_by_type.get( + instance_type, [] + ) + allocated_instances_for_type = allocated_instances_by_type.get( + instance_type, [] + ) + + num_desired_to_upscale = max( + 1, + math.ceil( + upscaling_speed + * ( + len(requested_instances_for_type) + + len(allocated_instances_for_type) + ) + ), + ) + + # Enforce global limit, at most we can launch `max_concurrent_launches` + num_to_launch = min( + max_concurrent_launches - total_num_requested_to_launch, + num_desired_to_upscale, + ) + + # Cap both ends 0 <= num_to_launch <= num_queued + num_to_launch = max(0, num_to_launch) + num_to_launch = min(len(queued_instances_for_type), num_to_launch) + + to_launch = sorted(queued_instances_for_type, key=_sort_by_earliest_queued)[ + :num_to_launch + ] + + all_to_launch[instance_type].extend(to_launch) + total_num_requested_to_launch += num_to_launch + + return all_to_launch + + @staticmethod + def _handle_stuck_instances( + instance_manager: InstanceManager, + reconcile_config: InstanceReconcileConfig, + _logger: logging.Logger, + ): + """ + Handle stuck instances with timeouts. + + Instances could be stuck in the following status and needs to be updated: + - REQUESTED: cloud provider is slow/fails to launch instances. + - ALLOCATED: ray fails to be started on the instance. 
+ - RAY_INSTALLING: ray fails to be installed on the instance. + - TERMINATING: cloud provider is slow/fails to terminate instances. + + Instances could be in the following status which could be unbounded or + transient, and we don't have a timeout mechanism to handle them. We would + warn if they are stuck for too long: + - RAY_STOPPING: ray taking time to drain. + - QUEUED: cloud provider is slow to launch instances, resulting in long + queue. + + Reconciler should handle below statuses, if not, could be slow + reconcilation loop or a bug: + - RAY_INSTALL_FAILED + - RAY_STOPPED + - TERMINATION_FAILED + + + Args: + instance_manager: The instance manager to reconcile. + reconcile_config: The instance reconcile config. + _logger: The logger to log the warning messages. It's used for testing. + + """ + instances, version = Reconciler._get_im_instances(instance_manager) + + instances_by_status = defaultdict(list) + for instance in instances: + instances_by_status[instance.status].append(instance) + + im_updates = {} + + # Fail or retry the cloud instance allocation if it's stuck + # in the REQUESTED state. + for instance in instances_by_status[IMInstance.REQUESTED]: + update = Reconciler._handle_stuck_requested_instance( + instance, + reconcile_config.request_status_timeout_s, + reconcile_config.max_num_retry_request_to_allocate, + ) + if update: + im_updates[instance.instance_id] = update + + # Leaked ALLOCATED instances should be terminated. + # This usually happens when ray fails to be started on the instance, so + # it's unable to be RAY_RUNNING after a long time. 
+ for instance in instances_by_status[IMInstance.ALLOCATED]: + assert ( + instance.cloud_instance_id + ), "cloud instance id should be set on ALLOCATED instance" + update = Reconciler._handle_stuck_instance( + instance, + reconcile_config.allocate_status_timeout_s, + new_status=IMInstance.TERMINATING, + cloud_instance_id=instance.cloud_instance_id, + ) + if update: + im_updates[instance.instance_id] = update + + # Fail the installation if it's stuck in RAY_INSTALLING for too long. + # If RAY_INSTALLING is stuck for too long, it's likely that the instance + # is not able to install ray, so we should also fail the installation. + for instance in instances_by_status[IMInstance.RAY_INSTALLING]: + update = Reconciler._handle_stuck_instance( + instance, + reconcile_config.ray_install_status_timeout_s, + new_status=IMInstance.RAY_INSTALL_FAILED, + ) + if update: + im_updates[instance.instance_id] = update + + # If we tried to terminate the instance, but it doesn't terminate (disappear + # from the cloud provider) after a long time, we fail the termination. + # This will trigger another attempt to terminate the instance. + for instance in instances_by_status[IMInstance.TERMINATING]: + update = Reconciler._handle_stuck_instance( + instance, + reconcile_config.terminating_status_timeout_s, + new_status=IMInstance.TERMINATION_FAILED, + ) + if update: + im_updates[instance.instance_id] = update + + # If we tried to stop ray on the instance, but it doesn't stop after a long + # time, we will transition it back to RAY_RUNNING as the stop/drain somehow + # failed. If it had succeed, we should have transitioned it to RAY_STOPPING + # or RAY_STOPPED. 
+ for instance in instances_by_status[IMInstance.RAY_STOP_REQUESTED]: + update = Reconciler._handle_stuck_instance( + instance, + reconcile_config.ray_stop_requested_status_timeout_s, + new_status=IMInstance.RAY_RUNNING, + ray_node_id=instance.node_id, + ) + if update: + im_updates[instance.instance_id] = update + + # These statues could be unbounded or transient, and we don't have a timeout + # mechanism to handle them. We only warn if they are stuck for too long. + for status in [ + # Ray taking time to drain. We could also have a timeout when Drain protocol + # supports timeout. + IMInstance.RAY_STOPPING, + # These should just be transient, we will terminate instances with this + # status in the next reconciler step. + IMInstance.RAY_INSTALL_FAILED, + IMInstance.RAY_STOPPED, + IMInstance.TERMINATION_FAILED, + # Instances could be in the QUEUED status for a long time if the cloud + # provider is slow to launch instances. + IMInstance.QUEUED, + ]: + Reconciler._warn_stuck_instances( + instances_by_status[status], + status=status, + warn_interval_s=reconcile_config.transient_status_warn_interval_s, + logger=_logger, + ) + + Reconciler._update_instance_manager(instance_manager, version, im_updates) + + @staticmethod + def _warn_stuck_instances( + instances: List[IMInstance], + status: IMInstance.InstanceStatus, + warn_interval_s: int, + logger: logging.Logger, + ): + """Warn if any instance is stuck in a transient/unbounded status for too + long. 
+ """ + for instance in instances: + status_times_ns = InstanceUtil.get_status_transition_times_ns( + instance, select_instance_status=status + ) + assert len(status_times_ns) >= 1 + status_time_ns = sorted(status_times_ns)[-1] + + if time.time_ns() - status_time_ns > warn_interval_s * 1e9: + logger.warning( + "Instance {}({}) is stuck in {} for {} seconds.".format( + instance.instance_id, + IMInstance.InstanceStatus.Name(instance.status), + IMInstance.InstanceStatus.Name(status), + (time.time_ns() - status_time_ns) // 1e9, + ) + ) + + @staticmethod + def _is_head_node_running(instance_manager: InstanceManager) -> bool: + """ + Check if the head node is running and ready. + + If we scale up the cluster before head node is running, + it would cause issues when launching the worker nodes. + + There are corner cases when the GCS is up (so the ray cluster resource + state is retrievable from the GCS), but the head node's raylet is not + running so the head node is missing from the reported nodes. This happens + when the head node is still starting up, or the raylet is not running + due to some issues, and this would yield false. + + Args: + instance_manager: The instance manager to reconcile. + + Returns: + True if the head node is running and ready, False otherwise. + """ + + im_instances, _ = Reconciler._get_im_instances(instance_manager) + + for instance in im_instances: + if instance.node_kind == NodeKind.HEAD: + if instance.status == IMInstance.RAY_RUNNING: + return True + return False + + @staticmethod + def _scale_cluster( + autoscaling_state: AutoscalingState, + instance_manager: InstanceManager, + ray_state: ClusterResourceState, + scheduler: IResourceScheduler, + autoscaling_config: AutoscalingConfig, + ) -> None: + """ + Scale the cluster based on the resource state and the resource scheduler's + decision: + + - It launches new instances if needed. 
+ - It terminates extra ray nodes if they should be shut down (preemption + or idle termination) + + Args: + autoscaling_state: The autoscaling state to reconcile. + instance_manager: The instance manager to reconcile. + ray_state: The ray cluster's resource state. + scheduler: The resource scheduler to make scaling decisions. + autoscaling_config: The autoscaling config. + + """ + + # Get the current instance states. + im_instances, version = Reconciler._get_im_instances(instance_manager) + + autoscaler_instances = [] + ray_nodes_by_id = { + binary_to_hex(node.node_id): node for node in ray_state.node_states + } + + for im_instance in im_instances: + ray_node = ray_nodes_by_id.get(im_instance.node_id) + autoscaler_instances.append( + AutoscalerInstance( + ray_node=ray_node, + im_instance=im_instance, + cloud_instance_id=( + im_instance.cloud_instance_id + if im_instance.cloud_instance_id + else None + ), + ) + ) + + # TODO(rickyx): We should probably name it as "Planner" or "Scaler" + # or "ClusterScaler" + sched_request = SchedulingRequest( + node_type_configs=autoscaling_config.get_node_type_configs(), + max_num_nodes=autoscaling_config.get_max_num_nodes(), + resource_requests=ray_state.pending_resource_requests, + gang_resource_requests=ray_state.pending_gang_resource_requests, + cluster_resource_constraints=ray_state.cluster_resource_constraints, + current_instances=autoscaler_instances, + idle_timeout_s=autoscaling_config.get_idle_timeout_s(), + disable_launch_config_check=( + autoscaling_config.disable_launch_config_check() + ), + ) + + # Ask scheduler for updates to the cluster shape. + reply = scheduler.schedule(sched_request) + + # Populate the autoscaling state. 
+ autoscaling_state.infeasible_resource_requests.extend( + reply.infeasible_resource_requests + ) + autoscaling_state.infeasible_gang_resource_requests.extend( + reply.infeasible_gang_resource_requests + ) + autoscaling_state.infeasible_cluster_resource_constraints.extend( + reply.infeasible_cluster_resource_constraints + ) + + if not Reconciler._is_head_node_running(instance_manager): + # We shouldn't be scaling the cluster until the head node is ready. + # This could happen when the head node (i.e. the raylet) is still + # pending registration even though GCS is up. + # We will wait until the head node is running and ready to avoid + # scaling the cluster from min worker nodes constraint. + return + + if autoscaling_config.provider == Provider.READ_ONLY: + # We shouldn't be scaling the cluster if the provider is read-only. + return + + # Scale the clusters if needed. + to_launch = reply.to_launch + to_terminate = reply.to_terminate + updates = {} + # Add terminating instances. + for terminate_request in to_terminate: + instance_id = terminate_request.instance_id + updates[terminate_request.instance_id] = IMInstanceUpdateEvent( + instance_id=instance_id, + new_instance_status=IMInstance.RAY_STOP_REQUESTED, + termination_request=terminate_request, + details=f"draining ray: {terminate_request.details}", + ) + + # Add new instances. 
+ for launch_request in to_launch: + for _ in range(launch_request.count): + instance_id = InstanceUtil.random_instance_id() + updates[instance_id] = IMInstanceUpdateEvent( + instance_id=instance_id, + new_instance_status=IMInstance.QUEUED, + instance_type=launch_request.instance_type, + upsert=True, + details=( + f"queuing new instance of {launch_request.instance_type} " + "from scheduler" + ), + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _terminate_instances(instance_manager: InstanceManager): + """ + Terminate instances with the below statuses: + - RAY_STOPPED: ray was stopped on the cloud instance. + - RAY_INSTALL_FAILED: ray installation failed on the cloud instance, + we will not retry. + - TERMINATION_FAILED: cloud provider failed to terminate the instance + or timeout for termination happened, we will retry again. + + Args: + instance_manager: The instance manager to reconcile. + """ + + im_instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + for instance in im_instances: + if instance.status not in [ + IMInstance.RAY_STOPPED, + IMInstance.RAY_INSTALL_FAILED, + IMInstance.TERMINATION_FAILED, + ]: + continue + + # Terminate the instance. + updates[instance.instance_id] = IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.TERMINATING, + cloud_instance_id=instance.cloud_instance_id, + details="terminating instance from " + f"{IMInstance.InstanceStatus.Name(instance.status)}", + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _install_ray( + instance_manager: InstanceManager, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + ) -> None: + """ + Install ray on the allocated instances when it's ready (cloud instance + should be running) + + This is needed if ray installation needs to be performed by + the instance manager. 
+ + Args: + instance_manager: The instance manager to reconcile. + """ + im_instances, version = Reconciler._get_im_instances(instance_manager) + updates = {} + for instance in im_instances: + if instance.status != IMInstance.ALLOCATED: + continue + + if instance.node_kind == NodeKind.HEAD: + # Skip head node. + continue + + cloud_instance = non_terminated_cloud_instances.get( + instance.cloud_instance_id + ) + + assert cloud_instance, ( + f"Cloud instance {instance.cloud_instance_id} is not found " + "in non_terminated_cloud_instances." + ) + + if not cloud_instance.is_running: + # It might still be pending (e.g. setting up ssh) + continue + + # Install ray on the running cloud instance + updates[instance.instance_id] = IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.RAY_INSTALLING, + details="installing ray", + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _fill_autoscaling_state( + instance_manager: InstanceManager, + autoscaling_state: AutoscalingState, + ) -> None: + + # Use the IM instance version for the autoscaler_state_version + instances, version = Reconciler._get_im_instances(instance_manager) + autoscaling_state.autoscaler_state_version = version + + # Group instances by status + instances_by_status = defaultdict(list) + for instance in instances: + instances_by_status[instance.status].append(instance) + + # Pending instance requests + instances_by_launch_request = defaultdict(list) + queued_instances = [] + for instance in ( + instances_by_status[IMInstance.REQUESTED] + + instances_by_status[IMInstance.QUEUED] + ): + if instance.launch_request_id: + instances_by_launch_request[instance.launch_request_id].append(instance) + else: + queued_instances.append(instance) + + for _, instances in instances_by_launch_request.items(): + num_instances_by_type = defaultdict(int) + for instance in instances: + num_instances_by_type[instance.instance_type] += 1 + + # 
All instances with same request id should have the same + # request time. + request_update = InstanceUtil.get_last_status_transition( + instances[0], IMInstance.REQUESTED + ) + request_time_ns = request_update.timestamp_ns if request_update else 0 + + for instance_type, count in num_instances_by_type.items(): + autoscaling_state.pending_instance_requests.append( + PendingInstanceRequest( + ray_node_type_name=instance_type, + count=int(count), + request_ts=int(request_time_ns // 1e9), + ) + ) + + # Pending instances + for instance in ( + instances_by_status[IMInstance.ALLOCATED] + + instances_by_status[IMInstance.RAY_INSTALLING] + ): + + status_history = sorted( + instance.status_history, key=lambda x: x.timestamp_ns, reverse=True + ) + autoscaling_state.pending_instances.append( + PendingInstance( + instance_id=instance.instance_id, + ray_node_type_name=instance.instance_type, + details=status_history[0].details, + ) + ) + + # Failed instance requests + for instance in instances_by_status[IMInstance.ALLOCATION_FAILED]: + request_status_update = InstanceUtil.get_last_status_transition( + instance, IMInstance.REQUESTED + ) + failed_status_update = InstanceUtil.get_last_status_transition( + instance, IMInstance.ALLOCATION_FAILED + ) + failed_time = ( + failed_status_update.timestamp_ns if failed_status_update else 0 + ) + request_time = ( + request_status_update.timestamp_ns if request_status_update else 0 + ) + autoscaling_state.failed_instance_requests.append( + FailedInstanceRequest( + ray_node_type_name=instance.instance_type, + start_ts=int(request_time // 1e9), + failed_ts=int( + failed_time // 1e9, + ), + reason=failed_status_update.details, + count=1, + ) + ) + + @staticmethod + def _handle_stuck_requested_instance( + instance: IMInstance, timeout_s: int, max_num_retry_request_to_allocate: int + ) -> Optional[IMInstanceUpdateEvent]: + """ + Fail the cloud instance allocation if it's stuck in the REQUESTED state. + + Args: + instance: The instance to handle. 
+ timeout_s: The timeout in seconds. + max_num_retry_request_to_allocate: The maximum number of times an instance + could be requested to allocate. + + Returns: + Instance update to ALLOCATION_FAILED: if the instance allocation failed + with errors. + None: if there's no update. + + """ + if not InstanceUtil.has_timeout(instance, timeout_s): + # Not timeout yet, be patient. + return None + + all_request_times_ns = sorted( + InstanceUtil.get_status_transition_times_ns( + instance, select_instance_status=IMInstance.REQUESTED + ) + ) + + # Fail the allocation if we have tried too many times. + if len(all_request_times_ns) > max_num_retry_request_to_allocate: + return IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.ALLOCATION_FAILED, + details=( + "failed to allocate cloud instance after " + f"{len(all_request_times_ns)} attempts > " + f"max_num_retry_request_to_allocate={max_num_retry_request_to_allocate}" # noqa + ), + ) + + # Retry the allocation if we could by transitioning to QUEUED again. + return IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=IMInstance.QUEUED, + details=f"queue again to launch after timeout={timeout_s}s", + ) + + @staticmethod + def _handle_stuck_instance( + instance: IMInstance, + timeout_s: int, + new_status: IMInstance.InstanceStatus, + **update_kwargs: Dict, + ) -> Optional[IMInstanceUpdateEvent]: + """ + Fail the instance if it's stuck in the status for too long. + + Args: + instance: The instance to handle. + timeout_s: The timeout in seconds. + new_status: The new status to transition to. + update_kwargs: The update kwargs for InstanceUpdateEvent + + Returns: + Instance update to the new status: if the instance is stuck in the status + for too long. + None: if there's no update. + + """ + if not InstanceUtil.has_timeout(instance, timeout_s): + # Not timeout yet, be patient. 
+ return None + + return IMInstanceUpdateEvent( + instance_id=instance.instance_id, + new_instance_status=new_status, + details=f"timeout={timeout_s}s at status " + f"{IMInstance.InstanceStatus.Name(instance.status)}", + **update_kwargs, + ) + + @staticmethod + def _handle_extra_cloud_instances( + instance_manager: InstanceManager, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + ray_nodes: List[NodeState], + ): + """ + For extra cloud instances (i.e. cloud instances that are non terminated as + returned by cloud provider, but not managed by the instance manager), we + will create new IM instances with ALLOCATED status. + + Such instances could either be: + 1. Leaked instances that are incorrectly started by the cloud instance + provider, and they would be terminated eventually if they fail to + transition to RAY_RUNNING by stuck instances reconciliation, or they + would join the ray cluster and be terminated when the cluster scales down. + 2. Instances that are started by the cloud instance provider intentionally + but not yet discovered by the instance manager. This could happen for + a. Head node that's started before the autoscaler. + b. Worker nodes that's started by the cloud provider upon users' + actions: i.e. KubeRay scaling up the cluster with ray cluster config + change. + 3. Ray nodes with cloud instance id not in the cloud provider. This could + happen if there's delay in the Ray's state (i.e. cloud instance already + terminated, but the ray node is still not dead yet). + + Args: + instance_manager: The instance manager to reconcile. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + ray_nodes: The ray cluster's states of ray nodes. 
+ """ + Reconciler._handle_extra_cloud_instances_from_cloud_provider( + instance_manager, non_terminated_cloud_instances + ) + Reconciler._handle_extra_cloud_instances_from_ray_nodes( + instance_manager, ray_nodes + ) + + @staticmethod + def _handle_extra_cloud_instances_from_cloud_provider( + instance_manager: InstanceManager, + non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance], + ): + """ + For extra cloud instances that are not managed by the instance manager but + are running in the cloud provider, we will create new IM instances with + ALLOCATED status. + + Args: + instance_manager: The instance manager to reconcile. + non_terminated_cloud_instances: The non-terminated cloud instances from + the cloud provider. + """ + updates = {} + + instances, version = Reconciler._get_im_instances(instance_manager) + cloud_instance_ids_managed_by_im = { + instance.cloud_instance_id + for instance in instances + if instance.cloud_instance_id + } + + # Find the extra cloud instances that are not managed by the instance manager. + for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items(): + if cloud_instance_id in cloud_instance_ids_managed_by_im: + continue + updates[cloud_instance_id] = IMInstanceUpdateEvent( + instance_id=InstanceUtil.random_instance_id(), # Assign a new id. 
+ cloud_instance_id=cloud_instance_id, + new_instance_status=IMInstance.ALLOCATED, + node_kind=cloud_instance.node_kind, + instance_type=cloud_instance.node_type, + details=( + "allocated unmanaged cloud instance :" + f"{cloud_instance.cloud_instance_id} " + f"({NodeKind.Name(cloud_instance.node_kind)}) from cloud provider" + ), + upsert=True, + ) + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _handle_extra_cloud_instances_from_ray_nodes( + instance_manager: InstanceManager, ray_nodes: List[NodeState] + ): + """ + For extra cloud instances reported by Ray but not managed by the instance + manager, we will create new IM instances with ALLOCATED status. + + Args: + instance_manager: The instance manager to reconcile. + ray_nodes: The ray cluster's states of ray nodes. + """ + updates = {} + + instances, version = Reconciler._get_im_instances(instance_manager) + cloud_instance_ids_managed_by_im = { + instance.cloud_instance_id + for instance in instances + if instance.cloud_instance_id + } + + for ray_node in ray_nodes: + if not ray_node.instance_id: + continue + + cloud_instance_id = ray_node.instance_id + if cloud_instance_id in cloud_instance_ids_managed_by_im: + continue + + is_head = is_head_node(ray_node) + updates[cloud_instance_id] = IMInstanceUpdateEvent( + instance_id=InstanceUtil.random_instance_id(), # Assign a new id. 
+ cloud_instance_id=cloud_instance_id, + new_instance_status=IMInstance.ALLOCATED, + node_kind=NodeKind.HEAD if is_head else NodeKind.WORKER, + instance_type=ray_node.ray_node_type_name, + details=( + "allocated unmanaged worker cloud instance from ray node: " + f"{binary_to_hex(ray_node.node_id)}" + ), + upsert=True, + ) + + Reconciler._update_instance_manager(instance_manager, version, updates) + + @staticmethod + def _report_metrics( + instance_manager: InstanceManager, + autoscaling_config: AutoscalingConfig, + metrics_reporter: Optional[AutoscalerMetricsReporter] = None, + ): + if not metrics_reporter: + return + + instances, _ = Reconciler._get_im_instances(instance_manager) + node_type_configs = autoscaling_config.get_node_type_configs() + + metrics_reporter.report_instances(instances, node_type_configs) + metrics_reporter.report_resources(instances, node_type_configs) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..40447bf796962de488a8f11add7f3f0786dace3a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py @@ -0,0 +1,180 @@ +import copy +from abc import ABCMeta, abstractmethod +from collections import defaultdict, namedtuple +from threading import Lock +from typing import Dict, List, Optional, Tuple + +StoreStatus = namedtuple("StoreStatus", ["success", "version"]) +VersionedValue = namedtuple("VersionedValue", ["value", "version"]) + + +class Storage(metaclass=ABCMeta): + """Interface for a storage backend that stores the state of nodes in the cluster. + + The storage is thread-safe. + + The storage is versioned, which means that each successful stage change to the + storage will bump the version number. The version number can be used to + implement optimistic concurrency control. 
+ + Each entry in the storage table is also versioned. The version number of an entry + is the last version number when the entry is updated. + """ + + @abstractmethod + def batch_update( + self, + table: str, + mutation: Optional[Dict[str, str]] = None, + deletion: Optional[List[str]] = None, + expected_storage_version: Optional[int] = None, + ) -> StoreStatus: + """Batch update the storage table. This method is atomic. + + Args: + table: The name of the table. + mutation: A dictionary of key-value pairs to be updated. + deletion: A list of keys to be deleted. + expected_storage_version: The expected storage version. The + update will fail if the version does not match the + current storage version. + + Returns: + StoreStatus: A tuple of (success, version). If the update is + successful, returns (True, new_version). + Otherwise, returns (False, current_version). + """ + raise NotImplementedError("batch_update() has to be implemented") + + @abstractmethod + def update( + self, + table: str, + key: str, + value: str, + expected_entry_version: Optional[int] = None, + insert_only: bool = False, + ) -> StoreStatus: + """Update a single entry in the storage table. + + Args: + table: The name of the table. + key: The key of the entry. + value: The value of the entry. + expected_entry_version: The expected version of the entry. + The update will fail if the version does not match the current + version of the entry. + insert_only: If True, the update will + fail if the entry already exists. + Returns: + StoreStatus: A tuple of (success, version). If the update is + successful, returns (True, new_version). Otherwise, + returns (False, current_version). 
+ """ + raise NotImplementedError("update() has to be implemented") + + @abstractmethod + def get_all(self, table: str) -> Tuple[Dict[str, Tuple[str, int]], int]: + raise NotImplementedError("get_all() has to be implemented") + + @abstractmethod + def get( + self, table: str, keys: List[str] + ) -> Tuple[Dict[str, Tuple[str, int]], int]: + """Get a list of entries from the storage table. + + Args: + table: The name of the table. + keys: A list of keys to be retrieved. If the list is empty, + all entries in the table will be returned. + + Returns: + Tuple[Dict[str, VersionedValue], int]: A tuple of + (entries, storage_version). The entries is a dictionary of + (key, (value, entry_version)) pairs. The entry_version is the + version of the entry when it was last updated. The + storage_version is the current storage version. + """ + raise NotImplementedError("get() has to be implemented") + + @abstractmethod + def get_version(self) -> int: + """Get the current storage version. + + Returns: + int: The current storage version. + """ + raise NotImplementedError("get_version() has to be implemented") + + +class InMemoryStorage(Storage): + """An in-memory implementation of the Storage interface. 
This implementation + is not durable""" + + def __init__(self): + self._version = 0 + self._tables = defaultdict(dict) + self._lock = Lock() + + def batch_update( + self, + table: str, + mutation: Dict[str, str] = None, + deletion: List[str] = None, + expected_version: Optional[int] = None, + ) -> StoreStatus: + mutation = mutation if mutation else {} + deletion = deletion if deletion else [] + with self._lock: + if expected_version is not None and expected_version != self._version: + return StoreStatus(False, self._version) + self._version += 1 + key_value_pairs_with_version = { + key: VersionedValue(value, self._version) + for key, value in mutation.items() + } + self._tables[table].update(key_value_pairs_with_version) + for deleted_key in deletion: + self._tables[table].pop(deleted_key, None) + return StoreStatus(True, self._version) + + def update( + self, + table: str, + key: str, + value: str, + expected_entry_version: Optional[int] = None, + expected_storage_version: Optional[int] = None, + insert_only: bool = False, + ) -> StoreStatus: + with self._lock: + if ( + expected_storage_version is not None + and expected_storage_version != self._version + ): + return StoreStatus(False, self._version) + if insert_only and key in self._tables[table]: + return StoreStatus(False, self._version) + _, version = self._tables[table].get(key, (None, -1)) + if expected_entry_version is not None and expected_entry_version != version: + return StoreStatus(False, self._version) + self._version += 1 + self._tables[table][key] = VersionedValue(value, self._version) + return StoreStatus(True, self._version) + + def get_all(self, table: str) -> Tuple[Dict[str, VersionedValue], int]: + with self._lock: + return (copy.deepcopy(self._tables[table]), self._version) + + def get(self, table: str, keys: List[str]) -> Tuple[Dict[str, VersionedValue], int]: + if not keys: + return self.get_all(table) + with self._lock: + result = {} + for key in keys: + if key in self._tables.get(table, 
{}): + result[key] = self._tables[table][key] + return StoreStatus(result, self._version) + + def get_version(self) -> int: + return self._version diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py new file mode 100644 index 0000000000000000000000000000000000000000..4fbe6e10f1db419ed2ee73949fde94129d55fee9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py @@ -0,0 +1,100 @@ +from collections import defaultdict +from typing import Dict, List + +from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics +from ray.autoscaler.v2.instance_manager.common import InstanceUtil +from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig +from ray.autoscaler.v2.schema import NodeType +from ray.core.generated.instance_manager_pb2 import Instance as IMInstance + + +class AutoscalerMetricsReporter: + def __init__(self, prom_metrics: AutoscalerPrometheusMetrics) -> None: + self._prom_metrics = prom_metrics + + def report_instances( + self, + instances: List[IMInstance], + node_type_configs: Dict[NodeType, NodeTypeConfig], + ): + """ + Record autoscaler metrics for: + - pending_nodes: Nodes that are launching/pending ray start + - active_nodes: Active nodes (nodes running ray) + - recently_failed_nodes: Nodes that are being terminated. + - stopped_nodes: Nodes that are terminated. + """ + # map of instance type to a dict of status to count. + status_count_by_type: Dict[NodeType : Dict[str, int]] = {} + # initialize the status count by type. 
+ for instance_type in node_type_configs.keys(): + status_count_by_type[instance_type] = { + "pending": 0, + "running": 0, + "terminating": 0, + "terminated": 0, + } + + for instance in instances: + if InstanceUtil.is_ray_pending(instance.status): + status_count_by_type[instance.instance_type]["pending"] += 1 + elif InstanceUtil.is_ray_running(instance.status): + status_count_by_type[instance.instance_type]["running"] += 1 + elif instance.status == IMInstance.TERMINATING: + status_count_by_type[instance.instance_type]["terminating"] += 1 + elif instance.status == IMInstance.TERMINATED: + status_count_by_type[instance.instance_type]["terminated"] += 1 + + for instance_type, status_count in status_count_by_type.items(): + self._prom_metrics.pending_nodes.labels( + SessionName=self._prom_metrics.session_name, NodeType=instance_type + ).set(status_count["pending"]) + + self._prom_metrics.active_nodes.labels( + SessionName=self._prom_metrics.session_name, NodeType=instance_type + ).set(status_count["running"]) + + self._prom_metrics.recently_failed_nodes.labels( + SessionName=self._prom_metrics.session_name, NodeType=instance_type + ).set(status_count["terminating"]) + + self._prom_metrics.stopped_nodes.inc(status_count["terminated"]) + + def report_resources( + self, + instances: List[IMInstance], + node_type_configs: Dict[NodeType, NodeTypeConfig], + ): + """ + Record autoscaler metrics for: + - pending_resources: Pending resources + - cluster_resources: Cluster resources (resources running on the cluster) + """ + # pending resources. 
+ pending_resources = defaultdict(float) + cluster_resources = defaultdict(float) + + def _add_resources(resource_map, node_type_configs, node_type, count): + node_resources = node_type_configs[node_type].resources + for resource_name, resource_value in node_resources.items(): + resource_map[resource_name] += resource_value * count + + for instance in instances: + if InstanceUtil.is_ray_pending(instance.status): + _add_resources( + pending_resources, node_type_configs, instance.instance_type, 1 + ) + elif InstanceUtil.is_ray_running(instance.status): + _add_resources( + cluster_resources, node_type_configs, instance.instance_type, 1 + ) + + for resource_name, resource_value in pending_resources.items(): + self._prom_metrics.pending_resources.labels( + SessionName=self._prom_metrics.session_name, resource=resource_name + ).set(resource_value) + + for resource_name, resource_value in cluster_resources.items(): + self._prom_metrics.cluster_resources.labels( + SessionName=self._prom_metrics.session_name, resource=resource_name + ).set(resource_value) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..591ecf8d2332999934e7cae197bbb0c9be76823a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py @@ -0,0 +1,302 @@ +"""Autoscaler monitoring loop daemon. + +See autoscaler._private/monitor.py for the legacy implementation. All the legacy flags +are supported here, but the new implementation uses the new autoscaler v2. 
+""" + +import argparse +import logging +import os +import sys +import time +from typing import Optional + +import ray +import ray._private.ray_constants as ray_constants +import ray._private.utils +from ray._private.event.event_logger import get_event_logger +from ray._private.ray_logging import setup_component_logger +from ray._private.usage.usage_lib import record_extra_usage_tag +from ray._private.worker import SCRIPT_MODE +from ray._raylet import GcsClient +from ray.autoscaler._private.constants import ( + AUTOSCALER_METRIC_PORT, + AUTOSCALER_UPDATE_INTERVAL_S, +) +from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics +from ray.autoscaler.v2.autoscaler import Autoscaler +from ray.autoscaler.v2.event_logger import AutoscalerEventLogger +from ray.autoscaler.v2.instance_manager.config import ( + FileConfigReader, + IConfigReader, + ReadOnlyProviderConfigReader, +) +from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter +from ray.core.generated.autoscaler_pb2 import AutoscalingState +from ray.core.generated.event_pb2 import Event as RayEvent +from ray.core.generated.usage_pb2 import TagKey + +try: + import prometheus_client +except ImportError: + prometheus_client = None + + +logger = logging.getLogger(__name__) + + +class AutoscalerMonitor: + """Autoscaling monitor. + + This process periodically collects stats from the GCS and triggers + autoscaler updates. + + TODO: + We should also handle autoscaler failures properly in the future. + Right now, we don't restart autoscaler if it fails (internal reconciliation + however, should not fail the autoscaler process). + With the Reconciler able to handle extra cloud instances, we could in fact + recover the autoscaler process from reconciliation. 
+ """ + + def __init__( + self, + address: str, + config_reader: IConfigReader, + log_dir: Optional[str] = None, + monitor_ip: Optional[str] = None, + ): + # Record v2 usage (we do this as early as possible to capture usage) + record_autoscaler_v2_usage(GcsClient(address)) + + self.gcs_address = address + worker = ray._private.worker.global_worker + # TODO: eventually plumb ClusterID through to here + self.gcs_client = GcsClient(address=self.gcs_address) + + if monitor_ip: + monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}" + self.gcs_client.internal_kv_put( + b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None + ) + self._session_name = self._get_session_name(self.gcs_client) + logger.info(f"session_name: {self._session_name}") + worker.set_mode(SCRIPT_MODE) + head_node_ip = self.gcs_address.split(":")[0] + + self.autoscaler = None + if log_dir: + try: + ray_event_logger = get_event_logger( + RayEvent.SourceType.AUTOSCALER, log_dir + ) + self.event_logger = AutoscalerEventLogger(ray_event_logger) + except Exception: + self.event_logger = None + else: + self.event_logger = None + + prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name) + self.metric_reporter = AutoscalerMetricsReporter(prom_metrics) + + if monitor_ip and prometheus_client: + # If monitor_ip wasn't passed in, then don't attempt to start the + # metric server to keep behavior identical to before metrics were + # introduced + try: + logger.info( + "Starting autoscaler metrics server on port {}".format( + AUTOSCALER_METRIC_PORT + ) + ) + kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {} + prometheus_client.start_http_server( + port=AUTOSCALER_METRIC_PORT, + registry=prom_metrics.registry, + **kwargs, + ) + except Exception: + logger.exception( + "An exception occurred while starting the metrics server." + ) + elif not prometheus_client: + logger.warning( + "`prometheus_client` not found, so metrics will not be exported." 
+ ) + + self.autoscaler = Autoscaler( + session_name=self._session_name, + config_reader=config_reader, + gcs_client=self.gcs_client, + event_logger=self.event_logger, + metrics_reporter=self.metric_reporter, + ) + + @staticmethod + def _get_session_name(gcs_client: GcsClient) -> Optional[str]: + """Obtain the session name from the GCS. + + If the GCS doesn't respond, session name is considered None. + In this case, the metrics reported from the monitor won't have + the correct session name. + """ + session_name = gcs_client.internal_kv_get( + b"session_name", + ray_constants.KV_NAMESPACE_SESSION, + timeout=10, + ) + + if session_name: + session_name = session_name.decode() + + return session_name + + @staticmethod + def _report_autoscaling_state( + gcs_client: GcsClient, autoscaling_state: AutoscalingState + ): + """Report the autoscaling state to the GCS.""" + try: + gcs_client.report_autoscaling_state(autoscaling_state.SerializeToString()) + except Exception: + logger.exception("Error reporting autoscaling state to GCS.") + + def _run(self): + """Run the monitor loop.""" + + while True: + autoscaling_state = self.autoscaler.update_autoscaling_state() + if autoscaling_state: + # report autoscaling state + self._report_autoscaling_state(self.gcs_client, autoscaling_state) + else: + logger.warning("No autoscaling state to report.") + + # Wait for a autoscaler update interval before processing the next + # round of messages. + time.sleep(AUTOSCALER_UPDATE_INTERVAL_S) + + def run(self): + try: + self._run() + except Exception: + logger.exception("Error in monitor loop") + raise + + +def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None: + """ + Record usage for autoscaler v2. 
+ """ + try: + record_extra_usage_tag(TagKey.AUTOSCALER_VERSION, "v2", gcs_client) + except Exception: + logger.exception("Error recording usage for autoscaler v2.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=("Parse GCS server for the monitor to connect to.") + ) + parser.add_argument( + "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." + ) + parser.add_argument( + "--autoscaling-config", + required=False, + type=str, + help="the path to the autoscaling config file", + ) + parser.add_argument( + "--logging-level", + required=False, + type=str, + default=ray_constants.LOGGER_LEVEL, + choices=ray_constants.LOGGER_LEVEL_CHOICES, + help=ray_constants.LOGGER_LEVEL_HELP, + ) + parser.add_argument( + "--logging-format", + required=False, + type=str, + default=ray_constants.LOGGER_FORMAT, + help=ray_constants.LOGGER_FORMAT_HELP, + ) + parser.add_argument( + "--logging-filename", + required=False, + type=str, + default=ray_constants.MONITOR_LOG_FILE_NAME, + help="Specify the name of log file, " + "log to stdout if set empty, default is " + f'"{ray_constants.MONITOR_LOG_FILE_NAME}"', + ) + parser.add_argument( + "--logs-dir", + required=True, + type=str, + help="Specify the path of the temporary directory used by Ray processes.", + ) + parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.", + ) + parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.", + ) + parser.add_argument( + "--monitor-ip", + required=False, + type=str, + default=None, + help="The IP address of the machine hosting the monitor process.", + ) + + args = 
parser.parse_args() + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.logs_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count, + ) + + logger.info( + f"Starting autoscaler v2 monitor using ray installation: {ray.__file__}" + ) + logger.info(f"Ray version: {ray.__version__}") + logger.info(f"Ray commit: {ray.__commit__}") + logger.info(f"AutoscalerMonitor started with command: {sys.argv}") + + gcs_address = args.gcs_address + if gcs_address is None: + raise ValueError("--gcs-address must be set!") + + if not args.autoscaling_config: + logger.info("No autoscaling config provided: use read only node provider.") + config_reader = ReadOnlyProviderConfigReader(gcs_address) + else: + autoscaling_config = os.path.expanduser(args.autoscaling_config) + config_reader = FileConfigReader( + config_file=autoscaling_config, skip_content_hash=True + ) + + monitor = AutoscalerMonitor( + gcs_address, + config_reader, + log_dir=args.logs_dir, + monitor_ip=args.monitor_ip, + ) + + monitor.run() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..dac774c9446dad6af87ac94006b07104382ebb96 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py @@ -0,0 +1,1642 @@ +import copy +import logging +import time +import uuid +from abc import ABC, abstractmethod +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum +from typing import Dict, List, Optional, Tuple + +from ray._private.protobuf_compat import message_to_dict +from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES +from ray.autoscaler._private.resource_demand_scheduler import ( + UtilizationScore, + _fits, + _inplace_subtract, +) +from 
ray.autoscaler.v2.event_logger import AutoscalerEventLogger +from ray.autoscaler.v2.instance_manager.common import InstanceUtil +from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig +from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType +from ray.autoscaler.v2.utils import ProtobufUtil, ResourceRequestUtil +from ray.core.generated.autoscaler_pb2 import ( + ClusterResourceConstraint, + GangResourceRequest, + ResourceRequest, + ResourceRequestByCount, +) +from ray.core.generated.instance_manager_pb2 import ( + Instance, + LaunchRequest, + NodeKind, + TerminationRequest, +) + +# ============= Resource Scheduling Service API ======================= +# +# ResourceSchedulerService is a service that schedules resource bundles +# to nodes. It's used by the autoscaler to schedule resource bundles +# to determine the desired cluster size to satisfy the current resource +# demands. +# +logger = logging.getLogger(__name__) + + +@dataclass +class SchedulingRequest: + # If outdated node check through launch config is disabled. + disable_launch_config_check: bool + # Available node type configs + node_type_configs: Dict[NodeType, NodeTypeConfig] = field(default_factory=dict) + # Max number of worker nodes. + max_num_nodes: Optional[int] = None + # Idle timeout in seconds. + idle_timeout_s: Optional[float] = None + # TODO: This prob could be refactored into the ClusterStatus data class later. + # The current ray resource requests. + resource_requests: List[ResourceRequestByCount] = field(default_factory=list) + # The Gang resource requests. + gang_resource_requests: List[GangResourceRequest] = field(default_factory=list) + # cluster resource constraints. + cluster_resource_constraints: List[ClusterResourceConstraint] = field( + default_factory=list + ) + # The current instances. + current_instances: List[AutoscalerInstance] = field(default_factory=list) + + +@dataclass +class SchedulingReply: + # Instances to launch. 
+ to_launch: List[LaunchRequest] = field(default_factory=list) + # To terminate. + to_terminate: List[TerminationRequest] = field(default_factory=list) + # The infeasible resource bundles. + infeasible_resource_requests: List[ResourceRequest] = field(default_factory=list) + # The infeasible gang resource bundles. + infeasible_gang_resource_requests: List[GangResourceRequest] = field( + default_factory=list + ) + # The infeasible cluster resource constraints. + infeasible_cluster_resource_constraints: List[ClusterResourceConstraint] = field( + default_factory=list + ) + + +class IResourceScheduler(ABC): + """ + Interface for a resource scheduler. + + Implements the `instance_manager.proto ResourceSchedulerService` interface. + """ + + @abstractmethod + def schedule(self, request: SchedulingRequest) -> SchedulingReply: + """ + Given the resource requests and the current cluster state, calculate the + target cluster shape by trying to schedule the resource requests on the + nodes. + """ + pass + + +class SchedulingNodeStatus(Enum): + """ + The status of a scheduling node (`SchedulingNode`) + """ + + # The node is added by the ResourceDemandScheduler. + TO_LAUNCH = "TO_LAUNCH" + # The node is pending, i.e. there's already an autoscaler instance being launched + # The node is schedulable. It could be running ray or pending to run ray. Either + # Way, it should be able to accept new resource requests/resource constraints. + SCHEDULABLE = "SCHEDULABLE" + # The node is to be terminated by the ResourceDemandScheduler + TO_TERMINATE = "TO_TERMINATE" + + +class ResourceRequestSource(Enum): + """ + The source of the resource request. + """ + + # The resource request is from demand, e.g. ray tasks/actors, + # placement groups, etc. + PENDING_DEMAND = "PENDING_DEMAND" + # The resource request is from the cluster resource constraints, i.e. + # from ray.autoscaler.sdk.request_resources(). 
+ CLUSTER_RESOURCE_CONSTRAINT = "CLUSTER_RESOURCE_CONSTRAINT" + + +@dataclass +class SchedulingNode: + """ + A abstraction of a node that can be scheduled on by the resource scheduler. + + A scheduling node is expected to be used as: + + node = SchedulingNode.new(instance, node_configs) + remaining, score = node.try_schedule(requests) + + .... do something with the score .... + + NOTE: + One could also extend the scheduling behavior by overriding `try_schedule` + """ + + # Node type name. + node_type: NodeType + # Status + status: SchedulingNodeStatus + # Resource requests scheduled on this nodes for different sources. + sched_requests: Dict[ResourceRequestSource, List[ResourceRequest]] = field( + default_factory=lambda: defaultdict(list) + ) + # Available resources for different sources of requests. + available_resources_for_sched: Dict[ + ResourceRequestSource, Dict[str, float] + ] = field(default_factory=dict) + # The node's current resource capacity. + total_resources: Dict[str, float] = field(default_factory=dict) + # Node's labels, including static or dynamic labels. + labels: Dict[str, str] = field(default_factory=dict) + # Observability descriptive message for why the node was launched in the + # first place. + launch_reason: Optional[str] = None + # Termination request, none when the node is not being terminated. + termination_request: Optional[TerminationRequest] = None + # The instance id of the IM(Instance Manager) instance. None if the node + # is not yet in IM. + im_instance_id: Optional[str] = None + # The ray node id of the ray node. None if the node is not included in + # ray cluster's GCS report yet (not running ray yet). + ray_node_id: Optional[str] = None + # Idle duration in ms. Default not idle. + idle_duration_ms: int = 0 + # Launch config hash. + launch_config_hash: Optional[str] = None + # node kind. 
    # Kind of the node (head/worker); class-level default, overridden per
    # instance in __init__.
    node_kind: NodeKind = NodeKind.WORKER

    def __init__(
        self,
        node_type: NodeType,
        total_resources: Dict[str, float],
        available_resources: Dict[str, float],
        labels: Dict[str, str],
        status: SchedulingNodeStatus,
        im_instance_id: str = "",
        ray_node_id: str = "",
        idle_duration_ms: int = 0,
        launch_config_hash: str = "",
        node_kind: NodeKind = NodeKind.WORKER,
        termination_request: Optional[TerminationRequest] = None,
    ):
        """
        Initialize a scheduling node.

        Args:
            node_type: The node type name this node belongs to.
            total_resources: Total resource capacity of the node.
            available_resources: Currently-available resources; used to seed
                the per-source scheduling views below.
            labels: The node's (dynamic) labels.
            status: The scheduling status of the node.
            im_instance_id: Instance-manager instance id, if known.
            ray_node_id: The ray node id, if ray is running on the node.
            idle_duration_ms: How long the node has been idle.
            launch_config_hash: Hash of the launch config used for
                outdated-node detection.
            node_kind: Head or worker node kind.
            termination_request: Pre-set termination request, if the node is
                already marked for termination.
        """
        self.node_type = node_type
        self.total_resources = total_resources
        # Each resource-request source gets its own independent view of
        # available resources:
        #   - PENDING_DEMAND starts from the node's actual availability;
        #   - CLUSTER_RESOURCE_CONSTRAINT starts from total capacity, since
        #     constraints ask "could the cluster shape satisfy this", not
        #     "is it free right now".
        self.available_resources_for_sched = {
            ResourceRequestSource.PENDING_DEMAND: dict(available_resources),
            ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: dict(total_resources),
        }
        # Requests scheduled onto this node, bucketed by source.
        self.sched_requests = {
            ResourceRequestSource.PENDING_DEMAND: [],
            ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: [],
        }
        self.labels = labels
        self.status = status
        self.im_instance_id = im_instance_id
        self.ray_node_id = ray_node_id
        self.idle_duration_ms = idle_duration_ms
        self.launch_config_hash = launch_config_hash
        self.node_kind = node_kind
        self.termination_request = termination_request

    def get_available_resources(
        self, resource_request_source: ResourceRequestSource
    ) -> Dict[str, float]:
        """Get the available resources for the given resource request source."""
        return self.available_resources_for_sched[resource_request_source]

    def get_sched_requests(
        self, resource_request_source: ResourceRequestSource
    ) -> List[ResourceRequest]:
        """Get the resource requests for the given resource request source."""
        return self.sched_requests[resource_request_source]

    def add_sched_request(
        self,
        request: ResourceRequest,
        resource_request_source: ResourceRequestSource,
    ) -> None:
        """
        Add the resource requests to the node.

        Args:
            request: The resource request to be added.
            resource_request_source: The source of the resource request.
        """
        self.sched_requests[resource_request_source].append(request)

    @staticmethod
    def new(
        instance: AutoscalerInstance,
        node_type_configs: Dict[NodeType, NodeTypeConfig],
        disable_launch_config_check: bool,
    ) -> Optional["SchedulingNode"]:
        """
        Create a new scheduling node from an autoscaler instance.

        It creates:
            - None if the instance is not schedulable by IM.
            - A schedulable node if the instance is running ray or pending to run ray,
              so it should be considered in the scheduling process.

        Args:
            instance: The instance.
            node_type_configs: The node type configs.
            disable_launch_config_check: If outdated node check through launch config is
                disabled.

        """
        if not SchedulingNode.is_schedulable(instance):
            return None

        if instance.im_instance.status == Instance.RAY_RUNNING:
            assert instance.ray_node is not None, (
                "ray node should not be None "
                f"when the instance is running ray: instance={instance}"
            )
            # A running ray node
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources=dict(instance.ray_node.total_resources),
                # Available resources for scheduling requests of different
                # sources.
                available_resources=dict(instance.ray_node.available_resources),
                # Use ray node's dynamic labels.
                labels=dict(instance.ray_node.dynamic_labels),
                status=SchedulingNodeStatus.SCHEDULABLE,
                im_instance_id=instance.im_instance.instance_id,
                ray_node_id=instance.im_instance.node_id,
                idle_duration_ms=instance.ray_node.idle_duration_ms,
                launch_config_hash=instance.im_instance.launch_config_hash,
                node_kind=instance.im_instance.node_kind,
            )

        # This is an instance pending to run ray. Initialize a schedulable node
        # from the node type config.
        node_config = node_type_configs.get(instance.im_instance.instance_type, None)
        if node_config is None:
            if disable_launch_config_check:
                # We are not terminating outdated nodes.
                logger.info(
                    f"Node config for {instance.im_instance.instance_type} is missing, "
                    "but we are not terminating the outdated node because "
                    "`disable_launch_config_check` is True in "
                    "the autoscaler's provider config."
                )
                return None

            # Configs might have been updated, and no more
            # node_type_configs for this node type. We should terminate it.
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources={},
                available_resources={},
                labels={},
                status=SchedulingNodeStatus.TO_TERMINATE,
                im_instance_id=instance.im_instance.instance_id,
                termination_request=TerminationRequest(
                    id=str(uuid.uuid4()),
                    instance_id=instance.im_instance.instance_id,
                    cause=TerminationRequest.Cause.OUTDATED,
                    instance_type=instance.im_instance.instance_type,
                ),
                node_kind=NodeKind.WORKER,
            )

        return SchedulingNode.from_node_config(
            node_config,
            SchedulingNodeStatus.SCHEDULABLE,
            node_kind=instance.im_instance.node_kind,
            im_instance_id=instance.im_instance.instance_id,
        )

    @staticmethod
    def is_schedulable(instance: AutoscalerInstance) -> bool:
        """
        Check if the instance is schedulable by IM.

        Args:
            instance: The instance.

        Returns:
            True if the instance is schedulable by IM.
        """
        if instance.im_instance is None:
            # We will skip any instances that are not yet in IM which
            # could be
            #   1. an out-of-band ray node
            #   2. a cloud instance running ray not yet discovered
            #      by the IM's Reconciler
            #   3. a cloud instance already terminated but ray state
            #      still lagging behind.
            #
            # In all of these cases, the instance is not schedulable or
            # shouldn't be managed by IM, so we don't consider them.
            return False

        # These are the statuses where there's a running ray node or
        # could eventually run ray.
        if InstanceUtil.is_ray_running_reachable(instance.im_instance.status):
            return True

        return False

    @staticmethod
    def from_node_config(
        node_config: NodeTypeConfig,
        status: SchedulingNodeStatus,
        node_kind: NodeKind,
        im_instance_id: Optional[str] = None,
    ) -> "SchedulingNode":
        """
        Create a scheduling node from a node config.

        Args:
            node_config: The node config.
            status: The status of the node.
            node_kind: The node kind.
            im_instance_id: The instance id of the im instance.
        """
        return SchedulingNode(
            node_type=node_config.name,
            total_resources=dict(node_config.resources),
            available_resources=dict(node_config.resources),
            labels=dict(node_config.labels),
            status=status,
            im_instance_id=im_instance_id,
            node_kind=node_kind,
        )

    def __post_init__(self):
        # NOTE(review): __post_init__ is a dataclasses hook; this class
        # defines an explicit __init__ here, so unless the class is still
        # decorated with @dataclass (declaration is above this view), this
        # method is never invoked — confirm and remove if dead.
        assert self.node_type, "node_type should be set"

    def try_schedule(
        self,
        requests: List[ResourceRequest],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[List[ResourceRequest], UtilizationScore]:
        """
        Try to schedule the resource requests on this node.

        This modifies the node's available resources if the requests are schedulable.
        The requests are scheduled one by one in the sorted order, and no
        backtracking is done.

        Args:
            requests: The resource requests to be scheduled.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource constraints.

        Returns:
            A tuple of:
                - list of remaining requests that cannot be scheduled on this node.
                - the utilization score for this node with respect to the current
                  resource requests being scheduled.
        """
        # Track the resource requests that cannot be scheduled on this node.
        unschedulable_requests = []

        # Sort the requests and try schedule them one by one.
        for r in requests:
            if not self._try_schedule_one(r, resource_request_source):
                unschedulable_requests.append(r)

        score = self._compute_score(resource_request_source)

        return unschedulable_requests, score

    def _compute_score(
        self, resource_request_source: ResourceRequestSource
    ) -> UtilizationScore:
        """
        Compute the utilization score for this node with respect to the current resource
        request being scheduled.

        A "higher" score means that this node is more suitable for scheduling the
        current scheduled resource requests.

        The score is a tuple of 4 values:
            1. Whether this node is a GPU node and the current resource request has
               GPU requirements:
                0: if this node is a GPU node and the current resource request
                   placed onto the node has no GPU requirements.
                1: if this node is not a GPU node or the current resource request
                   placed onto the node has GPU requirements.
            2. The number of resource types being scheduled.
            3. The minimum utilization rate across all resource types.
            4. The average utilization rate across all resource types.

        NOTE:
            This function is adapted from _resource_based_utilization_scorer from
            autoscaler v1.

        TODO(rickyx,jjyao): We should also consider node labels for
        scoring. For example, if a node has a label that matches the affinity
        label of the resource request, we should give it a higher score.

        TODO(rickyx): add pluggable scoring functions here.

        Returns:
            A utilization score for this node.
        """

        sched_requests = self.get_sched_requests(resource_request_source)
        available_resources = self.get_available_resources(resource_request_source)

        # Compute the number of resource types being scheduled.
        num_matching_resource_types = 0
        sched_resource_types = set()
        for req in sched_requests:
            for resource_name, v in req.resources_bundle.items():
                if v > 0:
                    sched_resource_types.add(resource_name)

        for sched_resource_type in sched_resource_types:
            if sched_resource_type in self.total_resources:
                num_matching_resource_types += 1

        # Compute the utilization rate for each resource type
        # NOTE: each entry is the capacity-weighted *cubed* utilization
        # (v * util**3), not the raw utilization rate — the cube biases the
        # min/avg in the returned score towards highly-utilized nodes.
        util_by_resources = []
        for k, v in self.total_resources.items():
            if v == 0:
                # Skip any zero values.
                continue
            if k in available_resources:
                util = (v - available_resources.get(k, 0)) / v
                assert util >= 0 and util <= 1, f"Invalid utilization: {util}"
                util_by_resources.append(v * (util**3))

        # Prefer not to launch a GPU node if there aren't any GPU requirements in the
        # resource bundle.
        gpu_ok = True
        if AUTOSCALER_CONSERVE_GPU_NODES:
            # TODO: we should also generalize this optimization for accelerators.
            # https://github.com/ray-project/ray/issues/43079
            is_gpu_node = self.total_resources.get("GPU", 0) > 0
            any_gpu_requests = any("GPU" in r.resources_bundle for r in sched_requests)
            if is_gpu_node and not any_gpu_requests:
                gpu_ok = False

        # Prioritize avoiding gpu nodes for non-gpu workloads first,
        # then prioritize matching multiple resource types,
        # then prioritize using all resources,
        # then prioritize overall balance of multiple resources.
        return (
            gpu_ok,
            num_matching_resource_types,
            min(util_by_resources) if util_by_resources else 0,
            float(sum(util_by_resources)) / len(util_by_resources)
            if util_by_resources
            else 0,
        )

    def _try_schedule_one(
        self, request: ResourceRequest, resource_request_source: ResourceRequestSource
    ) -> bool:
        """
        Try to schedule one resource request on this node. The request could be from
        various sources, specified by `resource_request_source`.

        Args:
            request: The resource request to be scheduled.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource constraints.

        Returns:
            True if the resource request is scheduled on this node.
        """

        # Check if there's placement constraints that are not satisfied.
        for constraint in request.placement_constraints:
            if constraint.HasField("anti_affinity"):
                anti_affinity = constraint.anti_affinity
                if (
                    anti_affinity.label_name in self.labels
                    and anti_affinity.label_value
                    == self.labels[anti_affinity.label_name]
                ):
                    # The node already has a label that matches the anti-affinity
                    return False

            # We don't need to check for affinity constraints here since
            # we have already combined resource requests with the affinity
            # constraints into the same request at `combine_requests_with_affinity`.
            pass

        available_resources_dict = self.get_available_resources(resource_request_source)

        # Check if there's enough resources to schedule the request.
        if not _fits(available_resources_dict, dict(request.resources_bundle)):
            return False

        # Schedule the request, update resources
        _inplace_subtract(available_resources_dict, dict(request.resources_bundle))

        # Add the request to the node.
        self.add_sched_request(request, resource_request_source)

        # Update the dynamic labels if there's any
        for constraint in request.placement_constraints:
            # We don't need to check for affinity constraints here since
            # we have already combined resource requests with the affinity
            # constraints into the same request at `combine_requests_with_affinity`.
            # We don't need node labels for enforcing affinity.
            if constraint.HasField("anti_affinity"):
                anti_affinity = constraint.anti_affinity
                self._add_label(anti_affinity.label_name, anti_affinity.label_value)

        return True

    def _add_label(self, label_name: str, label_value: str):
        """
        Add a label to the node.
        This assumes a label key can only have one value.
        """
        assert (
            self.labels.get(label_name) is None
            or self.labels[label_name] == label_value
        ), (
            f"Label {label_name} already exists with value "
            f"{self.labels[label_name]}, cannot set to "
            f"{label_value}"
        )
        self.labels[label_name] = label_value

    def __repr__(self) -> str:
        return (
            "SchedulingNode(node_type={node_type}, "
            "node_kind={node_kind}, "
            "instance_id={instance_id},"
            "ray_node_id={ray_node_id},"
            "idle_duration_ms={idle_duration_ms},"
            "termination_request={termination_request},"
            "status={status}, "
            "total_resources={total_resources}, "
            "available_resources_for_demand={available_resources_for_demand}, "
            "available_resources_for_cluster_resource_constraints="
            "{available_resources_for_cluster_resource_constraints},"
            "labels={labels}, launch_reason={launch_reason}), "
            "sched_requests_for_demand={sched_requests_for_demand}), "
            "sched_requests_for_cluster_resource_constraints="
            "{sched_requests_for_cluster_resources_constraint})"
        ).format(
            node_type=self.node_type,
            node_kind=self.node_kind,
            instance_id=self.im_instance_id,
            ray_node_id=self.ray_node_id,
            idle_duration_ms=self.idle_duration_ms,
            termination_request=str(message_to_dict(self.termination_request))
            if self.termination_request
            else None,
            status=self.status,
            total_resources=self.total_resources,
            available_resources_for_demand=self.available_resources_for_sched[
                ResourceRequestSource.PENDING_DEMAND
            ],
            available_resources_for_cluster_resource_constraints=self.available_resources_for_sched[  # noqa
                ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
            ],
            labels=self.labels,
            # NOTE(review): `launch_reason` is not assigned in the __init__
            # visible here; presumably it is a class-level attribute declared
            # above this view — confirm, else __repr__ raises AttributeError.
            launch_reason=self.launch_reason,
            sched_requests_for_demand="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[ResourceRequestSource.PENDING_DEMAND]
            ),
            sched_requests_for_cluster_resources_constraint="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[
                    ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
                ]
            ),
        )


class ResourceDemandScheduler(IResourceScheduler):
    """
    A resource demand scheduler that schedules resource requests based on the
    following rules:
        1. Enforce the minimal count of nodes for each worker node type.
        2. Enforce the cluster resource constraints.
        3. Schedule the gang resource requests.
        4. Schedule the tasks/actor resource requests
    """

    def __init__(self, event_logger: Optional[AutoscalerEventLogger] = None):
        # Optional event logger used to emit scheduling updates at the end of
        # each schedule() call; failures to log never fail scheduling.
        self._event_logger = event_logger

    @dataclass
    class ScheduleContext:
        """
        Encapsulates the context for processing one scheduling request.

        This exposes functions to read and write the scheduling nodes, to prevent
        accidental modification of the internal state.
        """

        # The node type configs for this scheduling request.
        _node_type_configs: Dict[NodeType, NodeTypeConfig]
        # If outdated node check through launch config is disabled.
        _disable_launch_config_check: bool
        # The max number of nodes for the entire cluster.
        _max_num_nodes: Optional[int] = None
        # The idle timeout in seconds.
        _idle_timeout_s: Optional[float] = None
        # The current schedulable nodes (including pending nodes and pending requests).
        _nodes: List[SchedulingNode] = field(default_factory=list)
        # The number of nodes by node types available for launching based on the max
        # number of workers in the config. This takes into account any pending/running
        # nodes.
        _node_type_available: Dict[NodeType, int] = field(default_factory=dict)

        def __init__(
            self,
            nodes: List[SchedulingNode],
            node_type_configs: Dict[NodeType, NodeTypeConfig],
            disable_launch_config_check: bool,
            max_num_nodes: Optional[int] = None,
            idle_timeout_s: Optional[float] = None,
        ):
            """
            Initialize the context; derives the per-type launch availability
            from the given nodes and configs.
            """
            self._nodes = nodes
            self._node_type_configs = node_type_configs
            self._node_type_available = self._compute_available_node_types(
                nodes, node_type_configs
            )
            self._max_num_nodes = max_num_nodes
            self._idle_timeout_s = idle_timeout_s
            self._disable_launch_config_check = disable_launch_config_check

        @classmethod
        def from_schedule_request(
            cls, req: SchedulingRequest
        ) -> "ResourceDemandScheduler.ScheduleContext":
            """
            Create a schedule context from a schedule request.
            It will populate the context with the existing nodes and the available node
            types from the config.

            Args:
                req: The scheduling request. The caller should make sure the
                    request is valid.
            """

            nodes = []
            node_type_configs = req.node_type_configs

            # Initialize the scheduling nodes.
            for instance in req.current_instances:
                node = SchedulingNode.new(
                    instance, node_type_configs, req.disable_launch_config_check
                )
                if node:
                    nodes.append(node)

            return cls(
                nodes=nodes,
                node_type_configs=node_type_configs,
                disable_launch_config_check=req.disable_launch_config_check,
                max_num_nodes=req.max_num_nodes,
                idle_timeout_s=req.idle_timeout_s,
            )

        @staticmethod
        def _compute_available_node_types(
            nodes: List[SchedulingNode],
            node_type_configs: Dict[NodeType, NodeTypeConfig],
        ) -> Dict[NodeType, int]:
            """
            Compute the number of nodes by node types available for launching based on
            the max number of workers in the config.

            Args:
                nodes: The current existing nodes.
                node_type_configs: The node type configs.

            Returns:
                A dict of node types and the number of nodes available for launching.
            """
            node_type_available: Dict[NodeType, int] = defaultdict(int)
            node_type_existing: Dict[NodeType, int] = defaultdict(int)
            for node in nodes:
                node_type_existing[node.node_type] += 1

            for (
                node_type,
                node_type_config,
            ) in node_type_configs.items():
                node_type_available[
                    node_type
                ] = node_type_config.max_worker_nodes - node_type_existing.get(
                    node_type, 0
                )

            return node_type_available

        def get_nodes(self) -> List[SchedulingNode]:
            """
            Get the current nodes with filter.

            Returns:
                A list of nodes.
            """
            # Deep-copied so callers can mutate freely; changes are only
            # committed back via update().
            nodes = copy.deepcopy(self._nodes)
            return nodes

        def get_node_type_available(self) -> Dict[NodeType, int]:
            # Deep copy for the same isolation reason as get_nodes().
            return copy.deepcopy(self._node_type_available)

        def get_cluster_shape(self) -> Dict[NodeType, int]:
            # Count of nodes per node type, excluding nodes already marked
            # for termination.
            cluster_shape = defaultdict(int)
            for node in self._nodes:
                if node.status == SchedulingNodeStatus.TO_TERMINATE:
                    # Skip the nodes that are to be terminated.
                    continue

                cluster_shape[node.node_type] += 1
            return cluster_shape

        def get_idle_timeout_s(self) -> Optional[float]:
            return self._idle_timeout_s

        def update(self, new_nodes: List[SchedulingNode]) -> None:
            """
            Update the context with the new nodes.
            """
            self._nodes = new_nodes

            # Update the available node types.
            self._node_type_available = self._compute_available_node_types(
                self._nodes, self._node_type_configs
            )

        def get_max_num_nodes(self) -> Optional[int]:
            """
            Get the max number of nodes for the entire cluster.
            """
            return self._max_num_nodes

        def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
            return self._node_type_configs

        def __str__(self) -> str:
            return "ScheduleContext({} nodes, node_type_available={})".format(
                len(self._nodes), dict(self._node_type_available)
            )

        def get_launch_requests(self) -> List[LaunchRequest]:
            """
            Get the launch requests for the nodes that are to be launched.
            """
            launch_by_type = defaultdict(int)
            for node in self._nodes:
                if node.status == SchedulingNodeStatus.TO_LAUNCH:
                    launch_by_type[node.node_type] += 1

            launch_requests = []
            for instance_type, count in launch_by_type.items():
                launch_requests.append(
                    LaunchRequest(
                        instance_type=instance_type,
                        count=count,
                        id=str(uuid.uuid4()),
                        # NOTE(review): time.time_ns() // 1000 yields
                        # MICROseconds, but the field is named request_ts_ms
                        # (// 1_000_000 would be milliseconds) — confirm the
                        # intended unit against the proto definition.
                        request_ts_ms=time.time_ns() // 1000,
                    )
                )
            return launch_requests

        def get_terminate_requests(
            self,
        ) -> List[TerminationRequest]:
            """
            Get the terminate requests for the nodes that are to be terminated.
            """
            return [
                node.termination_request
                for node in self._nodes
                if node.termination_request is not None
            ]

    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """
        Process one scheduling request: enforce node-count invariants, then
        schedule constraints, gang requests and plain demands, and return the
        resulting launch/terminate decisions plus any infeasible requests.
        """
        logger.debug(
            "Scheduling for request: resource_request={}, gang_resource_request={}, "
            "cluster_constraint={}".format(
                ResourceRequestUtil.to_dict_list(request.resource_requests),
                ProtobufUtil.to_dict_list(request.gang_resource_requests),
                ProtobufUtil.to_dict_list(request.cluster_resource_constraints),
            )
        )

        ctx = ResourceDemandScheduler.ScheduleContext.from_schedule_request(request)

        # Terminate outdated nodes.
        ResourceDemandScheduler._terminate_outdated_nodes(ctx)

        # Enforce the minimal count of nodes for each worker node type.
        ResourceDemandScheduler._enforce_min_workers_per_type(ctx)

        # Enforce the max worker nodes count.
        ResourceDemandScheduler._enforce_max_workers_per_type(ctx)

        # Enforce the max worker nodes count globally.
        ResourceDemandScheduler._enforce_max_workers_global(ctx)

        # Enforce the cluster resource constraints.
        infeasible_constraints = ResourceDemandScheduler._enforce_resource_constraints(
            ctx, request.cluster_resource_constraints
        )

        # Schedule the gang resource requests.
        infeasible_gang_requests = (
            ResourceDemandScheduler._sched_gang_resource_requests(
                ctx, request.gang_resource_requests
            )
        )

        # Schedule the tasks/actor resource requests
        infeasible_requests = ResourceDemandScheduler._sched_resource_requests(
            ctx,
            ResourceRequestUtil.ungroup_by_count(request.resource_requests),
        )

        # Shutdown any idle nodes that are not needed (e.g. not required by
        # resource constraints, min_worker count, etc.)
        ResourceDemandScheduler._enforce_idle_termination(ctx)

        # Compute the number of nodes to launch.
        reply = SchedulingReply(
            infeasible_resource_requests=infeasible_requests,
            infeasible_gang_resource_requests=infeasible_gang_requests,
            infeasible_cluster_resource_constraints=infeasible_constraints,
            to_launch=ctx.get_launch_requests(),
            to_terminate=ctx.get_terminate_requests(),
        )

        # Event logging is best-effort: a logging failure must never fail
        # the scheduling round.
        if self._event_logger is not None:
            try:
                self._event_logger.log_cluster_scheduling_update(
                    launch_requests=reply.to_launch,
                    terminate_requests=reply.to_terminate,
                    infeasible_requests=infeasible_requests,
                    infeasible_gang_requests=infeasible_gang_requests,
                    infeasible_cluster_resource_constraints=infeasible_constraints,
                    cluster_shape=ctx.get_cluster_shape(),
                    node_type_configs=ctx.get_node_type_configs(),
                )
            except Exception:
                logger.exception("Failed to emit event logs.")

        return reply

    @staticmethod
    def _enforce_max_workers_per_type(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for each node type.
        """

        # Get all the nodes by type
        all_nodes = ctx.get_nodes()

        non_terminating_nodes_by_type = defaultdict(list)
        terminating_nodes = []
        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes_by_type[node.node_type].append(node)

        # Step 1. Enforce the max number of workers for each node type.
        for node_type in non_terminating_nodes_by_type.keys():
            non_terminate_nodes_of_type = non_terminating_nodes_by_type[node_type]
            node_config = ctx.get_node_type_configs()[node_type]
            num_max_nodes_per_type = node_config.max_worker_nodes
            num_extra_nodes = len(non_terminate_nodes_of_type) - num_max_nodes_per_type

            if num_extra_nodes <= 0:
                # No extra nodes for this type, continue.
                continue

            # Terminate the nodes
            (
                to_terminate,
                remained_nodes,
            ) = ResourceDemandScheduler._select_nodes_to_terminate(
                non_terminate_nodes_of_type,
                num_extra_nodes,
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
                max_num_nodes_per_type=num_max_nodes_per_type,
            )

            non_terminating_nodes_by_type[node_type] = remained_nodes
            terminating_nodes.extend(to_terminate)

        non_terminating_nodes = []
        for nodes in non_terminating_nodes_by_type.values():
            non_terminating_nodes.extend(nodes)

        # Update the context
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes per type."

        ctx.update(terminating_nodes + non_terminating_nodes)

        if terminating_nodes:
            logger.debug(
                f"Terminating {len(terminating_nodes)} "
                "nodes for per node type max num node's constraints."
            )

    @staticmethod
    def _enforce_max_workers_global(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for the entire cluster.
        """
        all_nodes = ctx.get_nodes()

        terminating_nodes = []
        non_terminating_nodes = []

        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes.append(node)

        num_max_nodes = ctx.get_max_num_nodes()

        # No global cap configured (None) means nothing to terminate.
        num_to_terminate = (
            max(len(non_terminating_nodes) - num_max_nodes, 0) if num_max_nodes else 0
        )

        if num_to_terminate <= 0:
            # No extra nodes needed to terminate.
            return

        # Terminate the nodes
        (
            to_terminate_nodes,
            non_terminating_nodes,
        ) = ResourceDemandScheduler._select_nodes_to_terminate(
            non_terminating_nodes,
            num_to_terminate,
            TerminationRequest.Cause.MAX_NUM_NODES,
            max_num_nodes=num_max_nodes,
        )

        assert len(to_terminate_nodes) == num_to_terminate, (
            "Terminating {} nodes, failed to terminate {} nodes to "
            "satisfy max_num_nodes={}".format(
                len(to_terminate_nodes),
                num_to_terminate - len(to_terminate_nodes),
                num_max_nodes,
            )
        )

        # Update the context
        terminating_nodes.extend(to_terminate_nodes)
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes."

        all_nodes = terminating_nodes + non_terminating_nodes
        ctx.update(all_nodes)

    @staticmethod
    def _select_nodes_to_terminate(
        nodes: List[SchedulingNode],
        num_to_terminate: int,
        cause: TerminationRequest.Cause,
        max_num_nodes: Optional[int] = None,
        max_num_nodes_per_type: Optional[int] = None,
    ) -> Tuple[List[SchedulingNode], List[SchedulingNode]]:
        """
        Select 'num_to_terminate' of nodes to be terminated
        from the 'nodes' list. It should never select a head node.

        NOTE: mutates the passed-in `nodes` list in place (sorts it and pops
        the head node); callers pass lists they own.

        Args:
            nodes: The nodes to be terminated.
            num_to_terminate: The number of nodes to be terminated.
            cause: The cause of the termination. Should be one of
                TerminationRequest.Cause.MAX_NUM_NODES or
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

            max_num_nodes: The max number of nodes for the entire cluster only
                used when the cause is TerminationRequest.Cause.MAX_NUM_NODES.
            max_num_nodes_per_type: The max number of nodes for each node type.
                Only used when the cause is
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

        Returns:
            A tuple of:
                - The terminated nodes.
                - The remained nodes.
        """

        # Sort the nodes for termination.
        nodes.sort(key=ResourceDemandScheduler._sort_nodes_for_termination)

        # Remove the head node from the list.
        head_node = None
        for i, node in enumerate(nodes):
            if node.node_kind == NodeKind.HEAD:
                # Remove the head node from the list.
                head_node = nodes.pop(i)
                break

        terminated_nodes, remained_nodes = (
            nodes[:num_to_terminate],
            # The head could be None if there's no head node being reported yet
            # from the ray cluster.
            nodes[num_to_terminate:] + ([head_node] if head_node else []),
        )

        assert cause in [
            TerminationRequest.Cause.MAX_NUM_NODES,
            TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
        ], "Other termination causes don't have to select nodes for termination."

        for node in terminated_nodes:
            node.status = SchedulingNodeStatus.TO_TERMINATE
            node.termination_request = TerminationRequest(
                id=str(uuid.uuid4()),
                instance_id=node.im_instance_id,
                ray_node_id=node.ray_node_id,
                cause=cause,
                instance_type=node.node_type,
                details=(
                    f"Terminating node due to {TerminationRequest.Cause.Name(cause)}: "
                    f"max_num_nodes={max_num_nodes}, "
                    f"max_num_nodes_per_type={max_num_nodes_per_type}"
                ),
            )
            if cause == TerminationRequest.Cause.MAX_NUM_NODES:
                node.termination_request.max_num_nodes = max_num_nodes
            elif cause == TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE:
                node.termination_request.max_num_nodes_per_type = max_num_nodes_per_type
            else:
                raise ValueError("Unknown termination cause: {}".format(cause))

        return terminated_nodes, remained_nodes

    @staticmethod
    def _sort_nodes_for_termination(node: SchedulingNode) -> Tuple:
        """
        Sort the nodes for termination increasingly by:

            1. First if ray hasn't been started yet
            2. Then if the nodes are idle
            3. Then with lower resources util nodes first.

        Such that nodes sorted earlier will be terminated first.
+ """ + + running_ray = len(node.ray_node_id) > 0 + # Reverse the idle duration such that the nodes with the largest idle duration + # will be terminated first. + idle_dur = -1 * node.idle_duration_ms + available_resources = node.get_available_resources( + ResourceRequestSource.PENDING_DEMAND + ) + + utils_per_resources = {} + for resource, total in node.total_resources.items(): + if total <= 0: + continue + utils_per_resources[resource] = ( + total - available_resources.get(resource, 0) + ) / total + + avg_util = ( + sum(utils_per_resources.values()) / len(utils_per_resources) + if utils_per_resources + else 0 + ) + + return (running_ray, idle_dur, avg_util) + + @staticmethod + def _enforce_min_workers_per_type( + ctx: "ResourceDemandScheduler.ScheduleContext", + ) -> None: + """ + Enforce the minimal count of nodes for each worker node type. + """ + + # Count the existing nodes by type + count_by_node_type = ctx.get_cluster_shape() + + new_nodes = [] + # Launch new nodes to satisfy min count for each node type. + for ( + node_type, + node_type_config, + ) in ctx.get_node_type_configs().items(): + cur_count = count_by_node_type.get(node_type, 0) + min_count = node_type_config.min_worker_nodes + if cur_count < min_count: + logger.info( + f"Adding {min_count - cur_count} nodes to satisfy min count for " + f"node type: {node_type}." + ) + new_nodes.extend( + [ + SchedulingNode.from_node_config( + copy.deepcopy(node_type_config), + status=SchedulingNodeStatus.TO_LAUNCH, + node_kind=NodeKind.WORKER, + ) + ] + * (min_count - cur_count) + ) + # NOTE: we assume the aggregated number of min workers across all node types + # should not exceed any globally enforced max_num_nodes + + # Add the new nodes to the existing nodes and update the context. 
+ ctx.update(new_nodes + ctx.get_nodes()) + + @staticmethod + def _enforce_resource_constraints( + ctx: "ResourceDemandScheduler.ScheduleContext", + constraints: List[ClusterResourceConstraint], + ) -> List[ClusterResourceConstraint]: + """ + Enforce the cluster resource constraints. + + Args: + ctx: The schedule context. + constraints: The cluster resource constraints. + + Returns: + A list of infeasible constraints. + + Notes: + It's different from the other scheduling functions since it doesn't actually + schedule any resource requests. Instead, it asks if the cluster could be + upscale to a certain shape to fulfill the constraints. + """ + + # NOTE: we currently only have 1 constraint from a cluster, but + # we may have multiple in the future. + assert len(constraints) <= 1, "Max 1 cluster resource constraint is supported." + if len(constraints) == 0: + # No cluster resource constraints - nothing needs to be done. + return [] + + constraint = constraints[0] + # Flatten the requests for iterating through. + requests = ResourceRequestUtil.ungroup_by_count(constraint.resource_requests) + + # Pass the empty nodes to schedule. + scheduled_nodes, infeasible = ResourceDemandScheduler._try_schedule( + ctx, + requests, + resource_request_source=ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT, + ) + + if infeasible: + # Unable to satisfy the constraint. + return [constraint] + + ctx.update(scheduled_nodes) + return [] + + @staticmethod + def _sched_resource_requests( + ctx: "ResourceDemandScheduler.ScheduleContext", + requests: List[ResourceRequest], + ) -> List[ResourceRequest]: + """ + Schedule the resource requests. + + Args: + ctx: The schedule context. + requests_by_count: The resource requests. + + Returns: + A list of infeasible resource requests. 
+ """ + nodes, infeasible = ResourceDemandScheduler._try_schedule( + ctx, requests, resource_request_source=ResourceRequestSource.PENDING_DEMAND + ) + + # Regardless if there's feasible, we will update the context for schedule nodes. + ctx.update(nodes) + + return infeasible + + @staticmethod + def _sched_gang_resource_requests( + ctx: "ResourceDemandScheduler.ScheduleContext", + gang_requests: List[GangResourceRequest], + ) -> List[GangResourceRequest]: + """ + Schedule the gang resource requests. + + These requests should be scheduled atomically, i.e. either all of the resources + requests in a gang request are scheduled or none of them are scheduled. + + For now, the gang resource requests represent Ray's placement groups, while it + could be more general in the future: + - For STRICT_PACK placement group requests, we combine them into a single + request and try to schedule them together. + - For STRICT_SPREAD placement groups requests, they should be scheduled on + different nodes by leveraging on the node labels that are associated with + the placement group. + If there are requests from rescheduling placement groups due to node + failures, these requests should not be scheduled on nodes with requests + from the same placement group. + + + Args: + ctx: The schedule context. + gang_requests: The gang resource requests. + + Returns: + A list of infeasible gang resource requests. + """ + + def _sort_gang_resource_requests(req: GangResourceRequest) -> Tuple: + """ + Key function for sorting the gang resource request by: + 1. the number of placement constraints in the gang request. + 2. the number of resource requests in the gang request. + """ + total_placement_constraints = 0 + for resource_request in req.requests: + total_placement_constraints += len( + resource_request.placement_constraints + ) + + return (total_placement_constraints, len(req.requests)) + + infeasible_gang_requests = [] + # Try fulfilling the gang requests one by one. 
+ for gang_req in sorted( + gang_requests, key=_sort_gang_resource_requests, reverse=True + ): + requests = gang_req.requests + # Try to combine requests with affinity constraints into the same request. + requests = ResourceRequestUtil.combine_requests_with_affinity(requests) + + nodes, infeasible = ResourceDemandScheduler._try_schedule( + ctx, requests, ResourceRequestSource.PENDING_DEMAND + ) + + if infeasible: + # Unable to satisfy the constraint. We will skip the gang request. + # Don't update the context. + infeasible_gang_requests.append(gang_req) + continue + + # We are able to satisfy the constraint and thus update the context. + ctx.update(nodes) + + return infeasible_gang_requests + + @staticmethod + def _try_schedule( + ctx: "ResourceDemandScheduler.ScheduleContext", + requests_to_sched: List[ResourceRequest], + resource_request_source: ResourceRequestSource, + ) -> Tuple[List[SchedulingNode], List[ResourceRequest]]: + """ + Try to schedule the resource requests on the current context. + + It tries to schedule the requests on the existing nodes first, and + then try to schedule the requests on new nodes if possible. + + Args: + requests_to_sched: The resource requests to be scheduled. + ctx: The current scheduling context. + resource_request_source: The source of the resource request, i.e. + pending demands from ray actors/tasks or cluster resource + constraints. + + Returns: + - List of scheduled nodes to that have part or all of the requests + scheduled. + - List of infeasible requests remained that cannot be scheduled. + """ + # First sort the requests. + def _sort_resource_request(req: ResourceRequest) -> Tuple: + """ + Sort the resource requests by: + 1. The length of it's placement constraints. + 2. The number of resources it requests. + 3. The values of resources it requests. + 4. 
lexicographically for each resource (for stable ordering) + + This is a legacy sorting function for the autoscaler's binpacking + algo - we do this so that we could have a deterministic scheduling + results with reasonable fragmentation. + """ + return ( + len(req.placement_constraints), + len(req.resources_bundle.values()), + sum(req.resources_bundle.values()), + sorted(req.resources_bundle.items()), + ) + + requests_to_sched = sorted( + requests_to_sched, key=_sort_resource_request, reverse=True + ) + + existing_nodes = ctx.get_nodes() + node_type_available = ctx.get_node_type_available() + + # A list of nodes that are either: + # 1. existing nodes in the cluster. or + # 2. new nodes that are launched to satisfy the resource requests. + target_nodes = [] + + # Try scheduling resource requests with existing nodes first. + while len(requests_to_sched) > 0 and len(existing_nodes) > 0: + ( + best_node, + requests_to_sched, + existing_nodes, + ) = ResourceDemandScheduler._sched_best_node( + requests_to_sched, existing_nodes, resource_request_source + ) + if best_node is None: + # No existing nodes can schedule any more requests. + break + + target_nodes.append(best_node) + + # If there's any existing nodes left, we will add to the target nodes + target_nodes.extend(existing_nodes) + + # Try scheduling resource requests with new nodes. + node_pools = [ + SchedulingNode.from_node_config( + ctx.get_node_type_configs()[node_type], + status=SchedulingNodeStatus.TO_LAUNCH, + node_kind=NodeKind.WORKER, + ) + for node_type, num_available in node_type_available.items() + if num_available > 0 + ] + while len(requests_to_sched) > 0 and len(node_pools) > 0: + # Max number of nodes reached. 
+ max_num_nodes = ctx.get_max_num_nodes() + if max_num_nodes is not None and len(target_nodes) >= max_num_nodes: + logger.debug( + "Max number of nodes reached: {}, " + "cannot launch more nodes.".format(max_num_nodes) + ) + break + + ( + best_node, + requests_to_sched, + node_pools, + ) = ResourceDemandScheduler._sched_best_node( + requests_to_sched, node_pools, resource_request_source + ) + if best_node is None: + break + + target_nodes.append(best_node) + # Update the node pool if a node with the same node type of the + # added node can be launched. + node_type_available[best_node.node_type] -= 1 + if node_type_available[best_node.node_type] > 0: + node_pools.append( + SchedulingNode.from_node_config( + ctx.get_node_type_configs()[best_node.node_type], + status=SchedulingNodeStatus.TO_LAUNCH, + node_kind=NodeKind.WORKER, + ) + ) + + return target_nodes, requests_to_sched + + @staticmethod + def _sched_best_node( + requests: List[ResourceRequest], + nodes: List[SchedulingNode], + resource_request_source: ResourceRequestSource, + ) -> Tuple[SchedulingNode, List[ResourceRequest], List[SchedulingNode]]: + """ + Schedule the requests on the best node. + A simple greedy algorithm is used to schedule the requests: + 1. Try to schedule the requests on each node. + 2. Sort the nodes by a score + 3. Return the node with the highest score. + + The highest score node is updated with the scheduled requests, and the node is + removed from the node list. + + Args: + requests: The resource requests to be scheduled. + nodes: The node candidates to be scheduled on. The nodes will be updated + after the scheduling attempt, i.e. the node that is scheduled will be + removed from the list. + resource_request_source: The source of the resource request, i.e. + pending demands from ray actors/tasks or cluster resource constraints. + + Returns: + best_node: The best node to schedule the requests. + infeasible: The infeasible requests that cannot be scheduled on the best + node. 
+ nodes: Remaining nodes after the best node is removed. + """ + results = [] + + # A temporary data class to store the scheduling result. + @dataclass + class ScheduleResult: + # The node candidate after a scheduling attempt. + node: SchedulingNode + # The infeasible resource requests that are not scheduled. + infeasible_requests: List[ResourceRequest] + # The index of the node in the original node list. + idx: int + # the score of the scheduling node to compare with others. + score: UtilizationScore + + nodes_copy = copy.deepcopy(nodes) + + # Iterate through each node and modify the node's available resources + # if the requests are schedulable. + for idx, node in enumerate(nodes_copy): + remaining, score = node.try_schedule(requests, resource_request_source) + + if len(remaining) == len(requests): + # The node cannot schedule any of the requests. + continue + + results.append(ScheduleResult(node, remaining, idx, score)) + + # No nodes can schedule any of the requests. + if len(results) == 0: + logger.debug( + "No nodes can schedule the requests: {}, for nodes: {}".format( + ResourceRequestUtil.to_dict_list(requests), nodes + ) + ) + return None, requests, nodes + + # Sort the results by score. + results = sorted(results, key=lambda r: r.score, reverse=True) + best_result = results[0] + + # Remove the best node from the nodes. + nodes.pop(best_result.idx) + logger.debug( + "Best node: {}, score: {}, remaining requests: {}".format( + best_result.node, + best_result.score, + ResourceRequestUtil.to_dict_list(best_result.infeasible_requests), + ) + ) + return best_result.node, best_result.infeasible_requests, nodes + + @staticmethod + def _terminate_outdated_nodes( + ctx: "ResourceDemandScheduler.ScheduleContext", + ) -> None: + """ + Terminate the nodes that are outdated, i.e. the node type config has been + updated or the node's launch config hash is outdated. + + Args: + ctx: The schedule context. 
+ """ + nodes = ctx.get_nodes() + + if ctx._disable_launch_config_check: + # Outdated nodes check through launch config check is disabled. + return + + for node in nodes: + if node.status != SchedulingNodeStatus.SCHEDULABLE: + # We don't need to care about the non-running nodes. + continue + + if node.node_kind == NodeKind.HEAD: + # We should not be terminating the head node even if it's outdated. + logger.warning( + f"Head node {node.im_instance_id}(ray={node.ray_node_id}) is " + "outdated with node config changes. " + "Please check the node's config or restart the cluster or restart " + "the head node. Autoscaler is not able to shutdown the outdated " + "head node" + ) + continue + node_type = node.node_type + node_type_config = ctx.get_node_type_configs().get(node_type) + if node_type_config is None or ( + node_type_config.launch_config_hash + and node_type_config.launch_config_hash != node.launch_config_hash + ): + # The node type config has been updated, and the node's launch config + # hash is outdated. + node.status = SchedulingNodeStatus.TO_TERMINATE + node.termination_request = TerminationRequest( + id=str(time.time_ns()), + instance_id=node.im_instance_id, + ray_node_id=node.ray_node_id, + instance_type=node.node_type, + cause=TerminationRequest.Cause.OUTDATED, + details=f"node from {node.node_type} has outdated config", + ) + + ctx.update(nodes) + + @staticmethod + def _enforce_idle_termination( + ctx: "ResourceDemandScheduler.ScheduleContext", + ) -> None: + """ + Enforce the idle termination for the nodes that are not needed by the cluster + resource constraints and idle for too long. + + Args: + ctx: The schedule context. 
+ """ + count_by_node_type = ctx.get_cluster_shape() + node_type_configs = ctx.get_node_type_configs() + terminate_nodes_by_type: Dict[NodeType, int] = defaultdict(int) + + nodes = ctx.get_nodes() + s_to_ms = 1000 + for node in nodes: + if node.status != SchedulingNodeStatus.SCHEDULABLE: + # We don't need to care about the non-running nodes. + continue + + if node.node_kind == NodeKind.HEAD: + # The head node is not subject to idle termination. + continue + + idle_timeout_s = ctx.get_idle_timeout_s() + # Override the scheduler idle_timeout_s if set for this node_type. + node_type = node.node_type + if node_type in node_type_configs: + if node_type_configs[node_type].idle_timeout_s is not None: + idle_timeout_s = node_type_configs[node_type].idle_timeout_s + if idle_timeout_s is None: + # No idle timeout is set, skip the idle termination. + continue + + if node.idle_duration_ms <= idle_timeout_s * s_to_ms: + # The node is not idle for too long, skip it. + continue + + if node.sched_requests[ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT]: + # The node is needed by the resource constraints. + # Skip it. + if node.idle_duration_ms > ctx.get_idle_timeout_s() * s_to_ms: + logger.debug( + "Node {} (idle for {} secs) is needed by the cluster resource " + "constraints, skip idle termination.".format( + node.ray_node_id, node.idle_duration_ms / s_to_ms + ) + ) + continue + + # Honor the min_worker_nodes setting for the node type. + min_count = 0 + if node_type in node_type_configs: + min_count = node_type_configs[node_type].min_worker_nodes + if ( + count_by_node_type.get(node_type, 0) + - terminate_nodes_by_type[node_type] + <= min_count + ): + logger.info( + "Node {} (idle for {} secs) belongs to node_type {} and is " + "required by min_worker_nodes, skipping idle termination.".format( + node.ray_node_id, node.idle_duration_ms / s_to_ms, node_type + ) + ) + continue + + terminate_nodes_by_type[node.node_type] += 1 + # The node is idle for too long, terminate it. 
+ node.status = SchedulingNodeStatus.TO_TERMINATE + node.termination_request = TerminationRequest( + id=str(uuid.uuid4()), + instance_id=node.im_instance_id, + ray_node_id=node.ray_node_id, + cause=TerminationRequest.Cause.IDLE, + instance_type=node.node_type, + idle_duration_ms=node.idle_duration_ms, + details=f"idle for {node.idle_duration_ms/s_to_ms} secs > " + f"timeout={idle_timeout_s} secs", + ) + + ctx.update(nodes) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..76eda2ec57c5127c7993d3eab6f201b3ecff395b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py @@ -0,0 +1,351 @@ +import re +from dataclasses import dataclass, field +from enum import Enum +from typing import Dict, List, Optional, Tuple + +from ray.autoscaler.v2.instance_manager.common import InstanceUtil +from ray.core.generated.autoscaler_pb2 import NodeState, NodeStatus +from ray.core.generated.instance_manager_pb2 import Instance + +# TODO(rickyx): once we have graceful shutdown, we could populate +# the failure detail with the actual termination message. As of now, +# we will use a more generic message to include cases such as: +# (idle termination, node death, crash, preemption, etc) +NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated" + + +# e.g., cpu_4_ondemand. +NodeType = str + + +@dataclass +class ResourceUsage: + # Resource name. + resource_name: str = "" + # Total resource. + total: float = 0.0 + # Resource used. + used: float = 0.0 + + +@dataclass +class NodeUsage: + # The node resource usage. + usage: List[ResourceUsage] + # How long the node has been idle. + idle_time_ms: int + + +@dataclass +class NodeInfo: + # The instance type name, e.g. p3.2xlarge + instance_type_name: str + # ray node type name. + ray_node_type_name: str + # Cloud instance id. 
+ instance_id: str + # Ip address of the node when alive. + ip_address: str + # The status of the node. Optional for pending nodes. + node_status: Optional[str] = None + # ray node id in hex. None if still pending. + node_id: Optional[str] = None + # Resource usage breakdown if node is running. + resource_usage: Optional[NodeUsage] = None + # Failure detail if the node failed. + failure_detail: Optional[str] = None + # Descriptive details. + details: Optional[str] = None + # Activity on the node. + node_activity: Optional[List[str]] = None + + def total_resources(self) -> Dict[str, float]: + if self.resource_usage is None: + return {} + return {r.resource_name: r.total for r in self.resource_usage.usage} + + def available_resources(self) -> Dict[str, float]: + if self.resource_usage is None: + return {} + return {r.resource_name: r.total - r.used for r in self.resource_usage.usage} + + def used_resources(self) -> Dict[str, float]: + if self.resource_usage is None: + return {} + return {r.resource_name: r.used for r in self.resource_usage.usage} + + +@dataclass +class LaunchRequest: + class Status(Enum): + FAILED = "FAILED" + PENDING = "PENDING" + + # The instance type name, e.g. p3.2xlarge + instance_type_name: str + # ray node type name. + ray_node_type_name: str + # count. + count: int + # State: (e.g. PENDING, FAILED) + state: Status + # When the launch request was made in unix timestamp in secs. + request_ts_s: int + # When the launch request failed unix timestamp in secs if failed. + failed_ts_s: Optional[int] = None + # Request details, e.g. error reason if the launch request failed. + details: Optional[str] = None + + +@dataclass +class ResourceRequestByCount: + # Bundles in the demand. + bundle: Dict[str, float] + # Number of bundles with the same shape. + count: int + + def __str__(self) -> str: + return f"[{self.count} {self.bundle}]" + + +@dataclass +class ResourceDemand: + # The bundles in the demand with shape and count info. 
+ bundles_by_count: List[ResourceRequestByCount] + + +@dataclass +class PlacementGroupResourceDemand(ResourceDemand): + # Details string (parsed into below information) + details: str + # Placement group's id. + pg_id: Optional[str] = None + # Strategy, e.g. STRICT_SPREAD + strategy: Optional[str] = None + # Placement group's state, e.g. PENDING + state: Optional[str] = None + + def __post_init__(self): + if not self.details: + return + + # Details in the format of :|, parse + # it into the above fields. + pattern = r"^.*:.*\|.*$" + match = re.match(pattern, self.details) + if not match: + return + + pg_id, details = self.details.split(":") + strategy, state = details.split("|") + self.pg_id = pg_id + self.strategy = strategy + self.state = state + + +@dataclass +class RayTaskActorDemand(ResourceDemand): + pass + + +@dataclass +class ClusterConstraintDemand(ResourceDemand): + pass + + +@dataclass +class ResourceDemandSummary: + # Placement group demand. + placement_group_demand: List[PlacementGroupResourceDemand] = field( + default_factory=list + ) + # Ray task actor demand. + ray_task_actor_demand: List[RayTaskActorDemand] = field(default_factory=list) + # Cluster constraint demand. + cluster_constraint_demand: List[ClusterConstraintDemand] = field( + default_factory=list + ) + + +@dataclass +class Stats: + # How long it took to get the GCS request. + # This is required when initializing the Stats since it should be calculated before + # the request was made. + gcs_request_time_s: float + # How long it took to get all live instances from node provider. + none_terminated_node_request_time_s: Optional[float] = None + # How long for autoscaler to process the scaling decision. + autoscaler_iteration_time_s: Optional[float] = None + # The last seen autoscaler state version from Ray. + autoscaler_version: Optional[str] = None + # The last seen cluster state resource version. 
+ cluster_resource_state_version: Optional[str] = None + # Request made time unix timestamp: when the data was pulled from GCS. + request_ts_s: Optional[int] = None + + +@dataclass +class ClusterStatus: + # Healthy nodes information (non-idle) + active_nodes: List[NodeInfo] = field(default_factory=list) + # Idle node information + idle_nodes: List[NodeInfo] = field(default_factory=list) + # Pending launches. + pending_launches: List[LaunchRequest] = field(default_factory=list) + # Failed launches. + failed_launches: List[LaunchRequest] = field(default_factory=list) + # Pending nodes. + pending_nodes: List[NodeInfo] = field(default_factory=list) + # Failures + failed_nodes: List[NodeInfo] = field(default_factory=list) + # Resource usage summary for entire cluster. + cluster_resource_usage: List[ResourceUsage] = field(default_factory=list) + # Demand summary. + resource_demands: ResourceDemandSummary = field( + default_factory=ResourceDemandSummary + ) + # Query metics + stats: Stats = field(default_factory=Stats) + + def total_resources(self) -> Dict[str, float]: + return {r.resource_name: r.total for r in self.cluster_resource_usage} + + def available_resources(self) -> Dict[str, float]: + return {r.resource_name: r.total - r.used for r in self.cluster_resource_usage} + + # TODO(rickyx): we don't show infeasible requests as of now. + # (They will just be pending forever as part of the demands) + # We should show them properly in the future. + + +@dataclass +class AutoscalerInstance: + """ + AutoscalerInstance represents an instance that's managed by the autoscaler. + This includes two states: + 1. the instance manager state: information of the underlying cloud instance. + 2. the ray node state, e.g. resources, ray node status. + + The two states are linked by the cloud instance id, which should be set + when the ray node is started. + """ + + # The cloud instance id. It could be None if the instance hasn't been assigned + # a cloud instance id, e.g. 
the instance is still in QUEUED or REQUESTED status. + cloud_instance_id: Optional[str] = None + + # The ray node state status. It could be None when no ray node is running + # or has run on the cloud instance: for example, ray is still being installed + # or the instance manager hasn't had a cloud instance assigned (e.g. QUEUED, + # REQUESTED). + ray_node: Optional[NodeState] = None + + # The instance manager instance state. It would be None when the ray_node is not + # None. + # It could be None iff: + # 1. There's a ray node, but the instance manager hasn't discovered the + # cloud instance that's running this ray process yet. This could happen since + # the instance manager only discovers instances periodically. + # + # 2. There was a ray node running on the cloud instance, which was already stopped + # and removed from the instance manager state. But the ray state is still lagging + # behind. + # + # 3. There is a ray node that's unmanaged by the instance manager. + # + im_instance: Optional[Instance] = None + + # | cloud_instance_id | ray_node | im_instance | + # |-------------------|----------|-------------| + # | None | None | None | Not possible. + # | None | None | not None | OK. An instance hasn't had ray running on it yet. # noqa E501 + # | None | Not None | None | OK. Possible if the ray node is not started by autoscaler. # noqa E501 + # | None | Not None | not None | Not possible - no way to link im instance with ray node. # noqa E501 + # | not None | None | None | Not possible since cloud instance id is either part of im state or ray node. # noqa E501 + # | not None | None | not None | OK. e.g. An instance that's not running ray yet. # noqa E501 + # | not None | Not None | None | OK. See scenario 1, 2, 3 above. + # | not None | Not None | not None | OK. An instance that's running ray. + def validate(self) -> Tuple[bool, str]: + """Validate the autoscaler instance state. 
+ + Returns: + A tuple of (valid, error_msg) where: + - valid is whether the state is valid + - error_msg is the error message for the validation results. + """ + + state_combinations = { + # (cloud_instance_id is None, ray_node is None, im_instance is None): (valid, error_msg) # noqa E501 + (True, True, True): (False, "Not possible"), + (True, True, False): (True, ""), + (True, False, True): ( + True, + "There's a ray node w/o cloud instance id, must be started not " + "by autoscaler", + ), + (True, False, False): ( + False, + "Not possible - no way to link im instance with ray node", + ), + (False, True, True): ( + False, + "Not possible since cloud instance id is either part of " + "im state or ray node", + ), + (False, True, False): (True, ""), + (False, False, True): (True, ""), + (False, False, False): (True, ""), + } + + valid, error_msg = state_combinations[ + ( + self.cloud_instance_id is None, + self.ray_node is None, + self.im_instance is None, + ) + ] + if not valid: + return valid, error_msg + + if self.im_instance is not None and self.ray_node is None: + # We don't see a ray node, but tracking an im instance. + if self.cloud_instance_id is None: + if InstanceUtil.is_cloud_instance_allocated(self.im_instance.status): + return ( + False, + "instance should be in a status where cloud instance " + "is not allocated.", + ) + else: + if not InstanceUtil.is_cloud_instance_allocated( + self.im_instance.status + ): + return ( + False, + "instance should be in a status where cloud instance is " + "allocated.", + ) + + if self.ray_node is not None: + if self.cloud_instance_id != self.ray_node.instance_id: + return False, "cloud instance id doesn't match." + + if self.im_instance is not None and self.cloud_instance_id is not None: + if self.cloud_instance_id != self.im_instance.cloud_instance_id: + return False, "cloud instance id doesn't match." 
+ + return True, "" + + def is_ray_running(self) -> bool: + """Whether the ray node is running.""" + return self.ray_node is not None and self.ray_node.status in [ + NodeStatus.RUNNING, + NodeStatus.IDLE, + ] + + def is_ray_stop(self) -> bool: + """Whether the ray node is stopped.""" + return self.ray_node is None or self.ray_node.status in [ + NodeStatus.DEAD, + ] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/sdk.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/sdk.py new file mode 100644 index 0000000000000000000000000000000000000000..ad11723994ac1407ce542f87e1068a4d57e286e6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/sdk.py @@ -0,0 +1,110 @@ +import time +from collections import defaultdict +from typing import List + +from ray._raylet import GcsClient +from ray.autoscaler.v2.schema import ClusterStatus, Stats +from ray.autoscaler.v2.utils import ClusterStatusParser +from ray.core.generated.autoscaler_pb2 import ( + ClusterResourceState, + GetClusterResourceStateReply, + GetClusterStatusReply, + NodeState, +) + +DEFAULT_RPC_TIMEOUT_S = 10 + + +def request_cluster_resources( + gcs_address: str, to_request: List[dict], timeout: int = DEFAULT_RPC_TIMEOUT_S +): + """Request resources from the autoscaler. + + This will add a cluster resource constraint to GCS. GCS will asynchronously + pass the constraint to the autoscaler, and the autoscaler will try to provision the + requested minimal bundles in `to_request`. + + If the cluster already has `to_request` resources, this will be an no-op. + Future requests submitted through this API will overwrite the previous requests. + + Args: + gcs_address: The GCS address to query. + to_request: A list of resource bundles to request the cluster to have. + Each bundle is a dict of resource name to resource quantity, e.g: + [{"CPU": 1}, {"GPU": 1}]. 
+ timeout: Timeout in seconds for the request to be timeout + + """ + assert len(gcs_address) > 0, "GCS address is not specified." + + # Aggregate bundle by shape. + resource_requests_by_count = defaultdict(int) + for request in to_request: + bundle = frozenset(request.items()) + resource_requests_by_count[bundle] += 1 + + bundles = [] + counts = [] + for bundle, count in resource_requests_by_count.items(): + bundles.append(dict(bundle)) + counts.append(count) + + GcsClient(gcs_address).request_cluster_resource_constraint( + bundles, counts, timeout_s=timeout + ) + + +def get_cluster_status( + gcs_address: str, timeout: int = DEFAULT_RPC_TIMEOUT_S +) -> ClusterStatus: + """ + Get the cluster status from the autoscaler. + + Args: + gcs_address: The GCS address to query. + timeout: Timeout in seconds for the request to be timeout + + Returns: + A ClusterStatus object. + """ + assert len(gcs_address) > 0, "GCS address is not specified." + req_time = time.time() + str_reply = GcsClient(gcs_address).get_cluster_status(timeout_s=timeout) + reply_time = time.time() + reply = GetClusterStatusReply() + reply.ParseFromString(str_reply) + + # TODO(rickyx): To be more accurate, we could add a timestamp field from the reply. + return ClusterStatusParser.from_get_cluster_status_reply( + reply, + stats=Stats(gcs_request_time_s=reply_time - req_time, request_ts_s=req_time), + ) + + +def get_cluster_resource_state(gcs_client: GcsClient) -> ClusterResourceState: + """ + Get the cluster resource state from GCS. + Args: + gcs_client: The GCS client to query. + Returns: + A ClusterResourceState object + Raises: + Exception: If the request times out or failed. + """ + str_reply = gcs_client.get_cluster_resource_state() + reply = GetClusterResourceStateReply() + reply.ParseFromString(str_reply) + return reply.cluster_resource_state + + +def is_head_node(node_state: NodeState) -> bool: + """ + Check if the node is a head node from the node state. 
+ Args: + node_state: the node state + Returns: + is_head: True if the node is a head node, False otherwise. + """ + # TODO: we should include this bit of information in the future. e.g. + # from labels. + return "node:__internal_head__" in dict(node_state.total_resources) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/utils.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5855a90a4e04db5a65c9f2bc4c9e16c964cb0512 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/v2/utils.py @@ -0,0 +1,851 @@ +from collections import Counter, defaultdict +from copy import deepcopy +from datetime import datetime +from enum import Enum +from itertools import chain +from typing import Any, Dict, List, Optional, Tuple + +import ray +from ray._private.utils import binary_to_hex +from ray._raylet import GcsClient +from ray.autoscaler._private.autoscaler import AutoscalerSummary +from ray.autoscaler._private.node_provider_availability_tracker import ( + NodeAvailabilityRecord, + NodeAvailabilitySummary, + UnavailableNodeInformation, +) +from ray.autoscaler._private.util import LoadMetricsSummary, format_info_string +from ray.autoscaler.v2.schema import ( + NODE_DEATH_CAUSE_RAYLET_DIED, + ClusterConstraintDemand, + ClusterStatus, + LaunchRequest, + NodeInfo, + NodeUsage, + PlacementGroupResourceDemand, + RayTaskActorDemand, + ResourceDemand, + ResourceDemandSummary, + ResourceRequestByCount, + ResourceUsage, + Stats, +) +from ray.core.generated.autoscaler_pb2 import ( + AffinityConstraint, + AntiAffinityConstraint, + AutoscalingState, + ClusterResourceState, + GetClusterStatusReply, + NodeState, + NodeStatus, + PlacementConstraint, + ResourceRequest, +) +from ray.core.generated.autoscaler_pb2 import ( + ResourceRequestByCount as ResourceRequestByCountProto, +) +from ray.experimental.internal_kv import internal_kv_get_gcs_client + + +def _count_by(data: Any, key: str) 
-> Dict[str, int]: + """ + Count the number of items by the given keys. + Args: + data: the data to be counted + keys: the keys to count by + Returns: + counts: the counts + """ + counts = defaultdict(int) + for item in data: + key_name = getattr(item, key) + counts[key_name] += 1 + return counts + + +class ProtobufUtil: + """ + A utility class for protobuf objects. + """ + + @staticmethod + def to_dict(proto): + """ + Convert a protobuf object to a dict. + + This is a slow conversion, and should only be used for debugging or + latency insensitve code. + + Args: + proto: the protobuf object + Returns: + dict: the dict + """ + from ray._private.protobuf_compat import message_to_dict + + return message_to_dict( + proto, + preserving_proto_field_name=True, + always_print_fields_with_no_presence=True, + ) + + @staticmethod + def to_dict_list(protos): + """ + Convert a list of protobuf objects to a list of dicts. + + Args: + protos: the list of protobuf objects + Returns: + dict_list: the list of dicts + """ + return [ProtobufUtil.to_dict(proto) for proto in protos] + + +class ResourceRequestUtil(ProtobufUtil): + """ + A utility class for resource requests, autoscaler.proto.ResourceRequest + """ + + class PlacementConstraintType(Enum): + """ + The affinity type for the resource request. + """ + + ANTI_AFFINITY = "ANTI_AFFINITY" + AFFINITY = "AFFINITY" + + @staticmethod + def group_by_count( + requests: List[ResourceRequest], + ) -> List[ResourceRequestByCountProto]: + """ + Aggregate resource requests by shape. 
+ Args: + requests: the list of resource requests + Returns: + resource_requests_by_count: the aggregated resource requests by count + """ + resource_requests_by_count = defaultdict(int) + for request in requests: + serialized_request = request.SerializeToString() + resource_requests_by_count[serialized_request] += 1 + + results = [] + for serialized_request, count in resource_requests_by_count.items(): + request = ResourceRequest() + request.ParseFromString(serialized_request) + results.append(ResourceRequestByCountProto(request=request, count=count)) + + return results + + @staticmethod + def ungroup_by_count( + requests_by_count: List[ResourceRequestByCountProto], + ) -> List[ResourceRequest]: + """ + Flatten the resource requests by count to resource requests. + Args: + requests_by_count: the resource requests by count + Returns: + requests: the flattened resource requests + """ + reqs = [] + for r in requests_by_count: + reqs += [r.request] * r.count + + return reqs + + @staticmethod + def to_resource_map( + request: ResourceRequest, + ) -> Dict[str, float]: + """ + Convert the resource request by count to resource map. + Args: + request: the resource request + Returns: + resource_map: the resource map + """ + resource_map = defaultdict(float) + for k, v in request.resources_bundle.items(): + resource_map[k] += v + return dict(resource_map) + + @staticmethod + def to_resource_maps( + requests: List[ResourceRequest], + ) -> List[Dict[str, float]]: + """ + Convert the resource requests by count to resource map. + Args: + requests: the resource requests + Returns: + resource_maps: list of resource map + """ + return [ResourceRequestUtil.to_resource_map(r) for r in requests] + + @staticmethod + def make( + resources_map: Dict[str, float], + constraints: Optional[List[Tuple[PlacementConstraintType, str, str]]] = None, + ) -> ResourceRequest: + """ + Make a resource request from the given resources map. 
+ Args: + resources_map: the resources map + Returns: + request: the resource request + """ + request = ResourceRequest() + for resource_name, quantity in resources_map.items(): + request.resources_bundle[resource_name] = quantity + + if constraints is None: + return request + + for constraint_type, label, value in constraints: + if constraint_type == ResourceRequestUtil.PlacementConstraintType.AFFINITY: + request.placement_constraints.append( + PlacementConstraint( + affinity=AffinityConstraint(label_name=label, label_value=value) + ) + ) + elif ( + constraint_type + == ResourceRequestUtil.PlacementConstraintType.ANTI_AFFINITY + ): + request.placement_constraints.append( + PlacementConstraint( + anti_affinity=AntiAffinityConstraint( + label_name=label, label_value=value + ) + ) + ) + else: + raise ValueError(f"Unknown constraint type: {constraint_type}") + + return request + + @staticmethod + def combine_requests_with_affinity( + resource_requests: List[ResourceRequest], + ) -> List[ResourceRequest]: + """ + Combine the resource requests with affinity constraints + into the same request. This is so that requests with affinity + constraints could be considered and placed together. + + It merges the resource requests with the same affinity constraints + into one request, and dedup the placement constraints. + + This assumes following: + 1. There's only at most 1 placement constraint, either an affinity + constraint OR an anti-affinity constraint. + + Args: + resource_requests: The list of resource requests to be combined. + Returns: + A list of combined resource requests. 
+ """ + + # Map of set of serialized affinity constraint to the list of resource requests + requests_by_affinity: Dict[ + Tuple[str, str], List[ResourceRequest] + ] = defaultdict(list) + combined_requests: List[ResourceRequest] = [] + + for request in resource_requests: + assert len(request.placement_constraints) <= 1, ( + "There should be at most 1 placement constraint, " + "either an affinity constraint OR an anti-affinity constraint." + ) + + if len(request.placement_constraints) == 0: + # No affinity constraints, just add to the combined requests. + combined_requests.append(request) + continue + + constraint = request.placement_constraints[0] + + if constraint.HasField("affinity"): + affinity = constraint.affinity + requests_by_affinity[ + (affinity.label_name, affinity.label_value) + ].append(request) + elif constraint.HasField("anti_affinity"): + # We don't need to combine requests with anti-affinity constraints. + combined_requests.append(request) + + for ( + affinity_label_name, + affinity_label_value, + ), requests in requests_by_affinity.items(): + combined_request = ResourceRequest() + + # Merge the resource bundles with the same affinity constraint. + for request in requests: + for k, v in request.resources_bundle.items(): + combined_request.resources_bundle[k] = ( + combined_request.resources_bundle.get(k, 0) + v + ) + + # Add the placement constraint to the combined request. + affinity_constraint = AffinityConstraint( + label_name=affinity_label_name, label_value=affinity_label_value + ) + combined_request.placement_constraints.append( + PlacementConstraint(affinity=affinity_constraint) + ) + + combined_requests.append(combined_request) + + return combined_requests + + +class ClusterStatusFormatter: + """ + A formatter to format the ClusterStatus into a string. + + TODO(rickyx): We right now parse the ClusterStatus to the legacy format + by using the `format_info_string`. 
+ In the future, we should refactor the `format_info_string` to directly format + the ClusterStatus into a string as we migrate eventually away from v1. + + """ + + @classmethod + def format(cls, data: ClusterStatus, verbose: bool = False) -> str: + lm_summary = cls._parse_lm_summary(data) + autoscaler_summary = cls._parse_autoscaler_summary(data) + + return format_info_string( + lm_summary, + autoscaler_summary, + time=datetime.fromtimestamp(data.stats.request_ts_s), + gcs_request_time=data.stats.gcs_request_time_s, + non_terminated_nodes_time=data.stats.none_terminated_node_request_time_s, + autoscaler_update_time=data.stats.autoscaler_iteration_time_s, + verbose=verbose, + ) + + @classmethod + def _parse_autoscaler_summary(cls, data: ClusterStatus) -> AutoscalerSummary: + active_nodes = _count_by(data.active_nodes, "ray_node_type_name") + idle_nodes = _count_by(data.idle_nodes, "ray_node_type_name") + pending_launches = _count_by(data.pending_launches, "ray_node_type_name") + pending_nodes = [] + for node in data.pending_nodes: + # We are using details for the pending node's status. + # TODO(rickyx): we should probably use instance id rather than ip address + # here. + pending_nodes.append( + (node.ip_address, node.ray_node_type_name, node.details) + ) + + failed_nodes = [] + for node in data.failed_nodes: + # TODO(rickyx): we should probably use instance id/node id rather + # than node ip here since node ip is not unique among failed nodes. + failed_nodes.append((node.ip_address, node.ray_node_type_name)) + + # From IP to node type name. + node_type_mapping = {} + for node in chain(data.active_nodes, data.idle_nodes): + node_type_mapping[node.ip_address] = node.ray_node_type_name + + # Transform failed launches to node_availability_summary + node_availabilities = {} + for failed_launch in data.failed_launches: + # TODO(rickyx): we could also add failed timestamp, count info. 
+ node_availabilities[ + failed_launch.ray_node_type_name + ] = NodeAvailabilityRecord( + node_type=failed_launch.ray_node_type_name, + is_available=False, + last_checked_timestamp=failed_launch.request_ts_s, + unavailable_node_information=UnavailableNodeInformation( + category="LaunchFailed", + description=failed_launch.details, + ), + ) + node_availabilities = NodeAvailabilitySummary( + node_availabilities=node_availabilities + ) + + node_activities = { + node.node_id: (node.ip_address, node.node_activity) + for node in data.active_nodes + } + + return AutoscalerSummary( + active_nodes=active_nodes, + idle_nodes=idle_nodes, + pending_launches=pending_launches, + pending_nodes=pending_nodes, + failed_nodes=failed_nodes, + pending_resources={}, # NOTE: This is not used in ray status. + node_type_mapping=node_type_mapping, + node_availability_summary=node_availabilities, + node_activities=node_activities, + ) + + @classmethod + def _parse_lm_summary(cls, data: ClusterStatus) -> LoadMetricsSummary: + usage = { + u.resource_name: (u.used, u.total) for u in data.cluster_resource_usage + } + resource_demands = [] + for demand in data.resource_demands.ray_task_actor_demand: + for bundle_by_count in demand.bundles_by_count: + resource_demands.append((bundle_by_count.bundle, bundle_by_count.count)) + + pg_demand = [] + pg_demand_strs = [] + pg_demand_str_to_demand = {} + for pg_demand in data.resource_demands.placement_group_demand: + s = pg_demand.strategy + "|" + pg_demand.state + pg_demand_strs.append(s) + pg_demand_str_to_demand[s] = pg_demand + + pg_freqs = Counter(pg_demand_strs) + pg_demand = [ + ( + { + "strategy": pg_demand_str_to_demand[pg_str].strategy, + "bundles": [ + (bundle_count.bundle, bundle_count.count) + for bundle_count in pg_demand_str_to_demand[ + pg_str + ].bundles_by_count + ], + }, + freq, + ) + for pg_str, freq in pg_freqs.items() + ] + + request_demand = [ + (bc.bundle, bc.count) + for constraint_demand in 
data.resource_demands.cluster_constraint_demand + for bc in constraint_demand.bundles_by_count + ] + + usage_by_node = {} + node_type_mapping = {} + idle_time_map = {} + for node in chain(data.active_nodes, data.idle_nodes): + # TODO(rickyx): we should actually add node type info here. + # TODO(rickyx): we could also show node idle time. + usage_by_node[node.node_id] = { + u.resource_name: (u.used, u.total) for u in node.resource_usage.usage + } + node_type_mapping[node.node_id] = node.ray_node_type_name + idle_time_map[node.node_id] = node.resource_usage.idle_time_ms + + return LoadMetricsSummary( + usage=usage, + resource_demand=resource_demands, + pg_demand=pg_demand, + request_demand=request_demand, + node_types=None, # NOTE: This is not needed in ray status. + usage_by_node=usage_by_node, + node_type_mapping=node_type_mapping, + idle_time_map=idle_time_map, + ) + + +class ClusterStatusParser: + @classmethod + def from_get_cluster_status_reply( + cls, proto: GetClusterStatusReply, stats: Stats + ) -> ClusterStatus: + # parse nodes info + active_nodes, idle_nodes, failed_nodes = cls._parse_nodes( + proto.cluster_resource_state + ) + + # parse pending nodes info + pending_nodes = cls._parse_pending(proto.autoscaling_state) + + # parse launch requests + pending_launches, failed_launches = cls._parse_launch_requests( + proto.autoscaling_state + ) + + # parse cluster resource usage + cluster_resource_usage = cls._parse_cluster_resource_usage( + proto.cluster_resource_state + ) + + # parse resource demands + resource_demands = cls._parse_resource_demands(proto.cluster_resource_state) + + # parse stats + stats = cls._parse_stats(proto, stats) + + return ClusterStatus( + active_nodes=active_nodes, + idle_nodes=idle_nodes, + pending_launches=pending_launches, + failed_launches=failed_launches, + pending_nodes=pending_nodes, + failed_nodes=failed_nodes, + cluster_resource_usage=cluster_resource_usage, + resource_demands=resource_demands, + stats=stats, + ) + + 
@classmethod + def _parse_stats(cls, reply: GetClusterStatusReply, stats: Stats) -> Stats: + """ + Parse the stats from the get cluster status reply. + Args: + reply: the get cluster status reply + stats: the stats + Returns: + stats: the parsed stats + """ + stats = deepcopy(stats) + + stats.gcs_request_time_s = stats.gcs_request_time_s + # TODO(rickyx): Populate other autoscaler stats once available. + stats.autoscaler_version = str(reply.autoscaling_state.autoscaler_state_version) + stats.cluster_resource_state_version = str( + reply.cluster_resource_state.cluster_resource_state_version + ) + + return stats + + @classmethod + def _parse_resource_demands( + cls, state: ClusterResourceState + ) -> List[ResourceDemand]: + """ + Parse the resource demands from the cluster resource state. + Args: + state: the cluster resource state + Returns: + resource_demands: the resource demands + """ + task_actor_demand = [] + pg_demand = [] + constraint_demand = [] + + for request_count in state.pending_resource_requests: + # TODO(rickyx): constraints? 
+ demand = RayTaskActorDemand( + bundles_by_count=[ + ResourceRequestByCount( + request_count.request.resources_bundle, request_count.count + ) + ], + ) + task_actor_demand.append(demand) + + for gang_request in state.pending_gang_resource_requests: + demand = PlacementGroupResourceDemand( + bundles_by_count=cls._aggregate_resource_requests_by_shape( + gang_request.requests + ), + details=gang_request.details, + ) + pg_demand.append(demand) + + for constraint_request in state.cluster_resource_constraints: + demand = ClusterConstraintDemand( + bundles_by_count=[ + ResourceRequestByCount( + bundle=dict(r.request.resources_bundle.items()), count=r.count + ) + for r in constraint_request.resource_requests + ] + ) + constraint_demand.append(demand) + + return ResourceDemandSummary( + ray_task_actor_demand=task_actor_demand, + placement_group_demand=pg_demand, + cluster_constraint_demand=constraint_demand, + ) + + @classmethod + def _aggregate_resource_requests_by_shape( + cls, + requests: List[ResourceRequest], + ) -> List[ResourceRequestByCount]: + """ + Aggregate resource requests by shape. + Args: + requests: the list of resource requests + Returns: + resource_requests_by_count: the aggregated resource requests by count + """ + + resource_requests_by_count = defaultdict(int) + for request in requests: + bundle = frozenset(request.resources_bundle.items()) + resource_requests_by_count[bundle] += 1 + + return [ + ResourceRequestByCount(dict(bundle), count) + for bundle, count in resource_requests_by_count.items() + ] + + @classmethod + def _parse_node_resource_usage( + cls, node_state: NodeState, usage: Dict[str, ResourceUsage] + ) -> Dict[str, ResourceUsage]: + """ + Parse the node resource usage from the node state. + Args: + node_state: the node state + usage: the usage dict to be updated. 
This is a dict of + {resource_name: ResourceUsage} + Returns: + usage: the updated usage dict + """ + # Tuple of {resource_name : (used, total)} + d = defaultdict(lambda: [0.0, 0.0]) + for resource_name, resource_total in node_state.total_resources.items(): + d[resource_name][1] += resource_total + # Will be subtracted from available later. + d[resource_name][0] += resource_total + + for ( + resource_name, + resource_available, + ) in node_state.available_resources.items(): + d[resource_name][0] -= resource_available + + # Merge with the passed in usage. + for k, (used, total) in d.items(): + usage[k].resource_name = k + usage[k].used += used + usage[k].total += total + + return usage + + @classmethod + def _parse_cluster_resource_usage( + cls, + state: ClusterResourceState, + ) -> List[ResourceUsage]: + """ + Parse the cluster resource usage from the cluster resource state. + Args: + state: the cluster resource state + Returns: + cluster_resource_usage: the cluster resource usage + """ + + cluster_resource_usage = defaultdict(ResourceUsage) + + for node_state in state.node_states: + if node_state.status != NodeStatus.DEAD: + cluster_resource_usage = cls._parse_node_resource_usage( + node_state, cluster_resource_usage + ) + + return list(cluster_resource_usage.values()) + + @classmethod + def _parse_nodes( + cls, + state: ClusterResourceState, + ) -> Tuple[List[NodeInfo], List[NodeInfo]]: + """ + Parse the node info from the cluster resource state. + Args: + state: the cluster resource state + Returns: + active_nodes: the list of non-idle nodes + idle_nodes: the list of idle nodes + dead_nodes: the list of dead nodes + """ + active_nodes = [] + dead_nodes = [] + idle_nodes = [] + for node_state in state.node_states: + # Basic node info. + node_id = binary_to_hex(node_state.node_id) + if len(node_state.ray_node_type_name) == 0: + # We don't have a node type name, but this is needed for showing + # healthy nodes. This happens when we don't use cluster launcher. 
+ # but start ray manually. We will use node id as node type name. + ray_node_type_name = f"node_{node_id}" + else: + ray_node_type_name = node_state.ray_node_type_name + + # Parse the resource usage if it's not dead + node_resource_usage = None + failure_detail = None + if node_state.status == NodeStatus.DEAD: + # TODO(rickyx): Technically we could get a more verbose + # failure detail from GCS, but existing ray status treats + # all ray failures as raylet death. + failure_detail = NODE_DEATH_CAUSE_RAYLET_DIED + else: + usage = defaultdict(ResourceUsage) + usage = cls._parse_node_resource_usage(node_state, usage) + node_resource_usage = NodeUsage( + usage=list(usage.values()), + idle_time_ms=node_state.idle_duration_ms + if node_state.status == NodeStatus.IDLE + else 0, + ) + + node_info = NodeInfo( + instance_type_name=node_state.instance_type_name, + node_status=NodeStatus.Name(node_state.status), + node_id=binary_to_hex(node_state.node_id), + ip_address=node_state.node_ip_address, + ray_node_type_name=ray_node_type_name, + instance_id=node_state.instance_id, + resource_usage=node_resource_usage, + failure_detail=failure_detail, + node_activity=node_state.node_activity, + ) + + if node_state.status == NodeStatus.DEAD: + dead_nodes.append(node_info) + elif node_state.status == NodeStatus.IDLE: + idle_nodes.append(node_info) + else: + active_nodes.append(node_info) + + return active_nodes, idle_nodes, dead_nodes + + @classmethod + def _parse_launch_requests( + cls, state: AutoscalingState + ) -> Tuple[List[LaunchRequest], List[LaunchRequest]]: + """ + Parse the launch requests from the autoscaling state. + Args: + state: the autoscaling state, empty if there's no autoscaling state + being reported. 
+ Returns: + pending_launches: the list of pending launches + failed_launches: the list of failed launches + """ + pending_launches = [] + for pending_request in state.pending_instance_requests: + launch = LaunchRequest( + instance_type_name=pending_request.instance_type_name, + ray_node_type_name=pending_request.ray_node_type_name, + count=pending_request.count, + state=LaunchRequest.Status.PENDING, + request_ts_s=pending_request.request_ts, + ) + + pending_launches.append(launch) + + failed_launches = [] + for failed_request in state.failed_instance_requests: + launch = LaunchRequest( + instance_type_name=failed_request.instance_type_name, + ray_node_type_name=failed_request.ray_node_type_name, + count=failed_request.count, + state=LaunchRequest.Status.FAILED, + request_ts_s=failed_request.start_ts, + details=failed_request.reason, + failed_ts_s=failed_request.failed_ts, + ) + + failed_launches.append(launch) + + return pending_launches, failed_launches + + @classmethod + def _parse_pending(cls, state: AutoscalingState) -> List[NodeInfo]: + """ + Parse the pending requests/nodes from the autoscaling state. + Args: + state: the autoscaling state, empty if there's no autoscaling state + being reported. + Returns: + pending_nodes: the list of pending nodes + """ + pending_nodes = [] + for pending_node in state.pending_instances: + pending_nodes.append( + NodeInfo( + instance_type_name=pending_node.instance_type_name, + ray_node_type_name=pending_node.ray_node_type_name, + details=pending_node.details, + instance_id=pending_node.instance_id, + ip_address=pending_node.ip_address, + ) + ) + + return pending_nodes + + +cached_is_autoscaler_v2 = None + + +def is_autoscaler_v2( + fetch_from_server: bool = False, gcs_client: Optional[GcsClient] = None +) -> bool: + """ + Check if the autoscaler is v2 from reading GCS internal KV. + + If the method is called multiple times, the result will be cached in the module. 
+ + Args: + fetch_from_server: If True, fetch the value from the GCS server, otherwise + use the cached value. + gcs_client: The GCS client to use. If not provided, the default GCS + client will be used. + + Returns: + is_v2: True if the autoscaler is v2, False otherwise. + + Raises: + Exception: if GCS address could not be resolved (e.g. ray.init() not called) + """ + # If env var is set to enable autoscaler v2, we should always return True. + if ray._config.enable_autoscaler_v2() and not fetch_from_server: + # TODO(rickyx): Once we migrate completely to v2, we should remove this. + # While this short-circuit may allow client-server inconsistency + # (e.g. client running v1, while server running v2), it's currently + # not possible with existing use-cases. + return True + + global cached_is_autoscaler_v2 + if cached_is_autoscaler_v2 is not None and not fetch_from_server: + return cached_is_autoscaler_v2 + + if gcs_client is None: + gcs_client = internal_kv_get_gcs_client() + + assert gcs_client, ( + "GCS client is not available. Please initialize the global GCS client " + "first by calling ray.init() or explicitly calls to _initialize_internal_kv()." + ) + + # See src/ray/common/constants.h for the definition of this key. + cached_is_autoscaler_v2 = ( + gcs_client.internal_kv_get( + ray._raylet.GCS_AUTOSCALER_V2_ENABLED_KEY.encode(), + namespace=ray._raylet.GCS_AUTOSCALER_STATE_NAMESPACE.encode(), + ) + == b"1" + ) + + return cached_is_autoscaler_v2 + + +def is_head_node(node_state: NodeState) -> bool: + """ + Check if the node is a head node from the node state. + + Args: + node_state: the node state + Returns: + is_head: True if the node is a head node, False otherwise. + """ + # TODO: we should include this bit of information in the future. + # NOTE: we could use labels in the future to determine if it's a head node. 
+ return "node:__internal_head__" in dict(node_state.total_resources) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a3183bce879a5d1e53211b706e7270fb1c1bbe6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..383bccadd5f95e089081e68a4c59af70380466ca --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/vsphere/defaults.yaml @@ -0,0 +1,166 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The maximum number of workers nodes to launch in addition to the head +# node. +max_workers: 2 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. +# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. 
+docker: + image: "rayproject/ray-ml:latest" + # image: rayproject/ray:latest # use this one if you don't need ML dependencies, it's faster to pull + container_name: "ray_container" + # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image + # if no cached version is present. + pull_before_run: True + run_options: # Extra options to pass into "docker run" + - --ulimit nofile=65536:65536 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: vsphere + +# Credentials configured here will take precedence over credentials set in the +# environment variables. + vsphere_config: +# credentials: +# user: vc_username +# password: vc_password +# server: vc_address + # The frozen VM related configurations. If "library_item" is unset, then either an existing frozen VM should be + # specified by "name" of a resource pool name of Frozen VMs on every ESXi host should be specified by + # "resource_pool". If "library_item" is set, then "name" must be set to indicate the name or the name prefix of + # the frozen VM, and "resource_pool" can be set to indicate that a set of frozen VMs should be created on each + # ESXi host. + frozen_vm: + # The name of the frozen VM, or the prefix for a set of frozen VMs. Can only be unset when + # "frozen_vm.resource_pool" is set and pointing to an existing resource pool of Frozen VMs. + name: frozen-vm + # The library item of the OVF template of the frozen VM. If set, the frozen VM or a set of frozen VMs will + # be deployed from an OVF template specified by library item. + library_item: + # The resource pool name of the frozen VMs, can point to an existing resource pool of frozen VMs. + # Otherwise, "frozen_vm.library_item" must be specified and a set of frozen VMs will be deployed + # on each ESXi host. 
The frozen VMs will be named as "{frozen_vm.name}-{the vm's ip address}" + resource_pool: + # The vSphere cluster name, only makes sense when "frozen_vm.library_item" is set and + # "frozen_vm.resource_pool" is unset. Indicates to deploy a single frozen VM on the vSphere cluster + # from OVF template. + cluster: + # The target vSphere datastore name for storing the vmdk of the frozen VM to be deployed from OVF template. + # Will take effect only when "frozen_vm.library_item" is set. If "frozen_vm.resource_pool" is also set, + # this datastore must be a shared datastore among the ESXi hosts. + datastore: + # The GPU related configurations + gpu_config: + # If using dynamic PCI passthrough to bind the physical GPU on an ESXi host to a Ray node VM. + # Dynamic PCI passthrough can support vSphere DRS, otherwise using regular PCI passthrough will not support + # vSphere DRS. + dynamic_pci_passthrough: False + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ray +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. +available_node_types: + ray.head.default: + # The node type's CPU and Memory resources are by default the same as the frozen VM. + # You can override the resources here. Adding GPU to the head node is not recommended. + # resources: { "CPU": 2, "Memory": 4096} + resources: {"CPU": 2} + node_config: + # The resource pool where the head node should live, if unset, will be + # the frozen VM's resource pool. + resource_pool: + # The datastore to store the vmdk of the head node vm, if unset, will be + # the frozen VM's datastore. 
+ datastore: + worker: + # The minimum number of nodes of this type to launch. + # This number should be >= 0. + min_workers: 0 + # The node type's CPU and Memory resources are by default the same as the frozen VM. + # You can override the resources here. For GPU, currently only Nvidia GPU is supported. If no ESXi host can + # fulfill the requirement, the Ray node creation will fail. The number of created nodes may not meet the desired + # minimum number. The vSphere node provider will not distinguish the GPU type. It will just count the quantity: + # mount the first k random available Nvidia GPU to the VM, if the user set {"GPU": k}. + # resources: {"CPU": 2, "Memory": 4096, "GPU": 1} + resources: {"CPU": 2} + node_config: + # The resource pool where the worker node should live, if unset, will be + # the frozen VM's resource pool. + resource_pool: + # The datastore to store the vmdk(s) of the worker node vm(s), if unset, will be + # the frozen VM's datastore. + datastore: + +# Specify the node type of the head node (as configured above). +head_node_type: ray.head.default + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! 
+cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: [] + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install 'git+https://github.com/vmware/vsphere-automation-sdk-python.git' + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/batcher.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/batcher.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61fcfa0524a2c94ba4f5b6b30a071ebd1e0945ef Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/batcher.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/block_list.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/block_list.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..632fa1bb7bd8978250ed3c12aafba905d51c886a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/block_list.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/compute.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/compute.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..384844fbc4e0614ec86b05d3fef4e6bd95d59fd5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/compute.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/size_estimator.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/size_estimator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56e2890d414744f633ca3cd05573dff22aeb5fb8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/size_estimator.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/torch_iterable_dataset.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/torch_iterable_dataset.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa08cd47b8b71b4816e02bf38cec4343e2e4a0c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/__pycache__/torch_iterable_dataset.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/count_operator.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/count_operator.py new file mode 100644 index 0000000000000000000000000000000000000000..409c99e3c000698622be3f6294d419d1c59ff2d8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/count_operator.py @@ -0,0 +1,20 @@ +from typing import List + +from ray.data._internal.logical.interfaces import LogicalOperator + + +class Count(LogicalOperator): + """Logical operator that represents counting the number of rows in inputs. + + Physical operators that implement this logical operator should produce one or more + rows with a single column named `Count.COLUMN_NAME`. When you sum the values in + this column, you should get the total number of rows in the dataset. 
+ """ + + COLUMN_NAME = "__num_rows" + + def __init__( + self, + input_dependencies: List["LogicalOperator"], + ): + super().__init__("Count", input_dependencies) diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/from_operators.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/from_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..afe5e8200bb14040e712e4b2faef12e79516f8df --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/operators/from_operators.py @@ -0,0 +1,105 @@ +import abc +import functools +from typing import TYPE_CHECKING, List, Optional, Union + +from ray.data._internal.execution.interfaces import RefBundle +from ray.data._internal.logical.interfaces import LogicalOperator +from ray.data._internal.util import unify_block_metadata_schema +from ray.data.block import Block, BlockMetadata +from ray.types import ObjectRef + +if TYPE_CHECKING: + import pyarrow as pa + + ArrowTable = Union["pa.Table", bytes] + + +class AbstractFrom(LogicalOperator, metaclass=abc.ABCMeta): + """Abstract logical operator for `from_*`.""" + + def __init__( + self, + input_blocks: List[ObjectRef[Block]], + input_metadata: List[BlockMetadata], + ): + super().__init__(self.__class__.__name__, [], len(input_blocks)) + assert len(input_blocks) == len(input_metadata), ( + len(input_blocks), + len(input_metadata), + ) + # `owns_blocks` is False because this op may be shared by multiple Datasets. 
+ self._input_data = [ + RefBundle([(input_blocks[i], input_metadata[i])], owns_blocks=False) + for i in range(len(input_blocks)) + ] + + @property + def input_data(self) -> List[RefBundle]: + return self._input_data + + def output_data(self) -> Optional[List[RefBundle]]: + return self._input_data + + def aggregate_output_metadata(self) -> BlockMetadata: + return self._cached_output_metadata + + @functools.cached_property + def _cached_output_metadata(self) -> BlockMetadata: + return BlockMetadata( + num_rows=self._num_rows(), + size_bytes=self._size_bytes(), + schema=self._schema(), + input_files=None, + exec_stats=None, + ) + + def _num_rows(self): + if all(bundle.num_rows() is not None for bundle in self._input_data): + return sum(bundle.num_rows() for bundle in self._input_data) + else: + return None + + def _size_bytes(self): + metadata = [m for bundle in self._input_data for m in bundle.metadata] + if all(m.size_bytes is not None for m in metadata): + return sum(m.size_bytes for m in metadata) + else: + return None + + def _schema(self): + metadata = [m for bundle in self._input_data for m in bundle.metadata] + return unify_block_metadata_schema(metadata) + + def is_lineage_serializable(self) -> bool: + # This operator isn't serializable because it contains ObjectRefs. 
+ return False + + +class FromItems(AbstractFrom): + """Logical operator for `from_items`.""" + + pass + + +class FromBlocks(AbstractFrom): + """Logical operator for `from_blocks`.""" + + pass + + +class FromNumpy(AbstractFrom): + """Logical operator for `from_numpy`.""" + + pass + + +class FromArrow(AbstractFrom): + """Logical operator for `from_arrow`.""" + + pass + + +class FromPandas(AbstractFrom): + """Logical operator for `from_pandas`.""" + + pass diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__init__.py b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..92a769261403be41df443ab0a740505ecc934ffd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__init__.py @@ -0,0 +1,4 @@ +from ray.data._internal.logical.rules.operator_fusion import OperatorFusionRule +from ray.data._internal.logical.rules.randomize_blocks import ReorderRandomizeBlocksRule + +__all__ = ["ReorderRandomizeBlocksRule", "OperatorFusionRule"] diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24fbb39ce203afc8f4283b9ffe2b4534918ad65f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_batch_format.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_batch_format.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe3f5b6b0a782ed62a1d373f36129d075ec043e4 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_batch_format.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_target_max_block_size.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_target_max_block_size.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d73d218f0b9629234e590668c847bc4999062eb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/inherit_target_max_block_size.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/limit_pushdown.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/limit_pushdown.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5655bec7b049bda73cac5906833d65da8ff80f40 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/limit_pushdown.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/randomize_blocks.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/randomize_blocks.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1adfad53f41de51343d97a54f0f6ff2ae0e765d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/randomize_blocks.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/set_read_parallelism.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/set_read_parallelism.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..d35e70eb9e45ddd3c2e82fc814f6b6720576c93b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/set_read_parallelism.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/zero_copy_map_fusion.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/zero_copy_map_fusion.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7a8fcaedd294cbd66edc20a753a785b51b7456b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/logical/rules/__pycache__/zero_copy_map_fusion.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c48f87b608fc9ec6fc3eecb5fd3ff9a3134bd749 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/aggregate.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/aggregate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4b1644840cca76fdf3b0ebe5429504986b13f3b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/aggregate.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_from_pandas_op.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_from_pandas_op.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..dec4f37f8e20ad22a91e443ca86b05a9520709b8 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_from_pandas_op.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_read_op.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_read_op.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4d8c61b5f1051d4f37cff8b0df54167be776bd4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_read_op.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_write_op.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_write_op.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a9b5a794d4b5df6a304765f9a7fa999997e4ea0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/data/_internal/planner/__pycache__/plan_write_op.cpython-311.pyc differ