Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py +116 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py +324 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py +461 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py +536 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py +119 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py +111 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py +80 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml +15 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh +23 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml +144 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml +152 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py +29 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py +343 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py +201 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py +157 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py +472 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py +541 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py +270 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py +151 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py +522 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py +99 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py +1565 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py +180 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py +100 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py +302 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py +1642 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py +351 -0
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (203 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (6.06 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (18.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (24.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import stat
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler._private.aliyun.utils import AcsClient
|
| 6 |
+
|
| 7 |
+
# instance status
|
| 8 |
+
PENDING = "Pending"
|
| 9 |
+
RUNNING = "Running"
|
| 10 |
+
STARTING = "Starting"
|
| 11 |
+
STOPPING = "Stopping"
|
| 12 |
+
STOPPED = "Stopped"
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def bootstrap_aliyun(config):
    """Prepare Aliyun resources required by the cluster and return *config*.

    Ensures (reusing when present, creating otherwise) the VPC, security
    group, vswitch and SSH key pair referenced by ``config["provider"]``,
    mutating the config dict in place.
    """
    for ensure_resource in (
        _get_or_create_vpc,
        _get_or_create_security_group,
        _get_or_create_vswitch,
        _get_or_import_key_pair,
    ):
        ensure_resource(config)
    return config
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _client(config):
    """Build an :class:`AcsClient` from the provider section of *config*."""
    provider = config["provider"]
    return AcsClient(
        access_key=provider.get("access_key"),
        access_key_secret=provider.get("access_key_secret"),
        region_id=provider["region"],
        max_retries=1,
    )
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _get_or_create_security_group(config):
    """Ensure ``config["provider"]["security_group_id"]`` is set.

    Reuses the first security group found in the provider's VPC; otherwise
    creates one and applies the ingress rules listed under
    ``provider.security_group_rule``. Mutates *config* in place.

    Returns:
        The (mutated) config dict.
    """
    cli = _client(config)
    vpc_id = config["provider"]["vpc_id"]

    security_groups = cli.describe_security_groups(vpc_id=vpc_id)
    if security_groups is not None and len(security_groups) > 0:
        config["provider"]["security_group_id"] = security_groups[0]["SecurityGroupId"]
        return config

    security_group_id = cli.create_security_group(vpc_id=vpc_id)

    # NOTE: rules are expected to be a list of dicts; default to an empty
    # list (the original defaulted to ``{}``, which iterates the same way
    # when empty but is the wrong type for the non-empty case).
    for rule in config["provider"].get("security_group_rule", []):
        cli.authorize_security_group(
            security_group_id=security_group_id,
            port_range=rule["port_range"],
            source_cidr_ip=rule["source_cidr_ip"],
            ip_protocol=rule["ip_protocol"],
        )
    config["provider"]["security_group_id"] = security_group_id
    # Fix: previously only the reuse path returned the config while the
    # create path returned None; make both paths consistent.
    return config
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _get_or_create_vpc(config):
    """Set ``config["provider"]["vpc_id"]``, reusing an existing VPC if any.

    Mutates *config* in place; creates a new VPC only when none exists.
    """
    cli = _client(config)
    existing_vpcs = cli.describe_vpcs()
    if existing_vpcs:
        config["provider"]["vpc_id"] = existing_vpcs[0].get("VpcId")
    else:
        new_vpc_id = cli.create_vpc()
        if new_vpc_id is not None:
            config["provider"]["vpc_id"] = new_vpc_id
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _get_or_create_vswitch(config):
    """Set ``config["provider"]["v_switch_id"]``, reusing an existing vswitch.

    Mutates *config* in place; when no vswitch exists in the provider's VPC,
    creates one in the configured zone and CIDR block.
    """
    cli = _client(config)
    provider = config["provider"]
    existing_vswitches = cli.describe_v_switches(vpc_id=provider["vpc_id"])
    if existing_vswitches:
        provider["v_switch_id"] = existing_vswitches[0].get("VSwitchId")
        return

    new_vswitch_id = cli.create_v_switch(
        vpc_id=provider["vpc_id"],
        zone_id=provider["zone_id"],
        cidr_block=provider["cidr_block"],
    )
    if new_vswitch_id is not None:
        provider["v_switch_id"] = new_vswitch_id
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _get_or_import_key_pair(config):
    """Ensure an SSH key pair named ``provider.key_name`` exists on Aliyun.

    Three cases:
    - The key pair already exists remotely: point ``auth.ssh_private_key``
      at the expected local file (``~/.ssh/<key_name>``) if unset.
    - No remote key pair and no local key configured: create a new pair
      remotely and save the returned private key locally.
    - No remote key pair but a local key is configured: import the matching
      ``.pub`` file as the remote key pair.

    Mutates *config* in place.
    """
    cli = _client(config)
    key_name = config["provider"].get("key_name", "ray")
    key_path = os.path.expanduser("~/.ssh/{}".format(key_name))
    keypairs = cli.describe_key_pairs(key_pair_name=key_name)

    if keypairs is not None and len(keypairs) > 0:
        if "ssh_private_key" not in config["auth"]:
            logger.info(
                "{} keypair exists, use {} as local ssh key".format(key_name, key_path)
            )
            config["auth"]["ssh_private_key"] = key_path
    else:
        if "ssh_private_key" not in config["auth"]:
            # create new keypair
            resp = cli.create_key_pair(key_pair_name=key_name)
            if resp is not None:
                # Fix: create the key file with owner-only permissions from
                # the start instead of chmod-ing after writing, so the
                # private key is never briefly readable by other users.
                fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
                with os.fdopen(fd, "w") as f:
                    f.write(resp.get("PrivateKeyBody"))
                os.chmod(key_path, stat.S_IRUSR)
                config["auth"]["ssh_private_key"] = key_path
        else:
            public_key_file = config["auth"]["ssh_private_key"] + ".pub"
            # create new keypair, from local file
            with open(public_key_file) as f:
                public_key = f.readline().strip("\n")
                cli.import_key_pair(key_pair_name=key_name, public_key_body=public_key)
            return
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import random
|
| 3 |
+
import threading
|
| 4 |
+
import time
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from ray.autoscaler._private.aliyun.config import (
|
| 9 |
+
PENDING,
|
| 10 |
+
RUNNING,
|
| 11 |
+
STOPPED,
|
| 12 |
+
STOPPING,
|
| 13 |
+
bootstrap_aliyun,
|
| 14 |
+
)
|
| 15 |
+
from ray.autoscaler._private.aliyun.utils import AcsClient
|
| 16 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 17 |
+
from ray.autoscaler._private.constants import BOTO_MAX_RETRIES
|
| 18 |
+
from ray.autoscaler._private.log_timer import LogTimer
|
| 19 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 20 |
+
from ray.autoscaler.tags import (
|
| 21 |
+
TAG_RAY_CLUSTER_NAME,
|
| 22 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 23 |
+
TAG_RAY_NODE_KIND,
|
| 24 |
+
TAG_RAY_NODE_NAME,
|
| 25 |
+
TAG_RAY_NODE_STATUS,
|
| 26 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
TAG_BATCH_DELAY = 1
|
| 32 |
+
STOPPING_NODE_DELAY = 1
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class AliyunNodeProvider(NodeProvider):
    """Node provider that manages ECS instances on Alibaba Cloud (Aliyun).

    Tag updates are batched: the first thread to queue a tag waits
    TAG_BATCH_DELAY seconds, then uploads all pending tags in one pass.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        self.acs = AcsClient(
            access_key=provider_config["access_key"],
            access_key_secret=provider_config["access_key_secret"],
            region_id=provider_config["region"],
            max_retries=BOTO_MAX_RETRIES,
        )

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Tags that we believe to actually be on the node.
        self.tag_cache = {}
        # Tags that we will soon upload.
        self.tag_cache_pending = defaultdict(dict)
        # Number of threads waiting for a batched tag update.
        self.batch_thread_count = 0
        self.batch_update_done = threading.Event()
        self.batch_update_done.set()
        self.ready_for_new_batch = threading.Event()
        self.ready_for_new_batch.set()
        self.tag_cache_lock = threading.Lock()
        self.count_lock = threading.Lock()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
        """Return IDs of cluster instances in the Running/Pending states.

        Side effect: refreshes ``self.cached_nodes`` with the matching
        instance records.
        """
        tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
        ]
        for k, v in tag_filters.items():
            tags.append(
                {
                    "Key": k,
                    "Value": v,
                }
            )

        instances = self.acs.describe_instances(tags=tags)
        # Fix: describe_instances returns None on a failed request; treat
        # that as "no instances" instead of raising TypeError on iteration.
        if instances is None:
            return []
        non_terminated_instance = []
        for instance in instances:
            if instance.get("Status") == RUNNING or instance.get("Status") == PENDING:
                non_terminated_instance.append(instance.get("InstanceId"))
                self.cached_nodes[instance.get("InstanceId")] = instance
        return non_terminated_instance

    def is_running(self, node_id: str) -> bool:
        """Return True if the instance is currently in the Running state."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            instance = instances[0]
            return instance.get("Status") == "Running"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def is_terminated(self, node_id: str) -> bool:
        """Return True if the instance is Stopped (terminated from Ray's view)."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            return instance.get("Status") == "Stopped"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def node_tags(self, node_id: str) -> Dict[str, str]:
        """Return the instance's tags as a plain ``{key: value}`` dict."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            if instance.get("Tags") is not None:
                node_tags = dict()
                for tag in instance.get("Tags").get("Tag"):
                    node_tags[tag.get("TagKey")] = tag.get("TagValue")
                return node_tags
        return dict()

    def external_ip(self, node_id: str) -> str:
        """Return the instance's public IP, polling until one is assigned.

        NOTE(review): loops forever if the instance never gets a public IP.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances)
                instance = instances[0]
                if (
                    instance.get("PublicIpAddress") is not None
                    and instance.get("PublicIpAddress").get("IpAddress") is not None
                ):
                    if len(instance.get("PublicIpAddress").get("IpAddress")) > 0:
                        return instance.get("PublicIpAddress").get("IpAddress")[0]
            cli_logger.error("PublicIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def internal_ip(self, node_id: str) -> str:
        """Return the instance's VPC-private IP, polling until one is assigned.

        NOTE(review): loops forever if the instance never gets a private IP.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances) == 1
                instance = instances[0]
                if (
                    instance.get("VpcAttributes") is not None
                    and instance.get("VpcAttributes").get("PrivateIpAddress")
                    is not None
                    and len(
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")
                    )
                    > 0
                ):
                    return (
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")[0]
                    )
            cli_logger.error("InnerIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
        """Queue tags for *node_id*; flushed in one batched upload.

        The thread that opens a batch sleeps TAG_BATCH_DELAY, uploads all
        pending tags, then releases the other waiting threads.
        """
        is_batching_thread = False
        with self.tag_cache_lock:
            if not self.tag_cache_pending:
                is_batching_thread = True
                # Wait for threads in the last batch to exit
                self.ready_for_new_batch.wait()
                self.ready_for_new_batch.clear()
                self.batch_update_done.clear()
            self.tag_cache_pending[node_id].update(tags)

        if is_batching_thread:
            time.sleep(TAG_BATCH_DELAY)
            with self.tag_cache_lock:
                self._update_node_tags()
                self.batch_update_done.set()

        with self.count_lock:
            self.batch_thread_count += 1
        self.batch_update_done.wait()

        with self.count_lock:
            self.batch_thread_count -= 1
            if self.batch_thread_count == 0:
                self.ready_for_new_batch.set()

    def _update_node_tags(self):
        """Move pending tags into the confirmed cache and upload them."""
        batch_updates = defaultdict(list)

        for node_id, tags in self.tag_cache_pending.items():
            for x in tags.items():
                batch_updates[x].append(node_id)
            self.tag_cache[node_id] = tags

        self.tag_cache_pending = defaultdict(dict)

        self._create_tags(batch_updates)

    def _create_tags(self, batch_updates):
        """Upload each (key, value) -> node_ids group as one tag request."""
        for (k, v), node_ids in batch_updates.items():
            m = "Set tag {}={} on {}".format(k, v, node_ids)
            with LogTimer("AliyunNodeProvider: {}".format(m)):
                if k == TAG_RAY_NODE_NAME:
                    k = "Name"

                self.acs.tag_resource(node_ids, [{"Key": k, "Value": v}])

    def create_node(
        self, node_config: Dict[str, Any], tags: Dict[str, str], count: int
    ) -> Optional[Dict[str, Any]]:
        """Create *count* nodes, restarting cached stopped instances first.

        Returns a dict mapping instance id -> instance record for every
        node that was reused or newly created.
        """
        filter_tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
            {"Key": TAG_RAY_NODE_KIND, "Value": tags[TAG_RAY_NODE_KIND]},
            {"Key": TAG_RAY_USER_NODE_TYPE, "Value": tags[TAG_RAY_USER_NODE_TYPE]},
            {"Key": TAG_RAY_LAUNCH_CONFIG, "Value": tags[TAG_RAY_LAUNCH_CONFIG]},
            {"Key": TAG_RAY_NODE_NAME, "Value": tags[TAG_RAY_NODE_NAME]},
        ]

        reused_nodes_dict = {}
        if self.cache_stopped_nodes:
            reuse_nodes_candidate = self.acs.describe_instances(tags=filter_tags)
            if reuse_nodes_candidate:
                with cli_logger.group("Stopping instances to reuse"):
                    reuse_node_ids = []
                    for node in reuse_nodes_candidate:
                        node_id = node.get("InstanceId")
                        status = node.get("Status")
                        if status != STOPPING and status != STOPPED:
                            continue
                        if status == STOPPING:
                            # wait for node stopped
                            while (
                                self.acs.describe_instances(instance_ids=[node_id])[
                                    0
                                ].get("Status")
                                == STOPPING
                            ):
                                logging.info("wait for %s stop" % node_id)
                                time.sleep(STOPPING_NODE_DELAY)
                        reuse_node_ids.append(node_id)
                        reused_nodes_dict[node.get("InstanceId")] = node
                        self.acs.start_instance(node_id)
                        self.tag_cache[node_id] = node.get("Tags")
                        self.set_node_tags(node_id, tags)
                        if len(reuse_node_ids) == count:
                            break
                count -= len(reuse_node_ids)

        created_nodes_dict = {}
        if count > 0:
            filter_tags.append(
                {"Key": TAG_RAY_NODE_STATUS, "Value": tags[TAG_RAY_NODE_STATUS]}
            )
            instance_id_sets = self.acs.run_instances(
                instance_type=node_config["InstanceType"],
                image_id=node_config["ImageId"],
                tags=filter_tags,
                amount=count,
                vswitch_id=self.provider_config["v_switch_id"],
                security_group_id=self.provider_config["security_group_id"],
                key_pair_name=self.provider_config["key_name"],
            )
            instances = self.acs.describe_instances(instance_ids=instance_id_sets)

            if instances is not None:
                for instance in instances:
                    created_nodes_dict[instance.get("InstanceId")] = instance

        all_created_nodes = reused_nodes_dict
        all_created_nodes.update(created_nodes_dict)
        return all_created_nodes

    def terminate_node(self, node_id: str) -> None:
        """Stop (when caching is enabled) or delete a single instance."""
        logger.info("terminate node: %s" % node_id)
        if self.cache_stopped_nodes:
            # Fix: the message was previously built as
            # ``logger.info("...").format(node_id)``, which raised
            # AttributeError because logger.info returns None.
            logger.info(
                "Stopping instance {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_id)
            )
            self.acs.stop_instance(node_id)
        else:
            self.acs.delete_instance(node_id)

    def terminate_nodes(self, node_ids: List[str]) -> None:
        """Stop (when caching is enabled) or delete a batch of instances."""
        if not node_ids:
            return
        if self.cache_stopped_nodes:
            logger.info(
                "Stopping instances {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_ids)
            )

            self.acs.stop_instances(node_ids)
        else:
            self.acs.delete_instances(node_ids)

    def _get_node(self, node_id):
        """Refresh and get info for this node, updating the cache."""
        self.non_terminated_nodes({})  # Side effect: updates cache

        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        # Node not in {pending, running} -- retry with a point query. This
        # usually means the node was recently preempted or terminated.
        matches = self.acs.describe_instances(instance_ids=[node_id])

        assert len(matches) == 1, "Invalid instance id {}".format(node_id)
        return matches[0]

    def _get_cached_node(self, node_id):
        """Return node info from cache if possible, otherwise fetches it."""
        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        return self._get_node(node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Fill in Aliyun networking/auth resources before cluster launch."""
        return bootstrap_aliyun(cluster_config)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
from aliyunsdkcore import client
|
| 5 |
+
from aliyunsdkcore.acs_exception.exceptions import ClientException, ServerException
|
| 6 |
+
from aliyunsdkecs.request.v20140526.AllocatePublicIpAddressRequest import (
|
| 7 |
+
AllocatePublicIpAddressRequest,
|
| 8 |
+
)
|
| 9 |
+
from aliyunsdkecs.request.v20140526.AuthorizeSecurityGroupRequest import (
|
| 10 |
+
AuthorizeSecurityGroupRequest,
|
| 11 |
+
)
|
| 12 |
+
from aliyunsdkecs.request.v20140526.CreateInstanceRequest import CreateInstanceRequest
|
| 13 |
+
from aliyunsdkecs.request.v20140526.CreateKeyPairRequest import CreateKeyPairRequest
|
| 14 |
+
from aliyunsdkecs.request.v20140526.CreateSecurityGroupRequest import (
|
| 15 |
+
CreateSecurityGroupRequest,
|
| 16 |
+
)
|
| 17 |
+
from aliyunsdkecs.request.v20140526.CreateVpcRequest import CreateVpcRequest
|
| 18 |
+
from aliyunsdkecs.request.v20140526.CreateVSwitchRequest import CreateVSwitchRequest
|
| 19 |
+
from aliyunsdkecs.request.v20140526.DeleteInstanceRequest import DeleteInstanceRequest
|
| 20 |
+
from aliyunsdkecs.request.v20140526.DeleteInstancesRequest import DeleteInstancesRequest
|
| 21 |
+
from aliyunsdkecs.request.v20140526.DeleteKeyPairsRequest import DeleteKeyPairsRequest
|
| 22 |
+
from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import (
|
| 23 |
+
DescribeInstancesRequest,
|
| 24 |
+
)
|
| 25 |
+
from aliyunsdkecs.request.v20140526.DescribeKeyPairsRequest import (
|
| 26 |
+
DescribeKeyPairsRequest,
|
| 27 |
+
)
|
| 28 |
+
from aliyunsdkecs.request.v20140526.DescribeSecurityGroupsRequest import (
|
| 29 |
+
DescribeSecurityGroupsRequest,
|
| 30 |
+
)
|
| 31 |
+
from aliyunsdkecs.request.v20140526.DescribeVpcsRequest import DescribeVpcsRequest
|
| 32 |
+
from aliyunsdkecs.request.v20140526.DescribeVSwitchesRequest import (
|
| 33 |
+
DescribeVSwitchesRequest,
|
| 34 |
+
)
|
| 35 |
+
from aliyunsdkecs.request.v20140526.ImportKeyPairRequest import ImportKeyPairRequest
|
| 36 |
+
from aliyunsdkecs.request.v20140526.RunInstancesRequest import RunInstancesRequest
|
| 37 |
+
from aliyunsdkecs.request.v20140526.StartInstanceRequest import StartInstanceRequest
|
| 38 |
+
from aliyunsdkecs.request.v20140526.StopInstanceRequest import StopInstanceRequest
|
| 39 |
+
from aliyunsdkecs.request.v20140526.StopInstancesRequest import StopInstancesRequest
|
| 40 |
+
from aliyunsdkecs.request.v20140526.TagResourcesRequest import TagResourcesRequest
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class AcsClient:
    """
    A wrapper around Aliyun SDK. We use this wrapper in aliyun node provider.

    Parameters:
        access_key: The AccessKey ID of your aliyun account.
        access_key_secret: The AccessKey secret of your aliyun account.
        region_id: A region is a geographic area where a data center resides.
                   Region_id is the ID of region (e.g., cn-hangzhou,
                   us-west-1, etc.)
        max_retries: The maximum number of retries each connection.
    """

    def __init__(self, access_key, access_key_secret, region_id, max_retries):
        # Underlying Aliyun SDK client; every public method below builds a
        # request object and funnels it through self._send_request().
        self.cli = client.AcsClient(
            ak=access_key,
            secret=access_key_secret,
            max_retry_time=max_retries,
            region_id=region_id,
        )

    def describe_instances(self, tags=None, instance_ids=None):
        """Query the details of one or more Elastic Compute Service (ECS) instances.

        :param tags: The tags of the instance.
        :param instance_ids: The IDs of ECS instances
        :return: ECS instance list
        """
        request = DescribeInstancesRequest()
        if tags is not None:
            request.set_Tags(tags)
        if instance_ids is not None:
            request.set_InstanceIds(instance_ids)
        response = self._send_request(request)
        # _send_request returns None on any SDK error; propagate that to callers.
        if response is not None:
            instance_list = response.get("Instances").get("Instance")
            return instance_list
        return None

    def create_instance(
        self,
        instance_type,
        image_id,
        tags,
        key_pair_name,
        optimized="optimized",
        instance_charge_type="PostPaid",
        spot_strategy="SpotWithPriceLimit",
        internet_charge_type="PayByTraffic",
        internet_max_bandwidth_out=5,
    ):
        """Create a subscription or pay-as-you-go ECS instance.

        :param instance_type: The instance type of the ECS.
        :param image_id: The ID of the image used to create the instance.
        :param tags: The tags of the instance.
        :param key_pair_name: The name of the key pair to be bound to
                              the instance.
        :param optimized: Specifies whether the instance is I/O optimized
        :param instance_charge_type: The billing method of the instance.
                                     Default value: PostPaid.
        :param spot_strategy: The preemption policy for the pay-as-you-go
                              instance.
        :param internet_charge_type: The billing method for network usage.
                                     Default value: PayByTraffic.
        :param internet_max_bandwidth_out: The maximum inbound public
                                           bandwidth. Unit: Mbit/s.
        :return: The created instance ID.
        """
        request = CreateInstanceRequest()
        request.set_InstanceType(instance_type)
        request.set_ImageId(image_id)
        request.set_IoOptimized(optimized)
        request.set_InstanceChargeType(instance_charge_type)
        request.set_SpotStrategy(spot_strategy)
        request.set_InternetChargeType(internet_charge_type)
        request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
        request.set_KeyPairName(key_pair_name)
        request.set_Tags(tags)

        response = self._send_request(request)
        if response is not None:
            instance_id = response.get("InstanceId")
            # Creation is asynchronous on Aliyun's side: a returned ID means
            # the creation task was accepted, not that the instance is ready.
            logging.info("instance %s created task submit successfully.", instance_id)
            return instance_id
        logging.error("instance created failed.")
        return None

    def run_instances(
        self,
        instance_type,
        image_id,
        tags,
        security_group_id,
        vswitch_id,
        key_pair_name,
        amount=1,
        optimized="optimized",
        instance_charge_type="PostPaid",
        spot_strategy="SpotWithPriceLimit",
        internet_charge_type="PayByTraffic",
        internet_max_bandwidth_out=1,
    ):
        """Create one or more pay-as-you-go or subscription
        Elastic Compute Service (ECS) instances

        :param instance_type: The instance type of the ECS.
        :param image_id: The ID of the image used to create the instance.
        :param tags: The tags of the instance.
        :param security_group_id: The ID of the security group to which to
                                  assign the instance. Instances in the same
                                  security group can communicate with
                                  each other.
        :param vswitch_id: The ID of the vSwitch to which to connect
                           the instance.
        :param key_pair_name: The name of the key pair to be bound to
                              the instance.
        :param amount: The number of instances that you want to create.
        :param optimized: Specifies whether the instance is I/O optimized
        :param instance_charge_type: The billing method of the instance.
                                     Default value: PostPaid.
        :param spot_strategy: The preemption policy for the pay-as-you-go
                              instance.
        :param internet_charge_type: The billing method for network usage.
                                     Default value: PayByTraffic.
        :param internet_max_bandwidth_out: The maximum inbound public
                                           bandwidth. Unit: Mbit/s.
        :return: The created instance IDs.
        """
        request = RunInstancesRequest()
        request.set_InstanceType(instance_type)
        request.set_ImageId(image_id)
        request.set_IoOptimized(optimized)
        request.set_InstanceChargeType(instance_charge_type)
        request.set_SpotStrategy(spot_strategy)
        request.set_InternetChargeType(internet_charge_type)
        request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
        request.set_Tags(tags)
        request.set_Amount(amount)
        request.set_SecurityGroupId(security_group_id)
        request.set_VSwitchId(vswitch_id)
        request.set_KeyPairName(key_pair_name)

        response = self._send_request(request)
        if response is not None:
            instance_ids = response.get("InstanceIdSets").get("InstanceIdSet")
            return instance_ids
        logging.error("instance created failed.")
        return None

    def create_security_group(self, vpc_id):
        """Create a security group

        :param vpc_id: The ID of the VPC in which to create
                       the security group.
        :return: The created security group ID.
        """
        request = CreateSecurityGroupRequest()
        request.set_VpcId(vpc_id)
        response = self._send_request(request)
        if response is not None:
            security_group_id = response.get("SecurityGroupId")
            return security_group_id
        return None

    def describe_security_groups(self, vpc_id=None, tags=None):
        """Query basic information of security groups.

        :param vpc_id: The ID of the VPC to which the security group belongs.
        :param tags: The tags of the security group.
        :return: Security group list.
        """
        request = DescribeSecurityGroupsRequest()
        if vpc_id is not None:
            request.set_VpcId(vpc_id)
        if tags is not None:
            request.set_Tags(tags)
        response = self._send_request(request)
        if response is not None:
            security_groups = response.get("SecurityGroups").get("SecurityGroup")
            return security_groups
        logging.error("describe security group failed.")
        return None

    def authorize_security_group(
        self, ip_protocol, port_range, security_group_id, source_cidr_ip
    ):
        """Create an inbound security group rule.

        :param ip_protocol: The transport layer protocol.
        :param port_range: The range of destination ports relevant to
                           the transport layer protocol.
        :param security_group_id: The ID of the destination security group.
        :param source_cidr_ip: The range of source IPv4 addresses.
                               CIDR blocks and IPv4 addresses are supported.
        """
        request = AuthorizeSecurityGroupRequest()
        request.set_IpProtocol(ip_protocol)
        request.set_PortRange(port_range)
        request.set_SecurityGroupId(security_group_id)
        request.set_SourceCidrIp(source_cidr_ip)
        # Fire-and-forget: failures are logged inside _send_request.
        self._send_request(request)

    def create_v_switch(self, vpc_id, zone_id, cidr_block):
        """Create vSwitches to divide the VPC into one or more subnets

        :param vpc_id: The ID of the VPC to which the VSwitch belongs.
        :param zone_id: The ID of the zone to which
                        the target VSwitch belongs.
        :param cidr_block: The CIDR block of the VSwitch.
        :return:
        """
        request = CreateVSwitchRequest()
        request.set_ZoneId(zone_id)
        request.set_VpcId(vpc_id)
        request.set_CidrBlock(cidr_block)
        response = self._send_request(request)
        if response is not None:
            return response.get("VSwitchId")
        else:
            logging.error("create_v_switch vpc_id %s failed.", vpc_id)
        return None

    def create_vpc(self):
        """Creates a virtual private cloud (VPC).

        :return: The created VPC ID.
        """
        request = CreateVpcRequest()
        response = self._send_request(request)
        if response is not None:
            return response.get("VpcId")
        return None

    def describe_vpcs(self):
        """Queries one or more VPCs in a region.

        :return: VPC list.
        """
        request = DescribeVpcsRequest()
        response = self._send_request(request)
        if response is not None:
            return response.get("Vpcs").get("Vpc")
        return None

    def tag_resource(self, resource_ids, tags, resource_type="instance"):
        """Create and bind tags to specified ECS resources.

        :param resource_ids: The IDs of N resources.
        :param tags: The tags of the resource.
        :param resource_type: The type of the resource.
        """
        request = TagResourcesRequest()
        request.set_Tags(tags)
        request.set_ResourceType(resource_type)
        request.set_ResourceIds(resource_ids)
        response = self._send_request(request)
        if response is not None:
            logging.info("instance %s create tag successfully.", resource_ids)
        else:
            logging.error("instance %s create tag failed.", resource_ids)

    def start_instance(self, instance_id):
        """Start an ECS instance.

        :param instance_id: The Ecs instance ID.
        """
        request = StartInstanceRequest()
        request.set_InstanceId(instance_id)
        response = self._send_request(request)

        if response is not None:
            logging.info("instance %s start successfully.", instance_id)
        else:
            logging.error("instance %s start failed.", instance_id)

    def stop_instance(self, instance_id, force_stop=False):
        """Stop an ECS instance that is in the Running state.

        :param instance_id: The Ecs instance ID.
        :param force_stop: Specifies whether to forcibly stop the instance.
        :return:
        """
        request = StopInstanceRequest()
        request.set_InstanceId(instance_id)
        request.set_ForceStop(force_stop)
        logging.info("Stop %s command submit successfully.", instance_id)
        self._send_request(request)

    def stop_instances(self, instance_ids, stopped_mode="StopCharging"):
        """Stop one or more ECS instances that are in the Running state.

        :param instance_ids: The IDs of instances.
        :param stopped_mode: Specifies whether billing for the instance
                             continues after the instance is stopped.
        """
        request = StopInstancesRequest()
        request.set_InstanceIds(instance_ids)
        request.set_StoppedMode(stopped_mode)
        response = self._send_request(request)
        if response is None:
            logging.error("stop_instances failed")

    def delete_instance(self, instance_id):
        """Release a pay-as-you-go instance or
        an expired subscription instance.

        :param instance_id: The ID of the instance that you want to release.
        """
        request = DeleteInstanceRequest()
        request.set_InstanceId(instance_id)
        # Force-release even if the instance is still running.
        request.set_Force(True)
        logging.info("Delete %s command submit successfully", instance_id)
        self._send_request(request)

    def delete_instances(self, instance_ids):
        """Release one or more pay-as-you-go instances or
        expired subscription instances.

        :param instance_ids: The IDs of instances that you want to release.
        """
        request = DeleteInstancesRequest()
        request.set_Force(True)
        request.set_InstanceIds(instance_ids)
        self._send_request(request)

    def allocate_public_address(self, instance_id):
        """Assign a public IP address to an ECS instance.

        :param instance_id: The ID of the instance to which you want to
                            assign a public IP address.
        :return: The assigned ip.
        """
        request = AllocatePublicIpAddressRequest()
        request.set_InstanceId(instance_id)
        response = self._send_request(request)
        # NOTE(review): implicitly returns None on failure (no else branch).
        if response is not None:
            return response.get("IpAddress")

    def create_key_pair(self, key_pair_name):
        """Create an SSH key pair.

        :param key_pair_name: The name of the key pair.
        :return: The created keypair data.
        """
        request = CreateKeyPairRequest()
        request.set_KeyPairName(key_pair_name)
        response = self._send_request(request)
        if response is not None:
            logging.info("Create Key Pair %s Successfully", response.get("KeyPairId"))
            return response
        else:
            logging.error("Create Key Pair Failed")
            return None

    def import_key_pair(self, key_pair_name, public_key_body):
        """Import the public key of an RSA-encrypted key pair
        that is generated by a third-party tool.

        :param key_pair_name: The name of the key pair.
        :param public_key_body: The public key of the key pair.
        """
        request = ImportKeyPairRequest()
        request.set_KeyPairName(key_pair_name)
        request.set_PublicKeyBody(public_key_body)
        self._send_request(request)

    def delete_key_pairs(self, key_pair_names):
        """Delete one or more SSH key pairs.

        :param key_pair_names: The name of the key pair.
        :return:
        """
        request = DeleteKeyPairsRequest()
        request.set_KeyPairNames(key_pair_names)
        self._send_request(request)

    def describe_key_pairs(self, key_pair_name=None):
        """Query one or more key pairs.

        :param key_pair_name: The name of the key pair.
        :return:
        """
        request = DescribeKeyPairsRequest()
        if key_pair_name is not None:
            request.set_KeyPairName(key_pair_name)
        response = self._send_request(request)
        if response is not None:
            return response.get("KeyPairs").get("KeyPair")
        else:
            return None

    def describe_v_switches(self, vpc_id=None):
        """Queries one or more VSwitches.

        :param vpc_id: The ID of the VPC to which the VSwitch belongs.
        :return: VSwitch list.
        """
        request = DescribeVSwitchesRequest()
        if vpc_id is not None:
            request.set_VpcId(vpc_id)
        response = self._send_request(request)
        if response is not None:
            return response.get("VSwitches").get("VSwitch")
        else:
            logging.error("Describe VSwitches Failed.")
            return None

    def _send_request(self, request):
        """send open api request"""
        # All responses are requested as JSON so they can be parsed uniformly.
        request.set_accept_format("json")
        try:
            response_str = self.cli.do_action_with_exception(request)
            response_detail = json.loads(response_str)
            return response_detail
        except (ClientException, ServerException) as e:
            # Swallow SDK errors deliberately: callers treat None as failure.
            logging.error(request.get_action_name())
            logging.error(e)
            return None
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (8.36 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (204 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc
ADDED
|
Binary file (16.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (24.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc
ADDED
|
Binary file (5.01 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (4.45 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py
ADDED
|
@@ -0,0 +1,536 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
from ray.autoscaler._private.constants import WORKER_LIVENESS_CHECK_KEY
|
| 12 |
+
from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType
|
| 13 |
+
from ray.autoscaler.batching_node_provider import (
|
| 14 |
+
BatchingNodeProvider,
|
| 15 |
+
NodeData,
|
| 16 |
+
ScaleRequest,
|
| 17 |
+
)
|
| 18 |
+
from ray.autoscaler.tags import (
|
| 19 |
+
NODE_KIND_HEAD,
|
| 20 |
+
NODE_KIND_WORKER,
|
| 21 |
+
STATUS_UP_TO_DATE,
|
| 22 |
+
STATUS_UPDATE_FAILED,
|
| 23 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Key for KubeRay label that identifies a Ray pod as head or worker.
|
| 27 |
+
KUBERAY_LABEL_KEY_KIND = "ray.io/node-type"
|
| 28 |
+
# Key for KubeRay label that identifies the worker group (autoscaler node type) of a
|
| 29 |
+
# Ray pod.
|
| 30 |
+
KUBERAY_LABEL_KEY_TYPE = "ray.io/group"
|
| 31 |
+
|
| 32 |
+
# These should be synced with:
|
| 33 |
+
# https://github.com/ray-project/kuberay/blob/f2d94ffe213dd8f69481b09c474047cb899fa73b/ray-operator/apis/ray/v1/raycluster_types.go#L165-L171 # noqa
|
| 34 |
+
# Kind label value indicating the pod is the head.
|
| 35 |
+
KUBERAY_KIND_HEAD = "head"
|
| 36 |
+
# Kind label value indicating the pod is the worker.
|
| 37 |
+
KUBERAY_KIND_WORKER = "worker"
|
| 38 |
+
|
| 39 |
+
# KubeRay CRD version
|
| 40 |
+
KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1")
|
| 41 |
+
|
| 42 |
+
KUBERAY_REQUEST_TIMEOUT_S = int(os.getenv("KUBERAY_REQUEST_TIMEOUT_S", 60))
|
| 43 |
+
|
| 44 |
+
RAY_HEAD_POD_NAME = os.getenv("RAY_HEAD_POD_NAME")
|
| 45 |
+
|
| 46 |
+
# https://kubernetes.io/docs/tasks/run-application/access-api-from-pod
|
| 47 |
+
# While running in a Pod, your container can create an HTTPS URL for the
|
| 48 |
+
# Kubernetes API server by fetching the KUBERNETES_SERVICE_HOST and
|
| 49 |
+
# KUBERNETES_SERVICE_PORT_HTTPS environment variables.
|
| 50 |
+
KUBERNETES_SERVICE_HOST = os.getenv(
|
| 51 |
+
"KUBERNETES_SERVICE_HOST", "https://kubernetes.default"
|
| 52 |
+
)
|
| 53 |
+
KUBERNETES_SERVICE_PORT = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443")
|
| 54 |
+
KUBERNETES_HOST = f"{KUBERNETES_SERVICE_HOST}:{KUBERNETES_SERVICE_PORT}"
|
| 55 |
+
# Key for GKE label that identifies which multi-host replica a pod belongs to
|
| 56 |
+
REPLICA_INDEX_KEY = "replicaIndex"
|
| 57 |
+
|
| 58 |
+
TOKEN_REFRESH_PERIOD = datetime.timedelta(minutes=1)
|
| 59 |
+
|
| 60 |
+
# Design:
|
| 61 |
+
|
| 62 |
+
# Each modification the autoscaler wants to make is posted to the API server goal state
|
| 63 |
+
# (e.g. if the autoscaler wants to scale up, it increases the number of
|
| 64 |
+
# replicas of the worker group it wants to scale, if it wants to scale down
|
| 65 |
+
# it decreases the number of replicas and adds the exact pods that should be
|
| 66 |
+
# terminated to the scaleStrategy).
|
| 67 |
+
|
| 68 |
+
# KubeRayNodeProvider inherits from BatchingNodeProvider.
|
| 69 |
+
# Thus, the autoscaler's create and terminate requests are batched into a single
|
| 70 |
+
# Scale Request object which is submitted at the end of autoscaler update.
|
| 71 |
+
# KubeRay node provider converts the ScaleRequest into a RayCluster CR patch
|
| 72 |
+
# and applies the patch in the submit_scale_request method.
|
| 73 |
+
|
| 74 |
+
# To reduce potential for race conditions, KubeRayNodeProvider
|
| 75 |
+
# aborts the autoscaler update if the operator has not yet processed workersToDelete -
|
| 76 |
+
# see KubeRayNodeProvider.safe_to_scale().
|
| 77 |
+
# Once it is confirmed that workersToDelete have been cleaned up, KubeRayNodeProvider
|
| 78 |
+
# clears the workersToDelete list.
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# Note: Log handlers set up in autoscaling monitor entrypoint.
|
| 82 |
+
logger = logging.getLogger(__name__)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def node_data_from_pod(pod: Dict[str, Any]) -> NodeData:
    """Convert a Ray pod fetched from the K8s API into autoscaler NodeData.

    NodeData is the record format consumed by BatchingNodeProvider.
    """
    node_kind, group_type = kind_and_type(pod)
    node_status = status_tag(pod)
    node_ip = pod_ip(pod)
    replica_idx = _replica_index_label(pod)
    return NodeData(
        kind=node_kind,
        type=group_type,
        replica_index=replica_idx,
        status=node_status,
        ip=node_ip,
    )
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]:
    """Determine Ray node kind (head or workers) and node type (worker group name)
    from a Ray pod's labels.
    """
    pod_labels = pod["metadata"]["labels"]
    if pod_labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD:
        node_kind = NODE_KIND_HEAD
    else:
        node_kind = NODE_KIND_WORKER
    # The group label value doubles as the autoscaler node type.
    return node_kind, pod_labels[KUBERAY_LABEL_KEY_TYPE]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _replica_index_label(pod: Dict[str, Any]) -> Optional[str]:
    """Return the replicaIndex label for a Pod in a multi-host TPU worker group.

    The replicaIndex label is set by the GKE TPU Ray webhook and is of
    the form {$WORKER_GROUP_NAME-$REPLICA_INDEX} where $REPLICA_INDEX
    is an integer from 0 to Replicas-1. Returns None when the label is absent.
    """
    return pod["metadata"]["labels"].get(REPLICA_INDEX_KEY)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def pod_ip(pod: Dict[str, Any]) -> NodeIP:
    """Return the pod's IP, or a placeholder string if K8s has not assigned one."""
    pod_status = pod["status"]
    return pod_status.get("podIP", "IP not yet assigned")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def status_tag(pod: Dict[str, Any]) -> NodeStatus:
    """Convert pod state to Ray autoscaler node status.

    See the doc string of the class
    batching_node_provider.NodeData for the semantics of node status.
    """
    container_statuses = pod["status"].get("containerStatuses")
    # No container status reported yet (empty or missing) -> still pending.
    if not container_statuses:
        return "pending"

    state = container_statuses[0]["state"]

    # Map the first container's state key to an autoscaler node status.
    for state_key, node_status in (
        ("pending", "pending"),
        ("running", STATUS_UP_TO_DATE),
        ("waiting", "waiting"),
        ("terminated", STATUS_UPDATE_FAILED),
    ):
        if state_key in state:
            return node_status
    raise ValueError("Unexpected container state.")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def worker_delete_patch(group_index: str, workers_to_delete: List[NodeID]):
    """Build a JSON patch that records the pods KubeRay should delete."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/scaleStrategy",
        {"workersToDelete": workers_to_delete},
    )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def worker_replica_patch(group_index: str, target_replicas: int):
    """Build a JSON patch that sets a worker group's replica count."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/replicas", target_replicas
    )
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def replace_patch(path: str, value: Any) -> Dict[str, Any]:
    """Return a JSON-Patch (RFC 6902) "replace" operation for *path*/*value*."""
    return dict(op="replace", path=path, value=value)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def load_k8s_secrets() -> Tuple[Dict[str, str], str]:
    """
    Loads secrets needed to access K8s resources.

    Returns:
        headers: Headers with K8s access token
        verify: Path to certificate
    """
    # Standard in-cluster service-account mount points.
    token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    with open(token_path) as token_file:
        bearer_token = token_file.read()

    auth_headers = {"Authorization": "Bearer " + bearer_token}
    ca_cert_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    return auth_headers, ca_cert_path
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def url_from_resource(
    namespace: str,
    path: str,
    kuberay_crd_version: str = KUBERAY_CRD_VER,
    kubernetes_host: str = KUBERNETES_HOST,
) -> str:
    """Convert resource path to REST URL for Kubernetes API server.

    Args:
        namespace: The K8s namespace of the resource
        path: The part of the resource path that starts with the resource type.
            Supported resource types are "pods" and "rayclusters".
        kuberay_crd_version: The API version of the KubeRay CRD.
            Looks like "v1alpha1", "v1".
        kubernetes_host: The host of the Kubernetes API server.
            Uses $KUBERNETES_SERVICE_HOST and
            $KUBERNETES_SERVICE_PORT to construct the kubernetes_host if not provided.

            When set by Kubernetes,
            $KUBERNETES_SERVICE_HOST could be an IP address. That's why the https
            scheme is added here.

            Defaults to "https://kubernetes.default:443".

    Raises:
        ValueError: if the host is explicitly plain HTTP.
        NotImplementedError: for unsupported resource types.
    """
    if kubernetes_host.startswith("http://"):
        raise ValueError("Kubernetes host must be accessed over HTTPS.")
    if not kubernetes_host.startswith("https://"):
        kubernetes_host = "https://" + kubernetes_host

    # Pods live in the core API group; RayClusters are a KubeRay CRD.
    if path.startswith("pods"):
        api_group = "/api/v1"
    elif path.startswith("rayclusters"):
        api_group = f"/apis/ray.io/{kuberay_crd_version}"
    else:
        raise NotImplementedError(f"Tried to access unknown entity at {path}")

    return f"{kubernetes_host}{api_group}/namespaces/{namespace}/{path}"
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def _worker_group_index(raycluster: Dict[str, Any], group_name: str) -> int:
|
| 224 |
+
"""Extract worker group index from RayCluster."""
|
| 225 |
+
group_names = [
|
| 226 |
+
spec["groupName"] for spec in raycluster["spec"].get("workerGroupSpecs", [])
|
| 227 |
+
]
|
| 228 |
+
return group_names.index(group_name)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _worker_group_max_replicas(
|
| 232 |
+
raycluster: Dict[str, Any], group_index: int
|
| 233 |
+
) -> Optional[int]:
|
| 234 |
+
"""Extract the maxReplicas of a worker group.
|
| 235 |
+
|
| 236 |
+
If maxReplicas is unset, return None, to be interpreted as "no constraint".
|
| 237 |
+
At time of writing, it should be impossible for maxReplicas to be unset, but it's
|
| 238 |
+
better to handle this anyway.
|
| 239 |
+
"""
|
| 240 |
+
return raycluster["spec"]["workerGroupSpecs"][group_index].get("maxReplicas")
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _worker_group_replicas(raycluster: Dict[str, Any], group_index: int):
|
| 244 |
+
# 1 is the default replicas value used by the KubeRay operator
|
| 245 |
+
return raycluster["spec"]["workerGroupSpecs"][group_index].get("replicas", 1)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class IKubernetesHttpApiClient(ABC):
    """Abstract interface for a Kubernetes HTTP API client.

    Tests can substitute a mock implementation of this interface instead
    of talking to a real Kubernetes API server.
    """

    @abstractmethod
    def get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers."""
        ...

    @abstractmethod
    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers."""
        ...
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
class KubernetesHttpApiClient(IKubernetesHttpApiClient):
    """Kubernetes API client that talks to the API server over plain REST.

    Authenticates with the in-cluster service-account token, refreshing the
    cached token/cert data once TOKEN_REFRESH_PERIOD has elapsed.
    """

    def __init__(self, namespace: str, kuberay_crd_version: str = KUBERAY_CRD_VER):
        """
        Args:
            namespace: The K8s namespace in which resources are accessed.
            kuberay_crd_version: The API version of the KubeRay CRD,
                e.g. "v1alpha1" or "v1".
        """
        self._kuberay_crd_version = kuberay_crd_version
        self._namespace = namespace
        self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        # Credentials start out unloaded; the first request triggers a refresh.
        self._headers, self._verify = None, None

    def _get_refreshed_headers_and_verify(self):
        """Return (headers, verify), refreshing the cached credentials if the
        token has expired or no credentials have been loaded yet."""
        if (datetime.datetime.now() >= self._token_expires_at) or (
            self._headers is None or self._verify is None
        ):
            logger.info("Refreshing K8s API client token and certs.")
            self._headers, self._verify = load_k8s_secrets()
            self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        # Single exit point: both branches previously returned the same pair.
        return self._headers, self._verify

    def get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers.

        Args:
            path: The part of the resource path that starts with the resource type.

        Returns:
            The JSON response of the GET request.

        Raises:
            HTTPError: If the GET request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )

        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.get(
            url,
            headers=headers,
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if result.status_code != 200:
            result.raise_for_status()
        return result.json()

    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers

        Args:
            path: The part of the resource path that starts with the resource type.
            payload: The JSON patch payload.

        Returns:
            The JSON response of the PATCH request.

        Raises:
            HTTPError: If the PATCH request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )
        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.patch(
            url,
            json.dumps(payload),
            headers={**headers, "Content-type": "application/json-patch+json"},
            # Match get(): bound the request so a hung API server cannot
            # stall the autoscaler indefinitely.
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if result.status_code != 200:
            result.raise_for_status()
        return result.json()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
class KubeRayNodeProvider(BatchingNodeProvider):  # type: ignore
    """NodeProvider for Ray clusters managed by the KubeRay operator.

    Built on BatchingNodeProvider: scaling decisions for one autoscaler
    update are batched into a single ScaleRequest, which this class applies
    by patching the RayCluster custom resource. The KubeRay operator then
    creates or deletes the corresponding pods.
    """

    def __init__(
        self,
        provider_config: Dict[str, Any],
        cluster_name: str,
    ):
        logger.info("Creating KubeRayNodeProvider.")
        self.namespace = provider_config["namespace"]
        self.cluster_name = cluster_name

        self.k8s_api_client = KubernetesHttpApiClient(self.namespace)

        # With KubeRay, pod liveness is the operator's responsibility, so the
        # autoscaler-side worker liveness check must be explicitly disabled.
        assert (
            provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
        ), f"To use KubeRayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
        BatchingNodeProvider.__init__(self, provider_config, cluster_name)

    def get_node_data(self) -> Dict[NodeID, NodeData]:
        """Queries K8s for pods in the RayCluster. Converts that pod data into a
        map of pod name to Ray NodeData, as required by BatchingNodeProvider.
        """
        # Store the raycluster CR. Later methods (submit_scale_request,
        # safe_to_scale) read self._raycluster, so this must run first.
        self._raycluster = self._get(f"rayclusters/{self.cluster_name}")

        # Get the pods resource version.
        # Specifying a resource version in list requests is important for scalability:
        # https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list
        resource_version = self._get_pods_resource_version()
        if resource_version:
            logger.info(
                f"Listing pods for RayCluster {self.cluster_name}"
                f" in namespace {self.namespace}"
                f" at pods resource version >= {resource_version}."
            )

        # Filter pods by cluster_name. The label value is URL-quoted since it
        # is embedded in the query string below.
        label_selector = requests.utils.quote(f"ray.io/cluster={self.cluster_name}")

        resource_path = f"pods?labelSelector={label_selector}"
        if resource_version:
            resource_path += (
                f"&resourceVersion={resource_version}"
                + "&resourceVersionMatch=NotOlderThan"
            )

        pod_list = self._get(resource_path)
        fetched_resource_version = pod_list["metadata"]["resourceVersion"]
        logger.info(
            f"Fetched pod data at resource version" f" {fetched_resource_version}."
        )

        # Extract node data from the pod list.
        node_data_dict = {}
        for pod in pod_list["items"]:
            # Kubernetes sets metadata.deletionTimestamp immediately after admitting a
            # request to delete an object. Full removal of the object may take some time
            # after the deletion timestamp is set. See link for details:
            # https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion
            if "deletionTimestamp" in pod["metadata"]:
                # Ignore pods marked for termination.
                continue
            pod_name = pod["metadata"]["name"]
            node_data_dict[pod_name] = node_data_from_pod(pod)
        return node_data_dict

    def submit_scale_request(self, scale_request: ScaleRequest):
        """Converts the scale request generated by BatchingNodeProvider into
        a patch that modifies the RayCluster CR's replicas and/or workersToDelete
        fields. Then submits the patch to the K8s API server.
        """
        # Transform the scale request into a patch payload.
        # Relies on self._raycluster cached by get_node_data earlier in the
        # same autoscaler update.
        patch_payload = self._scale_request_to_patch_payload(
            scale_request, self._raycluster
        )

        # Submit the patch to K8s.
        logger.info(
            "Autoscaler is submitting the following patch to RayCluster "
            f"{self.cluster_name} in namespace {self.namespace}."
        )
        logger.info(patch_payload)
        self._submit_raycluster_patch(patch_payload)

    def safe_to_scale(self) -> bool:
        """Returns False iff non_terminated_nodes contains any pods in the RayCluster's
        workersToDelete lists.

        Explanation:
        If there are any workersToDelete which are non-terminated,
        we should wait for the operator to do its job and delete those
        pods. Therefore, we back off the autoscaler update.

        If, on the other hand, all of the workersToDelete have already been cleaned up,
        then we patch away the workersToDelete lists and return True.
        In the future, we may consider having the operator clean up workersToDelete
        on it own:
        https://github.com/ray-project/kuberay/issues/733

        Note (Dmitri):
        It is stylistically bad that this function has a side effect.
        """
        # Get the list of nodes.
        node_set = set(self.node_data_dict.keys())
        worker_groups = self._raycluster["spec"].get("workerGroupSpecs", [])

        # Accumulates the indices of worker groups with non-empty workersToDelete
        non_empty_worker_group_indices = []

        for group_index, worker_group in enumerate(worker_groups):
            workersToDelete = worker_group.get("scaleStrategy", {}).get(
                "workersToDelete", []
            )
            if workersToDelete:
                non_empty_worker_group_indices.append(group_index)
            for worker in workersToDelete:
                if worker in node_set:
                    # The operator hasn't removed this worker yet. Abort
                    # the autoscaler update.
                    logger.warning(f"Waiting for operator to remove worker {worker}.")
                    return False

        # All required workersToDelete have been removed.
        # Clean up the workersToDelete field.
        patch_payload = []
        for group_index in non_empty_worker_group_indices:
            patch = worker_delete_patch(group_index, workers_to_delete=[])
            patch_payload.append(patch)
        if patch_payload:
            logger.info("Cleaning up workers to delete.")
            logger.info(f"Submitting patch {patch_payload}.")
            self._submit_raycluster_patch(patch_payload)

        # It's safe to proceed with the autoscaler update.
        return True

    def _get_pods_resource_version(self) -> Optional[str]:
        """
        Extract a recent pods resource version by reading the head pod's
        metadata.resourceVersion of the response.

        Returns None when RAY_HEAD_POD_NAME is unset, in which case callers
        list pods without a resourceVersion constraint.
        """
        if not RAY_HEAD_POD_NAME:
            return None
        pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}")
        return pod_resp["metadata"]["resourceVersion"]

    def _scale_request_to_patch_payload(
        self, scale_request: ScaleRequest, raycluster: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """Converts autoscaler scale request into a RayCluster CR patch payload."""
        patch_payload = []
        # Collect patches for replica counts.
        for node_type, target_replicas in scale_request.desired_num_workers.items():
            group_index = _worker_group_index(raycluster, node_type)
            group_max_replicas = _worker_group_max_replicas(raycluster, group_index)
            # Cap the replica count to maxReplicas.
            if group_max_replicas is not None and group_max_replicas < target_replicas:
                logger.warning(
                    "Autoscaler attempted to create "
                    + "more than maxReplicas pods of type {}.".format(node_type)
                )
                target_replicas = group_max_replicas
            # Check if we need to change the target count.
            if target_replicas == _worker_group_replicas(raycluster, group_index):
                # No patch required.
                continue
            # Need to patch replica count. Format the patch and add it to the payload.
            patch = worker_replica_patch(group_index, target_replicas)
            patch_payload.append(patch)

        # Maps node_type to nodes to delete for that group.
        deletion_groups = defaultdict(list)
        for worker in scale_request.workers_to_delete:
            node_type = self.node_tags(worker)[TAG_RAY_USER_NODE_TYPE]
            deletion_groups[node_type].append(worker)

        for node_type, workers_to_delete in deletion_groups.items():
            group_index = _worker_group_index(raycluster, node_type)
            patch = worker_delete_patch(group_index, workers_to_delete)
            patch_payload.append(patch)

        return patch_payload

    def _submit_raycluster_patch(self, patch_payload: List[Dict[str, Any]]):
        """Submits a patch to modify a RayCluster CR."""
        path = "rayclusters/{}".format(self.cluster_name)
        self._patch(path, patch_payload)

    def _get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers."""
        return self.k8s_api_client.get(path)

    def _patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers."""
        return self.k8s_api_client.patch(path, payload)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
import ray
|
| 7 |
+
from ray._private import ray_constants
|
| 8 |
+
from ray._private.ray_logging import setup_component_logger
|
| 9 |
+
from ray._private.services import get_node_ip_address
|
| 10 |
+
from ray._private.utils import try_to_create_directory
|
| 11 |
+
from ray._raylet import GcsClient
|
| 12 |
+
from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
|
| 13 |
+
from ray.autoscaler._private.monitor import Monitor
|
| 14 |
+
from ray.autoscaler.v2.instance_manager.config import KubeRayConfigReader
|
| 15 |
+
from ray.autoscaler.v2.utils import is_autoscaler_v2
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
# Seconds to wait between Ray head health-check attempts in
# run_kuberay_autoscaler's readiness loop.
BACKOFF_S = 5
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _get_log_dir() -> str:
    """Return the log directory under the latest Ray session,
    typically /tmp/ray/session_latest/logs."""
    temp_dir = ray._private.utils.get_ray_temp_dir()
    session = ray._private.ray_constants.SESSION_LATEST
    return os.path.join(temp_dir, session, "logs")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str) -> None:
    """Wait until the Ray head container is ready. Then start the autoscaler.

    Blocks forever: first polls `ray health-check` until the head's GCS
    responds, then runs the (v1 or v2) autoscaler monitor loop.

    Args:
        cluster_name: Name of the RayCluster custom resource.
        cluster_namespace: K8s namespace containing the RayCluster.
    """
    head_ip = get_node_ip_address()
    # NOTE(review): 6379 is hard-coded as the GCS port here — assumes the
    # default Ray GCS port; confirm against the head pod's configuration.
    ray_address = f"{head_ip}:6379"
    while True:
        try:
            # Autoscaler Ray version might not exactly match GCS version, so skip the
            # version check when checking GCS status.
            subprocess.check_call(
                [
                    "ray",
                    "health-check",
                    "--address",
                    ray_address,
                    "--skip-version-check",
                ]
            )
            logger.info("The Ray head is ready. Starting the autoscaler.")
            break
        except subprocess.CalledProcessError:
            logger.warning(
                f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds."
            )
            time.sleep(BACKOFF_S)

    # The Ray head container sets up the log directory. Thus, we set up logging
    # only after the Ray head is ready.
    _setup_logging()

    # autoscaling_config_producer reads the RayCluster CR from K8s and uses the CR
    # to output an autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    gcs_client = GcsClient(ray_address)
    if is_autoscaler_v2(fetch_from_server=True, gcs_client=gcs_client):
        # Deferred import: only pull in the v2 monitor when v2 is enabled.
        from ray.autoscaler.v2.monitor import AutoscalerMonitor as MonitorV2

        MonitorV2(
            address=gcs_client.address,
            config_reader=KubeRayConfigReader(autoscaling_config_producer),
            log_dir=_get_log_dir(),
            monitor_ip=head_ip,
        ).run()
    else:
        Monitor(
            address=gcs_client.address,
            # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
            # In this case, it's a callable.
            autoscaling_config=autoscaling_config_producer,
            monitor_ip=head_ip,
            # Let the autoscaler process exit after it hits 5 exceptions.
            # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
            # Kubernetes will then restart the autoscaler container.
            retry_on_failure=False,
        ).run()
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _setup_logging() -> None:
    """Log to autoscaler log file
    (typically, /tmp/ray/session_latest/logs/monitor.*)

    Also log to pod stdout (logs viewable with `kubectl logs <head-pod> -c autoscaler`).
    """
    log_dir = _get_log_dir()
    # The directory should already exist, but try (safely) to create it just in case.
    try_to_create_directory(log_dir)

    # Write logs at info level to monitor.log.
    setup_component_logger(
        logging_level=ray_constants.LOGGER_LEVEL,
        logging_format=ray_constants.LOGGER_FORMAT,
        log_dir=log_dir,
        filename=ray_constants.MONITOR_LOG_FILE_NAME,  # monitor.log
        max_bytes=ray_constants.LOGGING_ROTATE_BYTES,
        backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
    )

    # For the autoscaler, the root logger _also_ needs to write to stderr, not just
    # ray_constants.MONITOR_LOG_FILE_NAME.
    level = logging.getLevelName(ray_constants.LOGGER_LEVEL.upper())
    # NOTE(review): logging._StderrHandler is a private stdlib class; presumably
    # chosen so sys.stderr is resolved at emit time rather than at handler
    # creation — confirm before replacing with StreamHandler(sys.stderr).
    stderr_handler = logging._StderrHandler()
    stderr_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))
    stderr_handler.setLevel(level)
    logging.root.setLevel(level)
    logging.root.addHandler(stderr_handler)

    # The stdout handler was set up in the Ray CLI entry point.
    # See ray.scripts.scripts::cli().
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Source:
|
| 2 |
+
# https://github.com/kubernetes-client/python/blob/master/kubernetes/utils/quantity.py
|
| 3 |
+
from decimal import Decimal, InvalidOperation
|
| 4 |
+
from functools import reduce
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
# Mapping used to get generation for TPU-{accelerator}-head resource
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
# Keys are values of the cloud.google.com/gke-tpu-accelerator nodeSelector;
# values are the short TPU generation strings used to build accelerator_type
# (see tpu_node_selectors_to_type below).
gke_tpu_accelerator_to_generation = {
    "tpu-v4-podslice": "v4",
    "tpu-v5-lite-device": "v5e",
    "tpu-v5-lite-podslice": "v5e",
    "tpu-v5p-slice": "v5p",
    "tpu-v6e-slice": "v6e",
}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def parse_quantity(quantity):
    """Parse a Kubernetes canonical-form quantity (e.g. "200Mi") to a Decimal.

    Supported SI suffixes:
        base 1024: Ki | Mi | Gi | Ti | Pi | Ei
        base 1000: n | u | m | "" | k | M | G | T | P | E

    See
    https://github.com/kubernetes/apimachinery/blob/master/pkg/api/resource/quantity.go

    Args:
        quantity: A kubernetes canonical-form quantity string, or a plain
            int/float/Decimal (returned as a Decimal unchanged).

    Returns:
        Decimal

    Raises:
        ValueError: on an invalid number or an unknown suffix.
    """
    if isinstance(quantity, (int, float, Decimal)):
        return Decimal(quantity)

    exponents = {
        "n": -3,
        "u": -2,
        "m": -1,
        "K": 1,
        "k": 1,
        "M": 2,
        "G": 3,
        "T": 4,
        "P": 5,
        "E": 6,
    }

    text = str(quantity)
    number_part = text
    suffix = None
    # Split off a binary suffix ("Ki", "Mi", ...) or a decimal one ("k", "M", ...).
    if len(text) >= 2 and text.endswith("i") and text[-2] in exponents:
        number_part, suffix = text[:-2], text[-2:]
    elif text and text[-1] in exponents:
        number_part, suffix = text[:-1], text[-1:]

    try:
        value = Decimal(number_part)
    except InvalidOperation:
        raise ValueError("Invalid number format: {}".format(number_part))

    if suffix is None:
        return value

    # Suffixes ending in "i" are binary (base 1024); single letters are base 1000.
    if suffix.endswith("i"):
        base = 1024
    elif len(suffix) == 1:
        base = 1000
    else:
        raise ValueError("{} has unknown suffix".format(text))

    # handle SI inconsistency: "ki" is not a valid binary suffix.
    if suffix == "ki":
        raise ValueError("{} has unknown suffix".format(text))

    if suffix[0] not in exponents:
        raise ValueError("{} has unknown suffix".format(text))

    return value * (base ** Decimal(exponents[suffix[0]]))
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def tpu_node_selectors_to_type(topology: str, accelerator: str) -> Optional[str]:
    """Derive a TPU accelerator_type (e.g. "v4-8") from GKE TPU nodeSelectors.

    Args:
        topology: Value of the cloud.google.com/gke-tpu-topology nodeSelector,
            the physical topology of the TPU podslice, e.g. "2x2x2".
        accelerator: Value of the cloud.google.com/gke-tpu-accelerator
            nodeSelector, the TPU accelerator name, e.g. "tpu-v4-podslice".

    Returns:
        The accelerator_type string, or None if either input is empty.
    """
    if not (topology and accelerator):
        return None
    generation = gke_tpu_accelerator_to_generation[accelerator]
    # Multiply out the topology dimensions, e.g. "2x2x2" -> 8 chips.
    num_chips = 1
    for dim in topology.split("x"):
        num_chips *= int(dim)
    # v4 and v5p chips count as 2 cores each by default; other generations as 1.
    cores_per_chip = 2 if generation in ("v4", "v5p") else 1
    return f"{generation}-{num_chips * cores_per_chip}"
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (205 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (4.39 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
|
| 3 |
+
from ray.autoscaler._private.util import format_readonly_node_type
|
| 4 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 5 |
+
from ray.autoscaler.tags import (
|
| 6 |
+
NODE_KIND_HEAD,
|
| 7 |
+
STATUS_UP_TO_DATE,
|
| 8 |
+
TAG_RAY_NODE_KIND,
|
| 9 |
+
TAG_RAY_NODE_NAME,
|
| 10 |
+
TAG_RAY_NODE_STATUS,
|
| 11 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ReadOnlyNodeProvider(NodeProvider):
    """A node provider that merely reports the current cluster state.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users."""

    def __init__(self, provider_config, cluster_name):
        super().__init__(provider_config, cluster_name)
        # Maps node_id -> {"node_type": ..., "ip": ...}; populated by _set_nodes.
        self.nodes = {}

    def is_readonly(self):
        return True

    def _set_nodes(self, nodes: List[Tuple[str, str]]):
        """Update the set of nodes in the cluster.

        Args:
            nodes: List of (node_id, node_manager_address) tuples.
        """
        # Each node gets a made-up node type of its own, since every node
        # could have a unique configuration.
        self.nodes = {
            node_id: {
                # Keep prefix in sync with node config gen in monitor.py
                "node_type": format_readonly_node_type(node_id),
                "ip": address,
            }
            for node_id, address in nodes
        }

    def non_terminated_nodes(self, tag_filters):
        return list(self.nodes)

    def is_running(self, node_id):
        return node_id in self.nodes

    def is_terminated(self, node_id):
        return node_id not in self.nodes

    def node_tags(self, node_id):
        return {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.nodes[node_id]["node_type"],
            TAG_RAY_NODE_NAME: node_id,
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        }

    def external_ip(self, node_id):
        return node_id

    def internal_ip(self, node_id):
        return node_id

    def set_node_tags(self, node_id, tags):
        raise AssertionError("Readonly node provider cannot be updated")

    def create_node(self, node_config, tags, count):
        raise AssertionError("Readonly node provider cannot be updated")

    def terminate_node(self, node_id):
        raise AssertionError("Readonly node provider cannot be updated")

    @staticmethod
    def bootstrap_config(cluster_config):
        return cluster_config
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (191 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prometheus config file

# Global scrape defaults applied to every job below.
global:
  scrape_interval: 10s
  evaluation_interval: 10s
  scrape_timeout: 10s

# use ray file-based service discovery file as scrape target.
# Ray writes the discovery file listed below; Prometheus re-reads it every
# refresh_interval to pick up nodes joining or leaving the cluster.
scrape_configs:
- job_name: 'ray'
  file_sd_configs:
  - files:
    - '/tmp/ray/prom_metrics_service_discovery.json'
    refresh_interval: 1m
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# Wait for Ray to write its Prometheus service-discovery file, then restart
# the CloudWatch agent so it picks up the Ray scrape targets.
#
# Usage: ray_prometheus_waiter.sh <cluster_name>
#
# Polls up to MAX_ATTEMPTS times, DELAY_SECONDS apart. Exits 0 on success,
# 1 if the file never appears.

MAX_ATTEMPTS=120
DELAY_SECONDS=10
RAY_PROM_METRICS_FILE_PATH="/tmp/ray/prom_metrics_service_discovery.json"
CLUSTER_NAME=$1
# Quote all expansions: unquoted variables in [ ] are subject to word
# splitting and would break (or silently succeed) on unexpected values.
while [ "$MAX_ATTEMPTS" -gt 0 ]; do
  if [ -f "$RAY_PROM_METRICS_FILE_PATH" ]; then
    echo "Ray Prometheus metrics service discovery file found at: $RAY_PROM_METRICS_FILE_PATH."
    echo "Restarting cloudwatch agent.This may take a few minutes..."
    sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -m ec2 -a stop
    echo "Cloudwatch agent stopped, starting cloudwatch agent..."
    # The agent config is stored in SSM under a per-cluster parameter name.
    sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:AmazonCloudWatch-ray_agent_config_$CLUSTER_NAME"
    echo "Cloudwatch agent successfully restarted!"
    exit 0
  else
    echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Will check again in $DELAY_SECONDS seconds..."
    sleep "$DELAY_SECONDS"
    MAX_ATTEMPTS=$((MAX_ATTEMPTS-1))
  fi
done
echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Ray system metrics will not be available in CloudWatch."
exit 1
|
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# An unique identifier for the head node and workers of this cluster.
|
| 2 |
+
cluster_name: default
|
| 3 |
+
|
| 4 |
+
# The maximum number of workers nodes to launch in addition to the head
|
| 5 |
+
# node.
|
| 6 |
+
max_workers: 2
|
| 7 |
+
|
| 8 |
+
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
| 9 |
+
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
| 10 |
+
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
| 11 |
+
# This number should be > 0.
|
| 12 |
+
upscaling_speed: 1.0
|
| 13 |
+
|
| 14 |
+
# This executes all commands on all nodes in the docker container,
|
| 15 |
+
# and opens all the necessary ports to support the Ray cluster.
|
| 16 |
+
# Empty string means disabled.
|
| 17 |
+
docker: {}
|
| 18 |
+
|
| 19 |
+
# If a node is idle for this many minutes, it will be removed.
|
| 20 |
+
idle_timeout_minutes: 5
|
| 21 |
+
|
| 22 |
+
# Cloud-provider specific configuration.
|
| 23 |
+
provider:
|
| 24 |
+
type: aws
|
| 25 |
+
region: us-west-2
|
| 26 |
+
# Availability zone(s), comma-separated, that nodes may be launched in.
|
| 27 |
+
# Nodes will be launched in the first listed availability zone and will
|
| 28 |
+
# be tried in the subsequent availability zones if launching fails.
|
| 29 |
+
availability_zone: us-west-2a,us-west-2b
|
| 30 |
+
# Whether to allow node reuse. If set to False, nodes will be terminated
|
| 31 |
+
# instead of stopped.
|
| 32 |
+
cache_stopped_nodes: True # If not present, the default is True.
|
| 33 |
+
|
| 34 |
+
# How Ray will authenticate with newly launched nodes.
|
| 35 |
+
auth:
|
| 36 |
+
ssh_user: ubuntu
|
| 37 |
+
# By default Ray creates a new private keypair, but you can also use your own.
|
| 38 |
+
# If you do so, make sure to also set "KeyName" in the head and worker node
|
| 39 |
+
# configurations below.
|
| 40 |
+
# ssh_private_key: /path/to/your/key.pem
|
| 41 |
+
|
| 42 |
+
# Tell the autoscaler the allowed node types and the resources they provide.
|
| 43 |
+
# The key is the name of the node type, which is just for debugging purposes.
|
| 44 |
+
# The node config specifies the launch config and physical instance type.
|
| 45 |
+
available_node_types:
|
| 46 |
+
ray.head.default:
|
| 47 |
+
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
| 48 |
+
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
| 49 |
+
# You can also set custom resources.
|
| 50 |
+
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
| 51 |
+
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
| 52 |
+
resources: {}
|
| 53 |
+
# Provider-specific config for this node type, e.g. instance type. By default
|
| 54 |
+
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
| 55 |
+
# For more documentation on available fields, see:
|
| 56 |
+
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
| 57 |
+
node_config:
|
| 58 |
+
InstanceType: m5.large
|
| 59 |
+
# You can provision additional disk space with a conf as follows
|
| 60 |
+
BlockDeviceMappings:
|
| 61 |
+
- DeviceName: /dev/sda1
|
| 62 |
+
Ebs:
|
| 63 |
+
VolumeSize: 256
|
| 64 |
+
# Additional options in the boto docs.
|
| 65 |
+
ray.worker.default:
|
| 66 |
+
# The minimum number of nodes of this type to launch.
|
| 67 |
+
# This number should be >= 0.
|
| 68 |
+
min_workers: 0
|
| 69 |
+
# The node type's CPU and GPU resources are auto-detected based on AWS instance type.
|
| 70 |
+
# If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
|
| 71 |
+
# You can also set custom resources.
|
| 72 |
+
# For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
|
| 73 |
+
# resources: {"CPU": 1, "GPU": 1, "custom": 5}
|
| 74 |
+
resources: {}
|
| 75 |
+
# Provider-specific config for this node type, e.g. instance type. By default
|
| 76 |
+
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
| 77 |
+
# For more documentation on available fields, see:
|
| 78 |
+
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
| 79 |
+
node_config:
|
| 80 |
+
InstanceType: m5.large
|
| 81 |
+
# Run workers on spot by default. Comment this out to use on-demand.
|
| 82 |
+
InstanceMarketOptions:
|
| 83 |
+
MarketType: spot
|
| 84 |
+
# Additional options can be found in the boto docs, e.g.
|
| 85 |
+
# SpotOptions:
|
| 86 |
+
# MaxPrice: MAX_HOURLY_PRICE
|
| 87 |
+
# Additional options in the boto docs.
|
| 88 |
+
|
| 89 |
+
# Specify the node type of the head node (as configured above).
|
| 90 |
+
head_node_type: ray.head.default
|
| 91 |
+
|
| 92 |
+
# Files or directories to copy to the head and worker nodes. The format is a
|
| 93 |
+
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
| 94 |
+
file_mounts: {
|
| 95 |
+
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
| 96 |
+
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
# Files or directories to copy from the head node to the worker nodes. The format is a
|
| 100 |
+
# list of paths. The same path on the head node will be copied to the worker node.
|
| 101 |
+
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
| 102 |
+
# you should just use file_mounts. Only use this if you know what you're doing!
|
| 103 |
+
cluster_synced_files: []
|
| 104 |
+
|
| 105 |
+
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
| 106 |
+
# should sync to the worker node continuously
|
| 107 |
+
file_mounts_sync_continuously: False
|
| 108 |
+
|
| 109 |
+
# Patterns for files to exclude when running rsync up or rsync down
|
| 110 |
+
rsync_exclude: []
|
| 111 |
+
|
| 112 |
+
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
| 113 |
+
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
| 114 |
+
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
| 115 |
+
rsync_filter: []
|
| 116 |
+
|
| 117 |
+
# List of commands that will be run before `setup_commands`. If docker is
|
| 118 |
+
# enabled, these commands will run outside the container and before docker
|
| 119 |
+
# is setup.
|
| 120 |
+
initialization_commands: []
|
| 121 |
+
|
| 122 |
+
# List of shell commands to run to set up nodes.
|
| 123 |
+
setup_commands:
|
| 124 |
+
- >-
|
| 125 |
+
(stat $HOME/anaconda3/envs/tensorflow2_p38/ &> /dev/null &&
|
| 126 |
+
echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_p38/bin:$PATH"' >> ~/.bashrc) || true
|
| 127 |
+
- which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
|
| 128 |
+
|
| 129 |
+
# Custom commands that will be run on the head node after common setup.
|
| 130 |
+
head_setup_commands:
|
| 131 |
+
- pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions
|
| 132 |
+
|
| 133 |
+
# Custom commands that will be run on worker nodes after common setup.
|
| 134 |
+
worker_setup_commands: []
|
| 135 |
+
|
| 136 |
+
# Command to start ray on the head node. You don't need to change this.
|
| 137 |
+
head_start_ray_commands:
|
| 138 |
+
- ray stop
|
| 139 |
+
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
|
| 140 |
+
|
| 141 |
+
# Command to start ray on worker nodes. You don't need to change this.
|
| 142 |
+
worker_start_ray_commands:
|
| 143 |
+
- ray stop
|
| 144 |
+
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (193 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# An unique identifier for the head node and workers of this cluster.
|
| 2 |
+
cluster_name: default
|
| 3 |
+
|
| 4 |
+
# The maximum number of workers nodes to launch in addition to the head
|
| 5 |
+
# node.
|
| 6 |
+
max_workers: 2
|
| 7 |
+
|
| 8 |
+
# The autoscaler will scale up the cluster faster with higher upscaling speed.
|
| 9 |
+
# E.g., if the task requires adding more nodes then autoscaler will gradually
|
| 10 |
+
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
|
| 11 |
+
# This number should be > 0.
|
| 12 |
+
upscaling_speed: 1.0
|
| 13 |
+
|
| 14 |
+
# This executes all commands on all nodes in the docker container,
|
| 15 |
+
# and opens all the necessary ports to support the Ray cluster.
|
| 16 |
+
# Empty object means disabled.
|
| 17 |
+
docker: {}
|
| 18 |
+
|
| 19 |
+
# If a node is idle for this many minutes, it will be removed.
|
| 20 |
+
idle_timeout_minutes: 5
|
| 21 |
+
|
| 22 |
+
# Cloud-provider specific configuration.
|
| 23 |
+
provider:
|
| 24 |
+
type: azure
|
| 25 |
+
# https://azure.microsoft.com/en-us/global-infrastructure/locations
|
| 26 |
+
location: westus2
|
| 27 |
+
resource_group: ray-cluster
|
| 28 |
+
# set subscription id otherwise the default from az cli will be used
|
| 29 |
+
# subscription_id: 00000000-0000-0000-0000-000000000000
|
| 30 |
+
# set unique subnet mask or a random mask will be used
|
| 31 |
+
# subnet_mask: 10.0.0.0/16
|
| 32 |
+
# set unique id for resources in this cluster
|
| 33 |
+
# if not set a default id will be generated based on the resource group and cluster name
|
| 34 |
+
# unique_id: RAY1
|
| 35 |
+
|
| 36 |
+
# How Ray will authenticate with newly launched nodes.
|
| 37 |
+
auth:
|
| 38 |
+
ssh_user: ubuntu
|
| 39 |
+
# you must specify paths to matching private and public key pair files
|
| 40 |
+
# use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
|
| 41 |
+
ssh_private_key: ~/.ssh/id_rsa
|
| 42 |
+
# changes to this should match what is specified in file_mounts
|
| 43 |
+
ssh_public_key: ~/.ssh/id_rsa.pub
|
| 44 |
+
|
| 45 |
+
# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
|
| 46 |
+
# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
|
| 47 |
+
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
|
| 48 |
+
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
|
| 49 |
+
|
| 50 |
+
# Tell the autoscaler the allowed node types and the resources they provide.
|
| 51 |
+
# The key is the name of the node type, which is just for debugging purposes.
|
| 52 |
+
# The node config specifies the launch config and physical instance type.
|
| 53 |
+
available_node_types:
|
| 54 |
+
ray.head.default:
|
| 55 |
+
resources: {"CPU": 2}
|
| 56 |
+
# Provider-specific config, e.g. instance type.
|
| 57 |
+
node_config:
|
| 58 |
+
azure_arm_parameters:
|
| 59 |
+
vmSize: Standard_D2s_v3
|
| 60 |
+
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
| 61 |
+
imagePublisher: microsoft-dsvm
|
| 62 |
+
imageOffer: ubuntu-1804
|
| 63 |
+
imageSku: 1804-gen2
|
| 64 |
+
imageVersion: latest
|
| 65 |
+
|
| 66 |
+
ray.worker.default:
|
| 67 |
+
# The minimum number of nodes of this type to launch.
|
| 68 |
+
# This number should be >= 0.
|
| 69 |
+
min_workers: 0
|
| 70 |
+
# The resources provided by this node type.
|
| 71 |
+
resources: {"CPU": 2}
|
| 72 |
+
# Provider-specific config, e.g. instance type.
|
| 73 |
+
node_config:
|
| 74 |
+
azure_arm_parameters:
|
| 75 |
+
vmSize: Standard_D2s_v3
|
| 76 |
+
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
| 77 |
+
imagePublisher: microsoft-dsvm
|
| 78 |
+
imageOffer: ubuntu-1804
|
| 79 |
+
imageSku: 1804-gen2
|
| 80 |
+
imageVersion: latest
|
| 81 |
+
# comment lines below to not use Spot instances
|
| 82 |
+
priority: Spot
|
| 83 |
+
# set a maximum price for spot instances if desired
|
| 84 |
+
# billingProfile:
|
| 85 |
+
# maxPrice: -1
|
| 86 |
+
|
| 87 |
+
# Specify the node type of the head node (as configured above).
|
| 88 |
+
head_node_type: ray.head.default
|
| 89 |
+
|
| 90 |
+
# Files or directories to copy to the head and worker nodes. The format is a
|
| 91 |
+
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
| 92 |
+
file_mounts: {
|
| 93 |
+
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
| 94 |
+
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
| 95 |
+
"~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
# Files or directories to copy from the head node to the worker nodes. The format is a
|
| 99 |
+
# list of paths. The same path on the head node will be copied to the worker node.
|
| 100 |
+
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
|
| 101 |
+
# you should just use file_mounts. Only use this if you know what you're doing!
|
| 102 |
+
cluster_synced_files: []
|
| 103 |
+
|
| 104 |
+
# Whether changes to directories in file_mounts or cluster_synced_files in the head node
|
| 105 |
+
# should sync to the worker node continuously
|
| 106 |
+
file_mounts_sync_continuously: False
|
| 107 |
+
|
| 108 |
+
# Patterns for files to exclude when running rsync up or rsync down
|
| 109 |
+
rsync_exclude: []
|
| 110 |
+
|
| 111 |
+
# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
|
| 112 |
+
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
|
| 113 |
+
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
|
| 114 |
+
rsync_filter: []
|
| 115 |
+
|
| 116 |
+
# List of commands that will be run before `setup_commands`. If docker is
|
| 117 |
+
# enabled, these commands will run outside the container and before docker
|
| 118 |
+
# is setup.
|
| 119 |
+
initialization_commands:
|
| 120 |
+
# get rid of annoying Ubuntu message
|
| 121 |
+
- touch ~/.sudo_as_admin_successful
|
| 122 |
+
|
| 123 |
+
# List of shell commands to run to set up nodes.
|
| 124 |
+
setup_commands:
|
| 125 |
+
# Note: if you're developing Ray, you probably want to create an AMI that
|
| 126 |
+
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
| 127 |
+
# below with a git checkout <your_sha> (and possibly a recompile).
|
| 128 |
+
- (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
|
| 129 |
+
# - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true
|
| 130 |
+
- (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true
|
| 131 |
+
- which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
|
| 132 |
+
# Consider uncommenting these if you also want to run apt-get commands during setup
|
| 133 |
+
# - sudo pkill -9 apt-get || true
|
| 134 |
+
# - sudo pkill -9 dpkg || true
|
| 135 |
+
# - sudo dpkg --configure -a
|
| 136 |
+
|
| 137 |
+
# Custom commands that will be run on the head node after common setup.
|
| 138 |
+
head_setup_commands:
|
| 139 |
+
- pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4
|
| 140 |
+
|
| 141 |
+
# Custom commands that will be run on worker nodes after common setup.
|
| 142 |
+
worker_setup_commands: []
|
| 143 |
+
|
| 144 |
+
# Command to start ray on the head node. You don't need to change this.
|
| 145 |
+
head_start_ray_commands:
|
| 146 |
+
- ray stop
|
| 147 |
+
- ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
|
| 148 |
+
|
| 149 |
+
# Command to start ray on worker nodes. You don't need to change this.
|
| 150 |
+
worker_start_ray_commands:
|
| 151 |
+
- ray stop
|
| 152 |
+
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray.autoscaler.sdk.sdk import (
|
| 2 |
+
bootstrap_config,
|
| 3 |
+
configure_logging,
|
| 4 |
+
create_or_update_cluster,
|
| 5 |
+
fillout_defaults,
|
| 6 |
+
get_docker_host_mount_location,
|
| 7 |
+
get_head_node_ip,
|
| 8 |
+
get_worker_node_ips,
|
| 9 |
+
register_callback_handler,
|
| 10 |
+
request_resources,
|
| 11 |
+
rsync,
|
| 12 |
+
run_on_cluster,
|
| 13 |
+
teardown_cluster,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"create_or_update_cluster",
|
| 18 |
+
"teardown_cluster",
|
| 19 |
+
"run_on_cluster",
|
| 20 |
+
"rsync",
|
| 21 |
+
"get_head_node_ip",
|
| 22 |
+
"get_worker_node_ips",
|
| 23 |
+
"request_resources",
|
| 24 |
+
"configure_logging",
|
| 25 |
+
"bootstrap_config",
|
| 26 |
+
"fillout_defaults",
|
| 27 |
+
"register_callback_handler",
|
| 28 |
+
"get_docker_host_mount_location",
|
| 29 |
+
]
|
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (804 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""IMPORTANT: this is an experimental interface and not currently stable."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
from contextlib import contextmanager
|
| 7 |
+
from typing import Any, Callable, Dict, Iterator, List, Optional, Union
|
| 8 |
+
|
| 9 |
+
from ray.autoscaler._private import commands
|
| 10 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 11 |
+
from ray.autoscaler._private.event_system import CreateClusterEvent # noqa: F401
|
| 12 |
+
from ray.autoscaler._private.event_system import global_event_system # noqa: F401
|
| 13 |
+
from ray.util.annotations import DeveloperAPI
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@DeveloperAPI
|
| 17 |
+
def create_or_update_cluster(
|
| 18 |
+
cluster_config: Union[dict, str],
|
| 19 |
+
*,
|
| 20 |
+
no_restart: bool = False,
|
| 21 |
+
restart_only: bool = False,
|
| 22 |
+
no_config_cache: bool = False
|
| 23 |
+
) -> Dict[str, Any]:
|
| 24 |
+
"""Create or updates an autoscaling Ray cluster from a config json.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 28 |
+
cluster, or a path pointing to a file containing the config.
|
| 29 |
+
no_restart: Whether to skip restarting Ray services during the
|
| 30 |
+
update. This avoids interrupting running jobs and can be used to
|
| 31 |
+
dynamically adjust autoscaler configuration.
|
| 32 |
+
restart_only: Whether to skip running setup commands and only
|
| 33 |
+
restart Ray. This cannot be used with 'no-restart'.
|
| 34 |
+
no_config_cache: Whether to disable the config cache and fully
|
| 35 |
+
resolve all environment settings from the Cloud provider again.
|
| 36 |
+
"""
|
| 37 |
+
with _as_config_file(cluster_config) as config_file:
|
| 38 |
+
return commands.create_or_update_cluster(
|
| 39 |
+
config_file=config_file,
|
| 40 |
+
override_min_workers=None,
|
| 41 |
+
override_max_workers=None,
|
| 42 |
+
no_restart=no_restart,
|
| 43 |
+
restart_only=restart_only,
|
| 44 |
+
yes=True,
|
| 45 |
+
override_cluster_name=None,
|
| 46 |
+
no_config_cache=no_config_cache,
|
| 47 |
+
redirect_command_output=None,
|
| 48 |
+
use_login_shells=True,
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@DeveloperAPI
|
| 53 |
+
def teardown_cluster(
|
| 54 |
+
cluster_config: Union[dict, str],
|
| 55 |
+
workers_only: bool = False,
|
| 56 |
+
keep_min_workers: bool = False,
|
| 57 |
+
) -> None:
|
| 58 |
+
"""Destroys all nodes of a Ray cluster described by a config json.
|
| 59 |
+
|
| 60 |
+
Args:
|
| 61 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 62 |
+
cluster, or a path pointing to a file containing the config.
|
| 63 |
+
workers_only: Whether to keep the head node running and only
|
| 64 |
+
teardown worker nodes.
|
| 65 |
+
keep_min_workers: Whether to keep min_workers (as specified
|
| 66 |
+
in the YAML) still running.
|
| 67 |
+
"""
|
| 68 |
+
with _as_config_file(cluster_config) as config_file:
|
| 69 |
+
return commands.teardown_cluster(
|
| 70 |
+
config_file=config_file,
|
| 71 |
+
yes=True,
|
| 72 |
+
workers_only=workers_only,
|
| 73 |
+
override_cluster_name=None,
|
| 74 |
+
keep_min_workers=keep_min_workers,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@DeveloperAPI
|
| 79 |
+
def run_on_cluster(
|
| 80 |
+
cluster_config: Union[dict, str],
|
| 81 |
+
*,
|
| 82 |
+
cmd: Optional[str] = None,
|
| 83 |
+
run_env: str = "auto",
|
| 84 |
+
tmux: bool = False,
|
| 85 |
+
stop: bool = False,
|
| 86 |
+
no_config_cache: bool = False,
|
| 87 |
+
port_forward: Optional[commands.Port_forward] = None,
|
| 88 |
+
with_output: bool = False
|
| 89 |
+
) -> Optional[str]:
|
| 90 |
+
"""Runs a command on the specified cluster.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 94 |
+
cluster, or a path pointing to a file containing the config.
|
| 95 |
+
cmd: the command to run, or None for a no-op command.
|
| 96 |
+
run_env: whether to run the command on the host or in a
|
| 97 |
+
container. Select between "auto", "host" and "docker".
|
| 98 |
+
tmux: whether to run in a tmux session
|
| 99 |
+
stop: whether to stop the cluster after command run
|
| 100 |
+
no_config_cache: Whether to disable the config cache and fully
|
| 101 |
+
resolve all environment settings from the Cloud provider again.
|
| 102 |
+
port_forward ( (int,int) or list[(int,int)]): port(s) to forward.
|
| 103 |
+
with_output: Whether to capture command output.
|
| 104 |
+
|
| 105 |
+
Returns:
|
| 106 |
+
The output of the command as a string.
|
| 107 |
+
"""
|
| 108 |
+
with _as_config_file(cluster_config) as config_file:
|
| 109 |
+
return commands.exec_cluster(
|
| 110 |
+
config_file,
|
| 111 |
+
cmd=cmd,
|
| 112 |
+
run_env=run_env,
|
| 113 |
+
screen=False,
|
| 114 |
+
tmux=tmux,
|
| 115 |
+
stop=stop,
|
| 116 |
+
start=False,
|
| 117 |
+
override_cluster_name=None,
|
| 118 |
+
no_config_cache=no_config_cache,
|
| 119 |
+
port_forward=port_forward,
|
| 120 |
+
with_output=with_output,
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@DeveloperAPI
|
| 125 |
+
def rsync(
|
| 126 |
+
cluster_config: Union[dict, str],
|
| 127 |
+
*,
|
| 128 |
+
source: Optional[str],
|
| 129 |
+
target: Optional[str],
|
| 130 |
+
down: bool,
|
| 131 |
+
ip_address: Optional[str] = None,
|
| 132 |
+
use_internal_ip: bool = False,
|
| 133 |
+
no_config_cache: bool = False,
|
| 134 |
+
should_bootstrap: bool = True
|
| 135 |
+
):
|
| 136 |
+
"""Rsyncs files to or from the cluster.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 140 |
+
cluster, or a path pointing to a file containing the config.
|
| 141 |
+
source: rsync source argument.
|
| 142 |
+
target: rsync target argument.
|
| 143 |
+
down: whether we're syncing remote -> local.
|
| 144 |
+
ip_address: Address of node.
|
| 145 |
+
use_internal_ip: Whether the provided ip_address is
|
| 146 |
+
public or private.
|
| 147 |
+
no_config_cache: Whether to disable the config cache and fully
|
| 148 |
+
resolve all environment settings from the Cloud provider again.
|
| 149 |
+
should_bootstrap: whether to bootstrap cluster config before syncing
|
| 150 |
+
|
| 151 |
+
Raises:
|
| 152 |
+
RuntimeError if the cluster head node is not found.
|
| 153 |
+
"""
|
| 154 |
+
with _as_config_file(cluster_config) as config_file:
|
| 155 |
+
return commands.rsync(
|
| 156 |
+
config_file=config_file,
|
| 157 |
+
source=source,
|
| 158 |
+
target=target,
|
| 159 |
+
override_cluster_name=None,
|
| 160 |
+
down=down,
|
| 161 |
+
ip_address=ip_address,
|
| 162 |
+
use_internal_ip=use_internal_ip,
|
| 163 |
+
no_config_cache=no_config_cache,
|
| 164 |
+
all_nodes=False,
|
| 165 |
+
should_bootstrap=should_bootstrap,
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
@DeveloperAPI
|
| 170 |
+
def get_head_node_ip(cluster_config: Union[dict, str]) -> str:
|
| 171 |
+
"""Returns head node IP for given configuration file if exists.
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 175 |
+
cluster, or a path pointing to a file containing the config.
|
| 176 |
+
|
| 177 |
+
Returns:
|
| 178 |
+
The ip address of the cluster head node.
|
| 179 |
+
|
| 180 |
+
Raises:
|
| 181 |
+
RuntimeError if the cluster is not found.
|
| 182 |
+
"""
|
| 183 |
+
with _as_config_file(cluster_config) as config_file:
|
| 184 |
+
return commands.get_head_node_ip(config_file)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
@DeveloperAPI
|
| 188 |
+
def get_worker_node_ips(cluster_config: Union[dict, str]) -> List[str]:
|
| 189 |
+
"""Returns worker node IPs for given configuration file.
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
cluster_config (Union[str, dict]): Either the config dict of the
|
| 193 |
+
cluster, or a path pointing to a file containing the config.
|
| 194 |
+
|
| 195 |
+
Returns:
|
| 196 |
+
List of worker node ip addresses.
|
| 197 |
+
|
| 198 |
+
Raises:
|
| 199 |
+
RuntimeError if the cluster is not found.
|
| 200 |
+
"""
|
| 201 |
+
with _as_config_file(cluster_config) as config_file:
|
| 202 |
+
return commands.get_worker_node_ips(config_file)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
@DeveloperAPI
|
| 206 |
+
def request_resources(
|
| 207 |
+
num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
|
| 208 |
+
) -> None:
|
| 209 |
+
"""Command the autoscaler to scale to accommodate the specified requests.
|
| 210 |
+
|
| 211 |
+
The cluster will immediately attempt to scale to accommodate the requested
|
| 212 |
+
resources, bypassing normal upscaling speed constraints. This takes into
|
| 213 |
+
account existing resource usage.
|
| 214 |
+
|
| 215 |
+
For example, suppose you call ``request_resources(num_cpus=100)`` and
|
| 216 |
+
there are 45 currently running tasks, each requiring 1 CPU. Then, enough
|
| 217 |
+
nodes will be added so up to 100 tasks can run concurrently. It does
|
| 218 |
+
**not** add enough nodes so that 145 tasks can run.
|
| 219 |
+
|
| 220 |
+
This call is only a hint to the autoscaler. The actual resulting cluster
|
| 221 |
+
size may be slightly larger or smaller than expected depending on the
|
| 222 |
+
internal bin packing algorithm and max worker count restrictions.
|
| 223 |
+
|
| 224 |
+
Args:
|
| 225 |
+
num_cpus: Scale the cluster to ensure this number of CPUs are
|
| 226 |
+
available. This request is persistent until another call to
|
| 227 |
+
request_resources() is made to override.
|
| 228 |
+
bundles (List[ResourceDict]): Scale the cluster to ensure this set of
|
| 229 |
+
resource shapes can fit. This request is persistent until another
|
| 230 |
+
call to request_resources() is made to override.
|
| 231 |
+
|
| 232 |
+
Examples:
|
| 233 |
+
>>> from ray.autoscaler.sdk import request_resources
|
| 234 |
+
>>> # Request 1000 CPUs.
|
| 235 |
+
>>> request_resources(num_cpus=1000) # doctest: +SKIP
|
| 236 |
+
>>> # Request 64 CPUs and also fit a 1-GPU/4-CPU task.
|
| 237 |
+
>>> request_resources( # doctest: +SKIP
|
| 238 |
+
... num_cpus=64, bundles=[{"GPU": 1, "CPU": 4}])
|
| 239 |
+
>>> # Same as requesting num_cpus=3.
|
| 240 |
+
>>> request_resources( # doctest: +SKIP
|
| 241 |
+
... bundles=[{"CPU": 1}, {"CPU": 1}, {"CPU": 1}])
|
| 242 |
+
"""
|
| 243 |
+
if num_cpus is not None and not isinstance(num_cpus, int):
|
| 244 |
+
raise TypeError("num_cpus should be of type int.")
|
| 245 |
+
if bundles is not None:
|
| 246 |
+
if isinstance(bundles, List):
|
| 247 |
+
for bundle in bundles:
|
| 248 |
+
if isinstance(bundle, Dict):
|
| 249 |
+
for key in bundle.keys():
|
| 250 |
+
if not (isinstance(key, str) and isinstance(bundle[key], int)):
|
| 251 |
+
raise TypeError(
|
| 252 |
+
"each bundle key should be str and value as int."
|
| 253 |
+
)
|
| 254 |
+
else:
|
| 255 |
+
raise TypeError("each bundle should be a Dict.")
|
| 256 |
+
else:
|
| 257 |
+
raise TypeError("bundles should be of type List")
|
| 258 |
+
|
| 259 |
+
return commands.request_resources(num_cpus, bundles)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
@DeveloperAPI
|
| 263 |
+
def configure_logging(
|
| 264 |
+
log_style: Optional[str] = None,
|
| 265 |
+
color_mode: Optional[str] = None,
|
| 266 |
+
verbosity: Optional[int] = None,
|
| 267 |
+
):
|
| 268 |
+
"""Configures logging for cluster command calls.
|
| 269 |
+
|
| 270 |
+
Args:
|
| 271 |
+
log_style: If 'pretty', outputs with formatting and color.
|
| 272 |
+
If 'record', outputs record-style without formatting.
|
| 273 |
+
'auto' defaults to 'pretty', and disables pretty logging
|
| 274 |
+
if stdin is *not* a TTY. Defaults to "auto".
|
| 275 |
+
color_mode (str):
|
| 276 |
+
Can be "true", "false", or "auto".
|
| 277 |
+
|
| 278 |
+
Enables or disables `colorful`.
|
| 279 |
+
|
| 280 |
+
If `color_mode` is "auto", is set to `not stdout.isatty()`
|
| 281 |
+
vebosity (int):
|
| 282 |
+
Output verbosity (0, 1, 2, 3).
|
| 283 |
+
|
| 284 |
+
Low verbosity will disable `verbose` and `very_verbose` messages.
|
| 285 |
+
|
| 286 |
+
"""
|
| 287 |
+
cli_logger.configure(
|
| 288 |
+
log_style=log_style, color_mode=color_mode, verbosity=verbosity
|
| 289 |
+
)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
@contextmanager
|
| 293 |
+
@DeveloperAPI
|
| 294 |
+
def _as_config_file(cluster_config: Union[dict, str]) -> Iterator[str]:
|
| 295 |
+
if isinstance(cluster_config, dict):
|
| 296 |
+
tmp = tempfile.NamedTemporaryFile("w", prefix="autoscaler-sdk-tmp-")
|
| 297 |
+
tmp.write(json.dumps(cluster_config))
|
| 298 |
+
tmp.flush()
|
| 299 |
+
cluster_config = tmp.name
|
| 300 |
+
if not os.path.exists(cluster_config):
|
| 301 |
+
raise ValueError("Cluster config not found {}".format(cluster_config))
|
| 302 |
+
yield cluster_config
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
@DeveloperAPI
|
| 306 |
+
def bootstrap_config(
|
| 307 |
+
cluster_config: Dict[str, Any], no_config_cache: bool = False
|
| 308 |
+
) -> Dict[str, Any]:
|
| 309 |
+
"""Validate and add provider-specific fields to the config. For example,
|
| 310 |
+
IAM/authentication may be added here."""
|
| 311 |
+
return commands._bootstrap_config(cluster_config, no_config_cache)
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
@DeveloperAPI
|
| 315 |
+
def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
|
| 316 |
+
"""Fillout default values for a cluster_config based on the provider."""
|
| 317 |
+
from ray.autoscaler._private.util import fillout_defaults
|
| 318 |
+
|
| 319 |
+
return fillout_defaults(config)
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@DeveloperAPI
|
| 323 |
+
def register_callback_handler(
|
| 324 |
+
event_name: str,
|
| 325 |
+
callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
|
| 326 |
+
) -> None:
|
| 327 |
+
"""Registers a callback handler for autoscaler events.
|
| 328 |
+
|
| 329 |
+
Args:
|
| 330 |
+
event_name: Event that callback should be called on. See
|
| 331 |
+
CreateClusterEvent for details on the events available to be
|
| 332 |
+
registered against.
|
| 333 |
+
callback: Callable object that is invoked
|
| 334 |
+
when specified event occurs.
|
| 335 |
+
"""
|
| 336 |
+
global_event_system.add_callback_handler(event_name, callback)
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
@DeveloperAPI
|
| 340 |
+
def get_docker_host_mount_location(cluster_name: str) -> str:
|
| 341 |
+
"""Return host path that Docker mounts attach to."""
|
| 342 |
+
docker_mount_prefix = "/tmp/ray_tmp_mount/{cluster_name}"
|
| 343 |
+
return docker_mount_prefix.format(cluster_name=cluster_name)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from queue import Queue
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from ray._raylet import GcsClient
|
| 6 |
+
from ray.autoscaler._private.providers import _get_node_provider
|
| 7 |
+
from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
|
| 8 |
+
from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import (
|
| 9 |
+
KubeRayProvider,
|
| 10 |
+
)
|
| 11 |
+
from ray.autoscaler.v2.instance_manager.cloud_providers.read_only.cloud_provider import ( # noqa
|
| 12 |
+
ReadOnlyProvider,
|
| 13 |
+
)
|
| 14 |
+
from ray.autoscaler.v2.instance_manager.config import (
|
| 15 |
+
AutoscalingConfig,
|
| 16 |
+
IConfigReader,
|
| 17 |
+
Provider,
|
| 18 |
+
)
|
| 19 |
+
from ray.autoscaler.v2.instance_manager.instance_manager import (
|
| 20 |
+
InstanceManager,
|
| 21 |
+
InstanceUpdatedSubscriber,
|
| 22 |
+
)
|
| 23 |
+
from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
|
| 24 |
+
from ray.autoscaler.v2.instance_manager.node_provider import (
|
| 25 |
+
ICloudInstanceProvider,
|
| 26 |
+
NodeProviderAdapter,
|
| 27 |
+
)
|
| 28 |
+
from ray.autoscaler.v2.instance_manager.reconciler import Reconciler
|
| 29 |
+
from ray.autoscaler.v2.instance_manager.storage import InMemoryStorage
|
| 30 |
+
from ray.autoscaler.v2.instance_manager.subscribers.cloud_instance_updater import (
|
| 31 |
+
CloudInstanceUpdater,
|
| 32 |
+
)
|
| 33 |
+
from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopper
|
| 34 |
+
from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
|
| 35 |
+
from ray.autoscaler.v2.scheduler import ResourceDemandScheduler
|
| 36 |
+
from ray.autoscaler.v2.sdk import get_cluster_resource_state
|
| 37 |
+
from ray.core.generated.autoscaler_pb2 import AutoscalingState
|
| 38 |
+
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class Autoscaler:
|
| 43 |
+
def __init__(
|
| 44 |
+
self,
|
| 45 |
+
session_name: str,
|
| 46 |
+
config_reader: IConfigReader,
|
| 47 |
+
gcs_client: GcsClient,
|
| 48 |
+
event_logger: Optional[AutoscalerEventLogger] = None,
|
| 49 |
+
metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
|
| 50 |
+
) -> None:
|
| 51 |
+
"""
|
| 52 |
+
Args:
|
| 53 |
+
session_name: The name of the ray session.
|
| 54 |
+
config_reader: The config reader.
|
| 55 |
+
gcs_client: The GCS client.
|
| 56 |
+
event_logger: The event logger for emitting cluster events.
|
| 57 |
+
metrics_reporter: The metrics reporter for emitting cluster metrics.
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
self._config_reader = config_reader
|
| 61 |
+
|
| 62 |
+
config = config_reader.get_cached_autoscaling_config()
|
| 63 |
+
logger.info(f"Using Autoscaling Config: \n{config.dump()}")
|
| 64 |
+
|
| 65 |
+
self._gcs_client = gcs_client
|
| 66 |
+
self._cloud_instance_provider = None
|
| 67 |
+
self._instance_manager = None
|
| 68 |
+
self._ray_stop_errors_queue = Queue()
|
| 69 |
+
self._ray_install_errors_queue = Queue()
|
| 70 |
+
self._event_logger = event_logger
|
| 71 |
+
self._metrics_reporter = metrics_reporter
|
| 72 |
+
|
| 73 |
+
self._init_cloud_instance_provider(config, config_reader)
|
| 74 |
+
self._init_instance_manager(
|
| 75 |
+
session_name=session_name,
|
| 76 |
+
config=config,
|
| 77 |
+
cloud_provider=self._cloud_instance_provider,
|
| 78 |
+
gcs_client=self._gcs_client,
|
| 79 |
+
)
|
| 80 |
+
self._scheduler = ResourceDemandScheduler(self._event_logger)
|
| 81 |
+
|
| 82 |
+
def _init_cloud_instance_provider(
|
| 83 |
+
self, config: AutoscalingConfig, config_reader: IConfigReader
|
| 84 |
+
):
|
| 85 |
+
"""
|
| 86 |
+
Initialize the cloud provider, and its dependencies (the v1 node provider)
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
config: The autoscaling config.
|
| 90 |
+
config_reader: The config reader.
|
| 91 |
+
|
| 92 |
+
"""
|
| 93 |
+
provider_config = config.get_provider_config()
|
| 94 |
+
if provider_config["type"] == "kuberay":
|
| 95 |
+
provider_config["head_node_type"] = config.get_head_node_type()
|
| 96 |
+
self._cloud_instance_provider = KubeRayProvider(
|
| 97 |
+
config.get_config("cluster_name"),
|
| 98 |
+
provider_config,
|
| 99 |
+
)
|
| 100 |
+
elif config.provider == Provider.READ_ONLY:
|
| 101 |
+
provider_config["gcs_address"] = self._gcs_client.address
|
| 102 |
+
self._cloud_instance_provider = ReadOnlyProvider(
|
| 103 |
+
provider_config=provider_config,
|
| 104 |
+
)
|
| 105 |
+
else:
|
| 106 |
+
node_provider_v1 = _get_node_provider(
|
| 107 |
+
provider_config,
|
| 108 |
+
config.get_config("cluster_name"),
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
self._cloud_instance_provider = NodeProviderAdapter(
|
| 112 |
+
v1_provider=node_provider_v1,
|
| 113 |
+
config_reader=config_reader,
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
def _init_instance_manager(
|
| 117 |
+
self,
|
| 118 |
+
session_name: str,
|
| 119 |
+
cloud_provider: ICloudInstanceProvider,
|
| 120 |
+
gcs_client: GcsClient,
|
| 121 |
+
config: AutoscalingConfig,
|
| 122 |
+
):
|
| 123 |
+
"""
|
| 124 |
+
Initialize the instance manager, and its dependencies.
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
instance_storage = InstanceStorage(
|
| 128 |
+
cluster_id=session_name,
|
| 129 |
+
storage=InMemoryStorage(),
|
| 130 |
+
)
|
| 131 |
+
subscribers: List[InstanceUpdatedSubscriber] = []
|
| 132 |
+
subscribers.append(CloudInstanceUpdater(cloud_provider=cloud_provider))
|
| 133 |
+
subscribers.append(
|
| 134 |
+
RayStopper(gcs_client=gcs_client, error_queue=self._ray_stop_errors_queue)
|
| 135 |
+
)
|
| 136 |
+
if not config.disable_node_updaters():
|
| 137 |
+
# Supporting ray installer is only needed for providers that doesn't
|
| 138 |
+
# install or manage ray (e.g. AWS, GCP). These providers will be
|
| 139 |
+
# supported in the future.
|
| 140 |
+
raise NotImplementedError(
|
| 141 |
+
"RayInstaller is not supported yet in current "
|
| 142 |
+
"release of the Autoscaler V2. Therefore, providers "
|
| 143 |
+
"that update nodes (with `disable_node_updaters` set to True) "
|
| 144 |
+
"are not supported yet. Only KubeRay is supported for now which sets "
|
| 145 |
+
"disable_node_updaters to True in provider's config."
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
self._instance_manager = InstanceManager(
|
| 149 |
+
instance_storage=instance_storage,
|
| 150 |
+
instance_status_update_subscribers=subscribers,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
def update_autoscaling_state(
|
| 154 |
+
self,
|
| 155 |
+
) -> Optional[AutoscalingState]:
|
| 156 |
+
"""
|
| 157 |
+
Update the autoscaling state of the cluster by reconciling the current
|
| 158 |
+
state of the cluster resources, the cloud providers as well as instance
|
| 159 |
+
update subscribers with the desired state.
|
| 160 |
+
|
| 161 |
+
Returns:
|
| 162 |
+
AutoscalingState: The new autoscaling state of the cluster or None if
|
| 163 |
+
the state is not updated.
|
| 164 |
+
|
| 165 |
+
Raises:
|
| 166 |
+
No exception.
|
| 167 |
+
"""
|
| 168 |
+
|
| 169 |
+
try:
|
| 170 |
+
ray_stop_errors = []
|
| 171 |
+
while not self._ray_stop_errors_queue.empty():
|
| 172 |
+
ray_stop_errors.append(self._ray_stop_errors_queue.get())
|
| 173 |
+
|
| 174 |
+
ray_install_errors = []
|
| 175 |
+
while not self._ray_install_errors_queue.empty():
|
| 176 |
+
ray_install_errors.append(self._ray_install_errors_queue.get())
|
| 177 |
+
|
| 178 |
+
# Get the current state of the ray cluster resources.
|
| 179 |
+
ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)
|
| 180 |
+
|
| 181 |
+
# Refresh the config from the source
|
| 182 |
+
self._config_reader.refresh_cached_autoscaling_config()
|
| 183 |
+
autoscaling_config = self._config_reader.get_cached_autoscaling_config()
|
| 184 |
+
|
| 185 |
+
return Reconciler.reconcile(
|
| 186 |
+
instance_manager=self._instance_manager,
|
| 187 |
+
scheduler=self._scheduler,
|
| 188 |
+
cloud_provider=self._cloud_instance_provider,
|
| 189 |
+
ray_cluster_resource_state=ray_cluster_resource_state,
|
| 190 |
+
non_terminated_cloud_instances=(
|
| 191 |
+
self._cloud_instance_provider.get_non_terminated()
|
| 192 |
+
),
|
| 193 |
+
cloud_provider_errors=self._cloud_instance_provider.poll_errors(),
|
| 194 |
+
ray_install_errors=ray_install_errors,
|
| 195 |
+
ray_stop_errors=ray_stop_errors,
|
| 196 |
+
autoscaling_config=autoscaling_config,
|
| 197 |
+
metrics_reporter=self._metrics_reporter,
|
| 198 |
+
)
|
| 199 |
+
except Exception as e:
|
| 200 |
+
logger.exception(e)
|
| 201 |
+
return None
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from collections import defaultdict
|
| 3 |
+
from typing import Dict, List, Optional
|
| 4 |
+
|
| 5 |
+
from ray._private.event.event_logger import EventLoggerAdapter
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
|
| 7 |
+
from ray.autoscaler.v2.schema import NodeType
|
| 8 |
+
from ray.autoscaler.v2.utils import ResourceRequestUtil
|
| 9 |
+
from ray.core.generated.autoscaler_pb2 import (
|
| 10 |
+
ClusterResourceConstraint,
|
| 11 |
+
GangResourceRequest,
|
| 12 |
+
ResourceRequest,
|
| 13 |
+
)
|
| 14 |
+
from ray.core.generated.instance_manager_pb2 import LaunchRequest, TerminationRequest
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class AutoscalerEventLogger:
|
| 20 |
+
"""
|
| 21 |
+
Logs events related to the autoscaler.
|
| 22 |
+
|
| 23 |
+
# TODO:
|
| 24 |
+
- Add more logging for other events.
|
| 25 |
+
- Rate limit the events if too spammy.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(self, logger: EventLoggerAdapter):
|
| 29 |
+
self._logger = logger
|
| 30 |
+
|
| 31 |
+
def log_cluster_scheduling_update(
|
| 32 |
+
self,
|
| 33 |
+
node_type_configs: Dict[NodeType, NodeTypeConfig],
|
| 34 |
+
cluster_shape: Dict[NodeType, int],
|
| 35 |
+
launch_requests: Optional[List[LaunchRequest]] = None,
|
| 36 |
+
terminate_requests: Optional[List[TerminationRequest]] = None,
|
| 37 |
+
infeasible_requests: Optional[List[ResourceRequest]] = None,
|
| 38 |
+
infeasible_gang_requests: Optional[List[GangResourceRequest]] = None,
|
| 39 |
+
infeasible_cluster_resource_constraints: Optional[
|
| 40 |
+
List[ClusterResourceConstraint]
|
| 41 |
+
] = None,
|
| 42 |
+
) -> None:
|
| 43 |
+
"""
|
| 44 |
+
Log any update of the cluster scheduling state.
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
# Log any launch events.
|
| 48 |
+
if launch_requests:
|
| 49 |
+
launch_type_count = defaultdict(int)
|
| 50 |
+
for req in launch_requests:
|
| 51 |
+
launch_type_count[req.instance_type] += req.count
|
| 52 |
+
|
| 53 |
+
for idx, (instance_type, count) in enumerate(launch_type_count.items()):
|
| 54 |
+
log_str = f"Adding {count} node(s) of type {instance_type}."
|
| 55 |
+
self._logger.info(f"{log_str}")
|
| 56 |
+
logger.info(f"{log_str}")
|
| 57 |
+
|
| 58 |
+
# Log any terminate events.
|
| 59 |
+
if terminate_requests:
|
| 60 |
+
termination_by_causes_and_type = defaultdict(int)
|
| 61 |
+
for req in terminate_requests:
|
| 62 |
+
termination_by_causes_and_type[(req.cause, req.instance_type)] += 1
|
| 63 |
+
|
| 64 |
+
cause_reason_map = {
|
| 65 |
+
TerminationRequest.Cause.OUTDATED: "outdated",
|
| 66 |
+
TerminationRequest.Cause.MAX_NUM_NODES: "max number of worker nodes reached", # noqa
|
| 67 |
+
TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE: "max number of worker nodes per type reached", # noqa
|
| 68 |
+
TerminationRequest.Cause.IDLE: "idle",
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
for idx, ((cause, instance_type), count) in enumerate(
|
| 72 |
+
termination_by_causes_and_type.items()
|
| 73 |
+
):
|
| 74 |
+
log_str = f"Removing {count} nodes of type {instance_type} ({cause_reason_map[cause]})." # noqa
|
| 75 |
+
self._logger.info(f"{log_str}")
|
| 76 |
+
logger.info(f"{log_str}")
|
| 77 |
+
|
| 78 |
+
# Cluster shape changes.
|
| 79 |
+
if launch_requests or terminate_requests:
|
| 80 |
+
total_resources = defaultdict(float)
|
| 81 |
+
|
| 82 |
+
for node_type, count in cluster_shape.items():
|
| 83 |
+
node_config = node_type_configs[node_type]
|
| 84 |
+
for resource_name, resource_quantity in node_config.resources.items():
|
| 85 |
+
total_resources[resource_name] += resource_quantity * count
|
| 86 |
+
|
| 87 |
+
num_cpus = total_resources.get("CPU", 0)
|
| 88 |
+
log_str = f"Resized to {int(num_cpus)} CPUs"
|
| 89 |
+
|
| 90 |
+
if "GPU" in total_resources:
|
| 91 |
+
log_str += f", {int(total_resources['GPU'])} GPUs"
|
| 92 |
+
if "TPU" in total_resources:
|
| 93 |
+
log_str += f", {int(total_resources['TPU'])} TPUs"
|
| 94 |
+
|
| 95 |
+
self._logger.info(f"{log_str}.")
|
| 96 |
+
self._logger.debug(f"Current cluster shape: {dict(cluster_shape)}.")
|
| 97 |
+
|
| 98 |
+
# Log any infeasible requests.
|
| 99 |
+
if infeasible_requests:
|
| 100 |
+
requests_by_count = ResourceRequestUtil.group_by_count(infeasible_requests)
|
| 101 |
+
log_str = "No available node types can fulfill resource requests "
|
| 102 |
+
for idx, req_count in enumerate(requests_by_count):
|
| 103 |
+
resource_map = ResourceRequestUtil.to_resource_map(req_count.request)
|
| 104 |
+
log_str += f"{resource_map}*{req_count.count}"
|
| 105 |
+
if idx < len(requests_by_count) - 1:
|
| 106 |
+
log_str += ", "
|
| 107 |
+
|
| 108 |
+
log_str += (
|
| 109 |
+
". Add suitable node types to this cluster to resolve this issue."
|
| 110 |
+
)
|
| 111 |
+
self._logger.warning(log_str)
|
| 112 |
+
|
| 113 |
+
if infeasible_gang_requests:
|
| 114 |
+
# Log for each placement group requests.
|
| 115 |
+
for gang_request in infeasible_gang_requests:
|
| 116 |
+
log_str = (
|
| 117 |
+
"No available node types can fulfill "
|
| 118 |
+
"placement group requests (detail={details}): ".format(
|
| 119 |
+
details=gang_request.details
|
| 120 |
+
)
|
| 121 |
+
)
|
| 122 |
+
requests_by_count = ResourceRequestUtil.group_by_count(
|
| 123 |
+
gang_request.requests
|
| 124 |
+
)
|
| 125 |
+
for idx, req_count in enumerate(requests_by_count):
|
| 126 |
+
resource_map = ResourceRequestUtil.to_resource_map(
|
| 127 |
+
req_count.request
|
| 128 |
+
)
|
| 129 |
+
log_str += f"{resource_map}*{req_count.count}"
|
| 130 |
+
if idx < len(requests_by_count) - 1:
|
| 131 |
+
log_str += ", "
|
| 132 |
+
|
| 133 |
+
log_str += (
|
| 134 |
+
". Add suitable node types to this cluster to resolve this issue."
|
| 135 |
+
)
|
| 136 |
+
self._logger.warning(log_str)
|
| 137 |
+
|
| 138 |
+
if infeasible_cluster_resource_constraints:
|
| 139 |
+
# We will only have max 1 cluster resource constraint for now since it's
|
| 140 |
+
# from `request_resources()` sdk, where the most recent call would override
|
| 141 |
+
# the previous one.
|
| 142 |
+
for infeasible_constraint in infeasible_cluster_resource_constraints:
|
| 143 |
+
log_str = "No available node types can fulfill cluster constraint: "
|
| 144 |
+
for i, requests_by_count in enumerate(
|
| 145 |
+
infeasible_constraint.resource_requests
|
| 146 |
+
):
|
| 147 |
+
resource_map = ResourceRequestUtil.to_resource_map(
|
| 148 |
+
requests_by_count.request
|
| 149 |
+
)
|
| 150 |
+
log_str += f"{resource_map}*{requests_by_count.count}"
|
| 151 |
+
if i < len(infeasible_constraint.resource_requests) - 1:
|
| 152 |
+
log_str += ", "
|
| 153 |
+
|
| 154 |
+
log_str += (
|
| 155 |
+
". Add suitable node types to this cluster to resolve this issue."
|
| 156 |
+
)
|
| 157 |
+
self._logger.warning(log_str)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, List, Optional, Set
|
| 4 |
+
|
| 5 |
+
from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InstanceUtil:
|
| 9 |
+
"""
|
| 10 |
+
A helper class to group updates and operations on an Instance object defined
|
| 11 |
+
in instance_manager.proto
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
# Memoized reachable from sets, where the key is the instance status, and
|
| 15 |
+
# the value is the set of instance status that is reachable from the key
|
| 16 |
+
# instance status.
|
| 17 |
+
_reachable_from: Optional[
|
| 18 |
+
Dict["Instance.InstanceStatus", Set["Instance.InstanceStatus"]]
|
| 19 |
+
] = None
|
| 20 |
+
|
| 21 |
+
@staticmethod
|
| 22 |
+
def new_instance(
|
| 23 |
+
instance_id: str,
|
| 24 |
+
instance_type: str,
|
| 25 |
+
status: Instance.InstanceStatus,
|
| 26 |
+
details: str = "",
|
| 27 |
+
) -> Instance:
|
| 28 |
+
"""
|
| 29 |
+
Returns a new instance with the given status.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
instance_id: The instance id.
|
| 33 |
+
instance_type: The instance type.
|
| 34 |
+
status: The status of the new instance.
|
| 35 |
+
details: The details of the status transition.
|
| 36 |
+
"""
|
| 37 |
+
instance = Instance()
|
| 38 |
+
instance.version = 0 # it will be populated by the underlying storage.
|
| 39 |
+
instance.instance_id = instance_id
|
| 40 |
+
instance.instance_type = instance_type
|
| 41 |
+
instance.status = status
|
| 42 |
+
InstanceUtil._record_status_transition(instance, status, details)
|
| 43 |
+
return instance
|
| 44 |
+
|
| 45 |
+
@staticmethod
|
| 46 |
+
def random_instance_id() -> str:
|
| 47 |
+
"""
|
| 48 |
+
Returns a random instance id.
|
| 49 |
+
"""
|
| 50 |
+
return str(uuid.uuid4())
|
| 51 |
+
|
| 52 |
+
@staticmethod
|
| 53 |
+
def is_cloud_instance_allocated(instance_status: Instance.InstanceStatus) -> bool:
|
| 54 |
+
"""
|
| 55 |
+
Returns True if the instance is in a status where there could exist
|
| 56 |
+
a cloud instance allocated by the cloud provider.
|
| 57 |
+
"""
|
| 58 |
+
assert instance_status != Instance.UNKNOWN
|
| 59 |
+
return instance_status in {
|
| 60 |
+
Instance.ALLOCATED,
|
| 61 |
+
Instance.RAY_INSTALLING,
|
| 62 |
+
Instance.RAY_RUNNING,
|
| 63 |
+
Instance.RAY_STOPPING,
|
| 64 |
+
Instance.RAY_STOP_REQUESTED,
|
| 65 |
+
Instance.RAY_STOPPED,
|
| 66 |
+
Instance.TERMINATING,
|
| 67 |
+
Instance.RAY_INSTALL_FAILED,
|
| 68 |
+
Instance.TERMINATION_FAILED,
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
@staticmethod
|
| 72 |
+
def is_ray_running(instance_status: Instance.InstanceStatus) -> bool:
|
| 73 |
+
"""
|
| 74 |
+
Returns True if the instance is in a status where the ray process is
|
| 75 |
+
running on the cloud instance.
|
| 76 |
+
i.e. RAY_RUNNING, RAY_STOP_REQUESTED, RAY_STOPPING
|
| 77 |
+
"""
|
| 78 |
+
assert instance_status != Instance.UNKNOWN
|
| 79 |
+
|
| 80 |
+
if instance_status in InstanceUtil.get_reachable_statuses(
|
| 81 |
+
Instance.RAY_STOPPING
|
| 82 |
+
):
|
| 83 |
+
return False
|
| 84 |
+
|
| 85 |
+
if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
|
| 86 |
+
return True
|
| 87 |
+
|
| 88 |
+
return False
|
| 89 |
+
|
| 90 |
+
@staticmethod
|
| 91 |
+
def is_ray_pending(instance_status: Instance.InstanceStatus) -> bool:
|
| 92 |
+
"""
|
| 93 |
+
Returns True if the instance is in a status where the ray process is
|
| 94 |
+
pending to be started on the cloud instance.
|
| 95 |
+
|
| 96 |
+
"""
|
| 97 |
+
assert instance_status != Instance.UNKNOWN
|
| 98 |
+
# Not gonna be in a RAY_RUNNING status.
|
| 99 |
+
if Instance.RAY_RUNNING not in InstanceUtil.get_reachable_statuses(
|
| 100 |
+
instance_status
|
| 101 |
+
):
|
| 102 |
+
return False
|
| 103 |
+
|
| 104 |
+
# Already running ray.
|
| 105 |
+
if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
|
| 106 |
+
return False
|
| 107 |
+
|
| 108 |
+
return True
|
| 109 |
+
|
| 110 |
+
def is_ray_running_reachable(instance_status: Instance.InstanceStatus) -> bool:
|
| 111 |
+
"""
|
| 112 |
+
Returns True if the instance is in a status where it may transition
|
| 113 |
+
to RAY_RUNNING status.
|
| 114 |
+
"""
|
| 115 |
+
return Instance.RAY_RUNNING in InstanceUtil.get_reachable_statuses(
|
| 116 |
+
instance_status
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
@staticmethod
|
| 120 |
+
def set_status(
|
| 121 |
+
instance: Instance,
|
| 122 |
+
new_instance_status: Instance.InstanceStatus,
|
| 123 |
+
details: str = "",
|
| 124 |
+
) -> bool:
|
| 125 |
+
"""Transitions the instance to the new state.
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
instance: The instance to update.
|
| 129 |
+
new_instance_status: The new status to transition to.
|
| 130 |
+
details: The details of the transition.
|
| 131 |
+
|
| 132 |
+
Returns:
|
| 133 |
+
True if the status transition is successful, False otherwise.
|
| 134 |
+
"""
|
| 135 |
+
if (
|
| 136 |
+
new_instance_status
|
| 137 |
+
not in InstanceUtil.get_valid_transitions()[instance.status]
|
| 138 |
+
):
|
| 139 |
+
return False
|
| 140 |
+
instance.status = new_instance_status
|
| 141 |
+
InstanceUtil._record_status_transition(instance, new_instance_status, details)
|
| 142 |
+
return True
|
| 143 |
+
|
| 144 |
+
@staticmethod
|
| 145 |
+
def _record_status_transition(
|
| 146 |
+
instance: Instance, status: Instance.InstanceStatus, details: str
|
| 147 |
+
):
|
| 148 |
+
"""Records the status transition.
|
| 149 |
+
|
| 150 |
+
Args:
|
| 151 |
+
instance: The instance to update.
|
| 152 |
+
status: The new status to transition to.
|
| 153 |
+
"""
|
| 154 |
+
now_ns = time.time_ns()
|
| 155 |
+
instance.status_history.append(
|
| 156 |
+
Instance.StatusHistory(
|
| 157 |
+
instance_status=status,
|
| 158 |
+
timestamp_ns=now_ns,
|
| 159 |
+
details=details,
|
| 160 |
+
)
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
@staticmethod
|
| 164 |
+
def has_timeout(instance: Instance, timeout_s: int) -> bool:
|
| 165 |
+
"""
|
| 166 |
+
Returns True if the instance has been in the current status for more
|
| 167 |
+
than the timeout_seconds.
|
| 168 |
+
|
| 169 |
+
Args:
|
| 170 |
+
instance: The instance to check.
|
| 171 |
+
timeout_seconds: The timeout in seconds.
|
| 172 |
+
|
| 173 |
+
Returns:
|
| 174 |
+
True if the instance has been in the current status for more than
|
| 175 |
+
the timeout_s seconds.
|
| 176 |
+
"""
|
| 177 |
+
cur_status = instance.status
|
| 178 |
+
|
| 179 |
+
status_times_ns = InstanceUtil.get_status_transition_times_ns(
|
| 180 |
+
instance, select_instance_status=cur_status
|
| 181 |
+
)
|
| 182 |
+
assert len(status_times_ns) >= 1, (
|
| 183 |
+
f"instance {instance.instance_id} has {len(status_times_ns)} "
|
| 184 |
+
f"{Instance.InstanceStatus.Name(cur_status)} status"
|
| 185 |
+
)
|
| 186 |
+
status_time_ns = sorted(status_times_ns)[-1]
|
| 187 |
+
if time.time_ns() - status_time_ns <= (timeout_s * 1e9):
|
| 188 |
+
return False
|
| 189 |
+
|
| 190 |
+
return True
|
| 191 |
+
|
| 192 |
+
@staticmethod
|
| 193 |
+
def get_valid_transitions() -> Dict[
|
| 194 |
+
"Instance.InstanceStatus", Set["Instance.InstanceStatus"]
|
| 195 |
+
]:
|
| 196 |
+
return {
|
| 197 |
+
# This is the initial status of a new instance.
|
| 198 |
+
Instance.QUEUED: {
|
| 199 |
+
# Cloud provider requested to launch a node for the instance.
|
| 200 |
+
# This happens when the a launch request is made to the node provider.
|
| 201 |
+
Instance.REQUESTED,
|
| 202 |
+
},
|
| 203 |
+
# When in this status, a launch request to the node provider is made.
|
| 204 |
+
Instance.REQUESTED: {
|
| 205 |
+
# Cloud provider allocated a cloud instance for the instance.
|
| 206 |
+
# This happens when the cloud instance first appears in the list of
|
| 207 |
+
# running cloud instances from the cloud instance provider.
|
| 208 |
+
Instance.ALLOCATED,
|
| 209 |
+
# Retry the allocation, become queueing again.
|
| 210 |
+
Instance.QUEUED,
|
| 211 |
+
# Cloud provider fails to allocate one. Either as a timeout or
|
| 212 |
+
# the launch request fails immediately.
|
| 213 |
+
Instance.ALLOCATION_FAILED,
|
| 214 |
+
},
|
| 215 |
+
# When in this status, the cloud instance is allocated and running. This
|
| 216 |
+
# happens when the cloud instance is present in node provider's list of
|
| 217 |
+
# running cloud instances.
|
| 218 |
+
Instance.ALLOCATED: {
|
| 219 |
+
# Ray needs to be install and launch on the provisioned cloud instance.
|
| 220 |
+
# This happens when the cloud instance is allocated, and the autoscaler
|
| 221 |
+
# is responsible for installing and launching ray on the cloud instance.
|
| 222 |
+
# For node provider that manages the ray installation and launching,
|
| 223 |
+
# this state is skipped.
|
| 224 |
+
Instance.RAY_INSTALLING,
|
| 225 |
+
# Ray is already installed on the provisioned cloud
|
| 226 |
+
# instance. It could be any valid ray status.
|
| 227 |
+
Instance.RAY_RUNNING,
|
| 228 |
+
Instance.RAY_STOPPING,
|
| 229 |
+
Instance.RAY_STOPPED,
|
| 230 |
+
# Instance is requested to be stopped, e.g. instance leaked: no matching
|
| 231 |
+
# Instance with the same type is found in the autoscaler's state.
|
| 232 |
+
Instance.TERMINATING,
|
| 233 |
+
# cloud instance somehow failed.
|
| 234 |
+
Instance.TERMINATED,
|
| 235 |
+
},
|
| 236 |
+
# Ray process is being installed and started on the cloud instance.
|
| 237 |
+
# This status is skipped for node provider that manages the ray
|
| 238 |
+
# installation and launching. (e.g. Ray-on-Spark)
|
| 239 |
+
Instance.RAY_INSTALLING: {
|
| 240 |
+
# Ray installed and launched successfully, reported by the ray cluster.
|
| 241 |
+
# Similar to the Instance.ALLOCATED -> Instance.RAY_RUNNING transition,
|
| 242 |
+
# where the ray process is managed by the node provider.
|
| 243 |
+
Instance.RAY_RUNNING,
|
| 244 |
+
# Ray installation failed. This happens when the ray process failed to
|
| 245 |
+
# be installed and started on the cloud instance.
|
| 246 |
+
Instance.RAY_INSTALL_FAILED,
|
| 247 |
+
# Wen the ray node is reported as stopped by the ray cluster.
|
| 248 |
+
# This could happen that the ray process was stopped quickly after start
|
| 249 |
+
# such that a ray running node wasn't discovered and the RAY_RUNNING
|
| 250 |
+
# transition was skipped.
|
| 251 |
+
Instance.RAY_STOPPED,
|
| 252 |
+
# A cloud instance is being terminated (when the instance itself is no
|
| 253 |
+
# longer needed, e.g. instance is outdated, autoscaler is scaling down)
|
| 254 |
+
Instance.TERMINATING,
|
| 255 |
+
# cloud instance somehow failed during the installation process.
|
| 256 |
+
Instance.TERMINATED,
|
| 257 |
+
},
|
| 258 |
+
# Ray process is installed and running on the cloud instance. When in this
|
| 259 |
+
# status, a ray node must be present in the ray cluster.
|
| 260 |
+
Instance.RAY_RUNNING: {
|
| 261 |
+
# Ray is requested to be stopped.
|
| 262 |
+
Instance.RAY_STOP_REQUESTED,
|
| 263 |
+
# Ray is stopping (currently draining),
|
| 264 |
+
# e.g. idle termination.
|
| 265 |
+
Instance.RAY_STOPPING,
|
| 266 |
+
# Ray is already stopped, as reported by the ray cluster.
|
| 267 |
+
Instance.RAY_STOPPED,
|
| 268 |
+
# A cloud instance is being terminated (when the instance itself is no
|
| 269 |
+
# longer needed, e.g. instance is outdated, autoscaler is scaling down)
|
| 270 |
+
Instance.TERMINATING,
|
| 271 |
+
# cloud instance somehow failed.
|
| 272 |
+
Instance.TERMINATED,
|
| 273 |
+
},
|
| 274 |
+
# Ray process should be stopped on the cloud instance. The RayStopper
|
| 275 |
+
# subscriber will listen to this status and stop the ray process.
|
| 276 |
+
Instance.RAY_STOP_REQUESTED: {
|
| 277 |
+
# Ray is stopping on the cloud instance.
|
| 278 |
+
Instance.RAY_STOPPING,
|
| 279 |
+
# Ray stopped already.
|
| 280 |
+
Instance.RAY_STOPPED,
|
| 281 |
+
# Ray stop request failed (e.g. idle node no longer idle),
|
| 282 |
+
# ray is still running.
|
| 283 |
+
Instance.RAY_RUNNING,
|
| 284 |
+
# cloud instance somehow failed.
|
| 285 |
+
Instance.TERMINATED,
|
| 286 |
+
},
|
| 287 |
+
# When in this status, the ray process is requested to be stopped to the
|
| 288 |
+
# ray cluster, but not yet present in the dead ray node list reported by
|
| 289 |
+
# the ray cluster.
|
| 290 |
+
Instance.RAY_STOPPING: {
|
| 291 |
+
# Ray is stopped, and the ray node is present in the dead ray node list
|
| 292 |
+
# reported by the ray cluster.
|
| 293 |
+
Instance.RAY_STOPPED,
|
| 294 |
+
# A cloud instance is being terminated (when the instance itself is no
|
| 295 |
+
# longer needed, e.g. instance is outdated, autoscaler is scaling down)
|
| 296 |
+
Instance.TERMINATING,
|
| 297 |
+
# cloud instance somehow failed.
|
| 298 |
+
Instance.TERMINATED,
|
| 299 |
+
},
|
| 300 |
+
# When in this status, the ray process is stopped, and the ray node is
|
| 301 |
+
# present in the dead ray node list reported by the ray cluster.
|
| 302 |
+
Instance.RAY_STOPPED: {
|
| 303 |
+
# A cloud instance is being terminated (when the instance itself is no
|
| 304 |
+
# longer needed, e.g. instance is outdated, autoscaler is scaling down)
|
| 305 |
+
Instance.TERMINATING,
|
| 306 |
+
# cloud instance somehow failed.
|
| 307 |
+
Instance.TERMINATED,
|
| 308 |
+
},
|
| 309 |
+
# When in this status, the cloud instance is requested to be stopped to
|
| 310 |
+
# the node provider.
|
| 311 |
+
Instance.TERMINATING: {
|
| 312 |
+
# When a cloud instance no longer appears in the list of running cloud
|
| 313 |
+
# instances from the node provider.
|
| 314 |
+
Instance.TERMINATED,
|
| 315 |
+
# When the cloud instance failed to be terminated.
|
| 316 |
+
Instance.TERMINATION_FAILED,
|
| 317 |
+
},
|
| 318 |
+
# When in this status, the cloud instance failed to be terminated by the
|
| 319 |
+
# node provider. We will keep retrying.
|
| 320 |
+
Instance.TERMINATION_FAILED: {
|
| 321 |
+
# Retry the termination, become terminating again.
|
| 322 |
+
Instance.TERMINATING,
|
| 323 |
+
},
|
| 324 |
+
# Whenever a cloud instance disappears from the list of running cloud
|
| 325 |
+
# instances from the node provider, the instance is marked as stopped. Since
|
| 326 |
+
# we guarantee 1:1 mapping of a Instance to a cloud instance, this is a
|
| 327 |
+
# terminal state.
|
| 328 |
+
Instance.TERMINATED: set(), # Terminal state.
|
| 329 |
+
# When in this status, the cloud instance failed to be allocated by the
|
| 330 |
+
# node provider.
|
| 331 |
+
Instance.ALLOCATION_FAILED: set(), # Terminal state.
|
| 332 |
+
Instance.RAY_INSTALL_FAILED: {
|
| 333 |
+
# Autoscaler requests to shutdown the instance when ray install failed.
|
| 334 |
+
Instance.TERMINATING,
|
| 335 |
+
# cloud instance somehow failed.
|
| 336 |
+
Instance.TERMINATED,
|
| 337 |
+
},
|
| 338 |
+
# Initial state before the instance is created. Should never be used.
|
| 339 |
+
Instance.UNKNOWN: set(),
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
@staticmethod
|
| 343 |
+
def get_status_transitions(
|
| 344 |
+
instance: Instance,
|
| 345 |
+
select_instance_status: Optional["Instance.InstanceStatus"] = None,
|
| 346 |
+
) -> List["Instance.StatusHistory"]:
|
| 347 |
+
"""
|
| 348 |
+
Returns the status history of the instance.
|
| 349 |
+
|
| 350 |
+
Args:
|
| 351 |
+
instance: The instance.
|
| 352 |
+
select_instance_status: The go-to status to search for, i.e. select
|
| 353 |
+
only status history when the instance transitions into the status.
|
| 354 |
+
If None, returns all status updates.
|
| 355 |
+
"""
|
| 356 |
+
history = []
|
| 357 |
+
for status_update in instance.status_history:
|
| 358 |
+
if (
|
| 359 |
+
select_instance_status
|
| 360 |
+
and status_update.instance_status != select_instance_status
|
| 361 |
+
):
|
| 362 |
+
continue
|
| 363 |
+
history.append(status_update)
|
| 364 |
+
return history
|
| 365 |
+
|
| 366 |
+
@staticmethod
|
| 367 |
+
def get_last_status_transition(
|
| 368 |
+
instance: Instance,
|
| 369 |
+
select_instance_status: Optional["Instance.InstanceStatus"] = None,
|
| 370 |
+
) -> Optional["Instance.StatusHistory"]:
|
| 371 |
+
"""
|
| 372 |
+
Returns the last status transition of the instance.
|
| 373 |
+
|
| 374 |
+
Args:
|
| 375 |
+
instance: The instance.
|
| 376 |
+
instance_status: The status to search for. If None, returns the last
|
| 377 |
+
status update.
|
| 378 |
+
"""
|
| 379 |
+
history = InstanceUtil.get_status_transitions(instance, select_instance_status)
|
| 380 |
+
history.sort(key=lambda x: x.timestamp_ns)
|
| 381 |
+
if history:
|
| 382 |
+
return history[-1]
|
| 383 |
+
return None
|
| 384 |
+
|
| 385 |
+
@staticmethod
|
| 386 |
+
def get_status_transition_times_ns(
|
| 387 |
+
instance: Instance,
|
| 388 |
+
select_instance_status: Optional["Instance.InstanceStatus"] = None,
|
| 389 |
+
) -> List[int]:
|
| 390 |
+
"""
|
| 391 |
+
Returns a list of timestamps of the instance status update.
|
| 392 |
+
|
| 393 |
+
Args:
|
| 394 |
+
instance: The instance.
|
| 395 |
+
instance_status: The status to search for. If None, returns all
|
| 396 |
+
status updates timestamps.
|
| 397 |
+
|
| 398 |
+
Returns:
|
| 399 |
+
The list of timestamps of the instance status updates.
|
| 400 |
+
"""
|
| 401 |
+
return [
|
| 402 |
+
e.timestamp_ns
|
| 403 |
+
for e in InstanceUtil.get_status_transitions(
|
| 404 |
+
instance, select_instance_status
|
| 405 |
+
)
|
| 406 |
+
]
|
| 407 |
+
|
| 408 |
+
@classmethod
|
| 409 |
+
def get_reachable_statuses(
|
| 410 |
+
cls,
|
| 411 |
+
instance_status: Instance.InstanceStatus,
|
| 412 |
+
) -> Set["Instance.InstanceStatus"]:
|
| 413 |
+
"""
|
| 414 |
+
Returns the set of instance status that is reachable from the given
|
| 415 |
+
instance status following the status transitions.
|
| 416 |
+
This method is memoized.
|
| 417 |
+
Args:
|
| 418 |
+
instance_status: The instance status to start from.
|
| 419 |
+
Returns:
|
| 420 |
+
The set of instance status that is reachable from the given instance
|
| 421 |
+
status.
|
| 422 |
+
"""
|
| 423 |
+
if cls._reachable_from is None:
|
| 424 |
+
cls._compute_reachable()
|
| 425 |
+
return cls._reachable_from[instance_status]
|
| 426 |
+
|
| 427 |
+
@staticmethod
|
| 428 |
+
def get_log_str_for_update(instance: Instance, update: InstanceUpdateEvent) -> str:
|
| 429 |
+
"""Returns a log string for the given instance update."""
|
| 430 |
+
if update.upsert:
|
| 431 |
+
return (
|
| 432 |
+
f"New instance "
|
| 433 |
+
f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
|
| 434 |
+
f"{instance.instance_id}, type={instance.instance_type}, "
|
| 435 |
+
f"cloud_instance_id={instance.cloud_instance_id}, "
|
| 436 |
+
f"ray_id={instance.node_id}): {update.details}"
|
| 437 |
+
)
|
| 438 |
+
return (
|
| 439 |
+
f"Update instance "
|
| 440 |
+
f"{Instance.InstanceStatus.Name(instance.status)}->"
|
| 441 |
+
f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
|
| 442 |
+
f"{instance.instance_id}, type={instance.instance_type}, "
|
| 443 |
+
f"cloud_instance_id={instance.cloud_instance_id}, "
|
| 444 |
+
f"ray_id={instance.node_id}): {update.details}"
|
| 445 |
+
)
|
| 446 |
+
|
| 447 |
+
@classmethod
|
| 448 |
+
def _compute_reachable(cls):
|
| 449 |
+
"""
|
| 450 |
+
Computes and memorize the from status sets for each status machine with
|
| 451 |
+
a DFS search.
|
| 452 |
+
"""
|
| 453 |
+
valid_transitions = cls.get_valid_transitions()
|
| 454 |
+
|
| 455 |
+
def dfs(graph, start, visited):
|
| 456 |
+
"""
|
| 457 |
+
Regular DFS algorithm to find all reachable nodes from a given node.
|
| 458 |
+
"""
|
| 459 |
+
for next_node in graph[start]:
|
| 460 |
+
if next_node not in visited:
|
| 461 |
+
# We delay adding the visited set here so we could capture
|
| 462 |
+
# the self loop.
|
| 463 |
+
visited.add(next_node)
|
| 464 |
+
dfs(graph, next_node, visited)
|
| 465 |
+
return visited
|
| 466 |
+
|
| 467 |
+
# Initialize the graphs
|
| 468 |
+
cls._reachable_from = {}
|
| 469 |
+
for status in Instance.InstanceStatus.values():
|
| 470 |
+
# All nodes reachable from 'start'
|
| 471 |
+
visited = set()
|
| 472 |
+
cls._reachable_from[status] = dfs(valid_transitions, status, visited)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, List, Optional
|
| 8 |
+
|
| 9 |
+
import yaml
|
| 10 |
+
|
| 11 |
+
from ray._private.ray_constants import env_integer
|
| 12 |
+
from ray._private.utils import binary_to_hex
|
| 13 |
+
from ray._raylet import GcsClient
|
| 14 |
+
from ray.autoscaler._private.constants import (
|
| 15 |
+
AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
|
| 16 |
+
DEFAULT_UPSCALING_SPEED,
|
| 17 |
+
DISABLE_LAUNCH_CONFIG_CHECK_KEY,
|
| 18 |
+
DISABLE_NODE_UPDATERS_KEY,
|
| 19 |
+
)
|
| 20 |
+
from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
|
| 21 |
+
from ray.autoscaler._private.monitor import BASE_READONLY_CONFIG
|
| 22 |
+
from ray.autoscaler._private.util import (
|
| 23 |
+
format_readonly_node_type,
|
| 24 |
+
hash_launch_conf,
|
| 25 |
+
hash_runtime_conf,
|
| 26 |
+
prepare_config,
|
| 27 |
+
validate_config,
|
| 28 |
+
)
|
| 29 |
+
from ray.autoscaler.v2.schema import NodeType
|
| 30 |
+
from ray.autoscaler.v2.sdk import get_cluster_resource_state
|
| 31 |
+
from ray.autoscaler.v2.utils import is_head_node
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Provider(Enum):
|
| 37 |
+
UNKNOWN = 0
|
| 38 |
+
ALIYUN = 1
|
| 39 |
+
AWS = 2
|
| 40 |
+
AZURE = 3
|
| 41 |
+
GCP = 4
|
| 42 |
+
KUBERAY = 5
|
| 43 |
+
LOCAL = 6
|
| 44 |
+
READ_ONLY = 7
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class IConfigReader(ABC):
|
| 48 |
+
"""An interface for reading Autoscaling config.
|
| 49 |
+
|
| 50 |
+
A utility class that reads autoscaling configs from various sources:
|
| 51 |
+
- File
|
| 52 |
+
- In-memory dict
|
| 53 |
+
- Remote config service (e.g. KubeRay's config)
|
| 54 |
+
|
| 55 |
+
Example:
|
| 56 |
+
reader = FileConfigReader("path/to/config.yaml")
|
| 57 |
+
# Get the recently cached config.
|
| 58 |
+
config = reader.get_cached_autoscaling_config()
|
| 59 |
+
|
| 60 |
+
...
|
| 61 |
+
# Refresh the cached config.
|
| 62 |
+
reader.refresh_cached_autoscaling_config()
|
| 63 |
+
config = reader.get_cached_autoscaling_config()
|
| 64 |
+
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
@abstractmethod
|
| 68 |
+
def get_cached_autoscaling_config(self) -> "AutoscalingConfig":
|
| 69 |
+
"""Returns the recently read autoscaling config.
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
AutoscalingConfig: The recently read autoscaling config.
|
| 73 |
+
"""
|
| 74 |
+
pass
|
| 75 |
+
|
| 76 |
+
@abstractmethod
|
| 77 |
+
def refresh_cached_autoscaling_config(self):
|
| 78 |
+
"""Read the config from the source."""
|
| 79 |
+
pass
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@dataclass(frozen=True)
|
| 83 |
+
class InstanceReconcileConfig:
|
| 84 |
+
# The timeout for waiting for a REQUESTED instance to be ALLOCATED.
|
| 85 |
+
request_status_timeout_s: int = env_integer(
|
| 86 |
+
"RAY_AUTOSCALER_RECONCILE_REQUEST_STATUS_TIMEOUT_S", 10 * 60
|
| 87 |
+
)
|
| 88 |
+
# The timeout for waiting for a ALLOCATED instance to be RAY_RUNNING.
|
| 89 |
+
allocate_status_timeout_s: int = env_integer(
|
| 90 |
+
"RAY_AUTOSCALER_RECONCILE_ALLOCATE_STATUS_TIMEOUT_S", 300
|
| 91 |
+
)
|
| 92 |
+
# The timeout for waiting for a RAY_INSTALLING instance to be RAY_RUNNING.
|
| 93 |
+
ray_install_status_timeout_s: int = env_integer(
|
| 94 |
+
"RAY_AUTOSCALER_RECONCILE_RAY_INSTALL_STATUS_TIMEOUT_S", 30 * 60
|
| 95 |
+
)
|
| 96 |
+
# The timeout for waiting for a TERMINATING instance to be TERMINATED.
|
| 97 |
+
terminating_status_timeout_s: int = env_integer(
|
| 98 |
+
"RAY_AUTOSCALER_RECONCILE_TERMINATING_STATUS_TIMEOUT_S", 300
|
| 99 |
+
)
|
| 100 |
+
# The timeout for waiting for a RAY_STOP_REQUESTED instance
|
| 101 |
+
# to be RAY_STOPPING or RAY_STOPPED.
|
| 102 |
+
ray_stop_requested_status_timeout_s: int = env_integer(
|
| 103 |
+
"RAY_AUTOSCALER_RECONCILE_RAY_STOP_REQUESTED_STATUS_TIMEOUT_S", 300
|
| 104 |
+
)
|
| 105 |
+
# The interval for raise a warning when an instance in transient status
|
| 106 |
+
# is not updated for a long time.
|
| 107 |
+
transient_status_warn_interval_s: int = env_integer(
|
| 108 |
+
"RAY_AUTOSCALER_RECONCILE_TRANSIENT_STATUS_WARN_INTERVAL_S", 90
|
| 109 |
+
)
|
| 110 |
+
# The number of times to retry requesting to allocate an instance.
|
| 111 |
+
max_num_retry_request_to_allocate: int = env_integer(
|
| 112 |
+
"RAY_AUTOSCALER_RECONCILE_MAX_NUM_RETRY_REQUEST_TO_ALLOCATE", 3
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@dataclass
|
| 117 |
+
class NodeTypeConfig:
|
| 118 |
+
"""
|
| 119 |
+
NodeTypeConfig is the helper class to provide node type specific configs.
|
| 120 |
+
This maps to subset of the `available_node_types` field in the
|
| 121 |
+
autoscaling config.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
# Node type name
|
| 125 |
+
name: NodeType
|
| 126 |
+
# The minimal number of worker nodes to be launched for this node type.
|
| 127 |
+
min_worker_nodes: int
|
| 128 |
+
# The maximal number of worker nodes can be launched for this node type.
|
| 129 |
+
max_worker_nodes: int
|
| 130 |
+
# Idle timeout seconds for worker nodes of this node type.
|
| 131 |
+
idle_timeout_s: Optional[float] = None
|
| 132 |
+
# The total resources on the node.
|
| 133 |
+
resources: Dict[str, float] = field(default_factory=dict)
|
| 134 |
+
# The labels on the node.
|
| 135 |
+
labels: Dict[str, str] = field(default_factory=dict)
|
| 136 |
+
# The node config's launch config hash. It's calculated from the auth
|
| 137 |
+
# config, and the node's config in the `AutoscalingConfig` for the node
|
| 138 |
+
# type when launching the node. It's used to detect config changes.
|
| 139 |
+
launch_config_hash: str = ""
|
| 140 |
+
|
| 141 |
+
def __post_init__(self):
|
| 142 |
+
assert self.min_worker_nodes <= self.max_worker_nodes
|
| 143 |
+
assert self.min_worker_nodes >= 0
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class AutoscalingConfig:
|
| 147 |
+
"""
|
| 148 |
+
AutoscalingConfig is the helper class to provide autoscaling
|
| 149 |
+
related configs.
|
| 150 |
+
|
| 151 |
+
# TODO(rickyx):
|
| 152 |
+
1. Move the config validation logic here.
|
| 153 |
+
2. Deprecate the ray-schema.json for validation because it's
|
| 154 |
+
static thus not possible to validate the config with interdependency
|
| 155 |
+
of each other.
|
| 156 |
+
"""
|
| 157 |
+
|
| 158 |
+
def __init__(
|
| 159 |
+
self,
|
| 160 |
+
configs: Dict[str, Any],
|
| 161 |
+
skip_content_hash: bool = False,
|
| 162 |
+
) -> None:
|
| 163 |
+
"""
|
| 164 |
+
Args:
|
| 165 |
+
configs : The raw configs dict.
|
| 166 |
+
skip_content_hash :
|
| 167 |
+
Whether to skip file mounts/ray command hash calculation.
|
| 168 |
+
"""
|
| 169 |
+
self._sync_continuously = False
|
| 170 |
+
self.update_configs(configs, skip_content_hash)
|
| 171 |
+
|
| 172 |
+
def update_configs(self, configs: Dict[str, Any], skip_content_hash: bool) -> None:
|
| 173 |
+
self._configs = prepare_config(configs)
|
| 174 |
+
validate_config(self._configs)
|
| 175 |
+
if skip_content_hash:
|
| 176 |
+
return
|
| 177 |
+
self._calculate_hashes()
|
| 178 |
+
self._sync_continuously = self._configs.get(
|
| 179 |
+
"generate_file_mounts_contents_hash", True
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
def _calculate_hashes(self) -> None:
|
| 183 |
+
logger.info("Calculating hashes for file mounts and ray commands.")
|
| 184 |
+
self._runtime_hash, self._file_mounts_contents_hash = hash_runtime_conf(
|
| 185 |
+
self._configs.get("file_mounts", {}),
|
| 186 |
+
self._configs.get("cluster_synced_files", []),
|
| 187 |
+
[
|
| 188 |
+
self._configs.get("worker_setup_commands", []),
|
| 189 |
+
self._configs.get("worker_start_ray_commands", []),
|
| 190 |
+
],
|
| 191 |
+
generate_file_mounts_contents_hash=self._configs.get(
|
| 192 |
+
"generate_file_mounts_contents_hash", True
|
| 193 |
+
),
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
def get_cloud_node_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
|
| 197 |
+
return copy.deepcopy(
|
| 198 |
+
self.get_node_type_specific_config(ray_node_type, "node_config") or {}
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
def get_docker_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
|
| 202 |
+
"""
|
| 203 |
+
Return the docker config for the specified node type.
|
| 204 |
+
If it's a head node, the image will be chosen in the following order:
|
| 205 |
+
1. Node specific docker image.
|
| 206 |
+
2. The 'docker' config's 'head_image' field.
|
| 207 |
+
3. The 'docker' config's 'image' field.
|
| 208 |
+
If it's a worker node, the image will be chosen in the following order:
|
| 209 |
+
1. Node specific docker image.
|
| 210 |
+
2. The 'docker' config's 'worker_image' field.
|
| 211 |
+
3. The 'docker' config's 'image' field.
|
| 212 |
+
"""
|
| 213 |
+
# TODO(rickyx): It's unfortunate we have multiple fields in ray-schema.json
|
| 214 |
+
# that can specify docker images. We should consolidate them.
|
| 215 |
+
docker_config = copy.deepcopy(self._configs.get("docker", {}))
|
| 216 |
+
node_specific_docker_config = self._configs["available_node_types"][
|
| 217 |
+
ray_node_type
|
| 218 |
+
].get("docker", {})
|
| 219 |
+
# Override the global docker config with node specific docker config.
|
| 220 |
+
docker_config.update(node_specific_docker_config)
|
| 221 |
+
|
| 222 |
+
if self._configs.get("head_node_type") == ray_node_type:
|
| 223 |
+
if "head_image" in docker_config:
|
| 224 |
+
logger.info(
|
| 225 |
+
"Overwriting image={} by head_image({}) for head node docker.".format( # noqa: E501
|
| 226 |
+
docker_config["image"], docker_config["head_image"]
|
| 227 |
+
)
|
| 228 |
+
)
|
| 229 |
+
docker_config["image"] = docker_config["head_image"]
|
| 230 |
+
else:
|
| 231 |
+
if "worker_image" in docker_config:
|
| 232 |
+
logger.info(
|
| 233 |
+
"Overwriting image={} by worker_image({}) for worker node docker.".format( # noqa: E501
|
| 234 |
+
docker_config["image"], docker_config["worker_image"]
|
| 235 |
+
)
|
| 236 |
+
)
|
| 237 |
+
docker_config["image"] = docker_config["worker_image"]
|
| 238 |
+
|
| 239 |
+
# These fields should be merged.
|
| 240 |
+
docker_config.pop("head_image", None)
|
| 241 |
+
docker_config.pop("worker_image", None)
|
| 242 |
+
return docker_config
|
| 243 |
+
|
| 244 |
+
def get_worker_start_ray_commands(self) -> List[str]:
|
| 245 |
+
return self._configs.get("worker_start_ray_commands", [])
|
| 246 |
+
|
| 247 |
+
def get_head_setup_commands(self) -> List[str]:
|
| 248 |
+
return self._configs.get("head_setup_commands", [])
|
| 249 |
+
|
| 250 |
+
def get_head_start_ray_commands(self) -> List[str]:
|
| 251 |
+
return self._configs.get("head_start_ray_commands", [])
|
| 252 |
+
|
| 253 |
+
def get_worker_setup_commands(self, ray_node_type: NodeType) -> List[str]:
|
| 254 |
+
"""
|
| 255 |
+
Return the worker setup commands for the specified node type.
|
| 256 |
+
|
| 257 |
+
If the node type specific worker setup commands are not specified,
|
| 258 |
+
return the global worker setup commands.
|
| 259 |
+
"""
|
| 260 |
+
worker_setup_command = self.get_node_type_specific_config(
|
| 261 |
+
ray_node_type, "worker_setup_commands"
|
| 262 |
+
)
|
| 263 |
+
if worker_setup_command is None:
|
| 264 |
+
# Return global worker setup commands if node type specific
|
| 265 |
+
# worker setup commands are not specified.
|
| 266 |
+
logger.info(
|
| 267 |
+
"Using global worker setup commands for {}".format(ray_node_type)
|
| 268 |
+
)
|
| 269 |
+
return self._configs.get("worker_setup_commands", [])
|
| 270 |
+
return worker_setup_command
|
| 271 |
+
|
| 272 |
+
def get_initialization_commands(self, ray_node_type: NodeType) -> List[str]:
|
| 273 |
+
"""
|
| 274 |
+
Return the initialization commands for the specified node type.
|
| 275 |
+
|
| 276 |
+
If the node type specific initialization commands are not specified,
|
| 277 |
+
return the global initialization commands.
|
| 278 |
+
"""
|
| 279 |
+
initialization_command = self.get_node_type_specific_config(
|
| 280 |
+
ray_node_type, "initialization_commands"
|
| 281 |
+
)
|
| 282 |
+
if initialization_command is None:
|
| 283 |
+
logger.info(
|
| 284 |
+
"Using global initialization commands for {}".format(ray_node_type)
|
| 285 |
+
)
|
| 286 |
+
return self._configs.get("initialization_commands", [])
|
| 287 |
+
return initialization_command
|
| 288 |
+
|
| 289 |
+
def get_node_type_specific_config(
|
| 290 |
+
self, ray_node_type: NodeType, config_name: str
|
| 291 |
+
) -> Optional[Any]:
|
| 292 |
+
node_specific_config = self._configs["available_node_types"].get(
|
| 293 |
+
ray_node_type, {}
|
| 294 |
+
)
|
| 295 |
+
return node_specific_config.get(config_name, None)
|
| 296 |
+
|
| 297 |
+
def get_node_resources(self, ray_node_type: NodeType) -> Dict[str, float]:
|
| 298 |
+
return copy.deepcopy(
|
| 299 |
+
self.get_node_type_specific_config(ray_node_type, "resources") or {}
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
def get_node_labels(self, ray_node_type: NodeType) -> Dict[str, str]:
|
| 303 |
+
return copy.deepcopy(
|
| 304 |
+
self.get_node_type_specific_config(ray_node_type, "labels") or {}
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
def get_config(self, config_name, default=None) -> Any:
|
| 308 |
+
return self._configs.get(config_name, default)
|
| 309 |
+
|
| 310 |
+
def get_provider_instance_type(self, ray_node_type: NodeType) -> str:
|
| 311 |
+
provider = self.provider
|
| 312 |
+
node_config = self.get_node_type_specific_config(ray_node_type, "node_config")
|
| 313 |
+
if provider in [Provider.AWS, Provider.ALIYUN]:
|
| 314 |
+
return node_config.get("InstanceType", "")
|
| 315 |
+
elif provider == Provider.AZURE:
|
| 316 |
+
return node_config.get("azure_arm_parameters", {}).get("vmSize", "")
|
| 317 |
+
elif provider == Provider.GCP:
|
| 318 |
+
return node_config.get("machineType", "")
|
| 319 |
+
elif provider in [Provider.KUBERAY, Provider.LOCAL, Provider.UNKNOWN]:
|
| 320 |
+
return ""
|
| 321 |
+
else:
|
| 322 |
+
raise ValueError(f"Unknown provider {provider}")
|
| 323 |
+
|
| 324 |
+
def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
|
| 325 |
+
"""
|
| 326 |
+
Returns the node type configs from the `available_node_types` field.
|
| 327 |
+
|
| 328 |
+
Returns:
|
| 329 |
+
Dict[NodeType, NodeTypeConfig]: The node type configs.
|
| 330 |
+
"""
|
| 331 |
+
available_node_types = self._configs.get("available_node_types", {})
|
| 332 |
+
if not available_node_types:
|
| 333 |
+
return None
|
| 334 |
+
node_type_configs = {}
|
| 335 |
+
auth_config = self._configs.get("auth", {})
|
| 336 |
+
head_node_type = self.get_head_node_type()
|
| 337 |
+
assert head_node_type
|
| 338 |
+
for node_type, node_config in available_node_types.items():
|
| 339 |
+
launch_config_hash = hash_launch_conf(
|
| 340 |
+
node_config.get("node_config", {}), auth_config
|
| 341 |
+
)
|
| 342 |
+
max_workers_nodes = node_config.get("max_workers", 0)
|
| 343 |
+
if head_node_type == node_type:
|
| 344 |
+
max_workers_nodes += 1
|
| 345 |
+
|
| 346 |
+
node_type_configs[node_type] = NodeTypeConfig(
|
| 347 |
+
name=node_type,
|
| 348 |
+
min_worker_nodes=node_config.get("min_workers", 0),
|
| 349 |
+
max_worker_nodes=max_workers_nodes,
|
| 350 |
+
idle_timeout_s=node_config.get("idle_timeout_s", None),
|
| 351 |
+
resources=node_config.get("resources", {}),
|
| 352 |
+
labels=node_config.get("labels", {}),
|
| 353 |
+
launch_config_hash=launch_config_hash,
|
| 354 |
+
)
|
| 355 |
+
return node_type_configs
|
| 356 |
+
|
| 357 |
+
def get_head_node_type(self) -> NodeType:
    """
    Return the name of the head node type.

    When `available_node_types` has exactly one entry, that entry is taken
    to be the head node type; otherwise the explicit `head_node_type`
    field of the config is returned.
    """
    node_types = self._configs.get("available_node_types", {})
    if len(node_types) == 1:
        (only_type,) = node_types.keys()
        return only_type
    return self._configs.get("head_node_type")
|
| 370 |
+
|
| 371 |
+
def get_max_num_worker_nodes(self) -> Optional[int]:
    # Cluster-wide cap on worker nodes; None when `max_workers` is unset.
    return self.get_config("max_workers", None)
|
| 373 |
+
|
| 374 |
+
def get_max_num_nodes(self) -> Optional[int]:
    """Return the total node cap (workers + head), or None if unbounded."""
    worker_cap = self.get_max_num_worker_nodes()
    if worker_cap is None:
        return None
    # `max_workers` excludes the head node, so account for it here.
    return worker_cap + 1
|
| 379 |
+
|
| 380 |
+
def get_raw_config_mutable(self) -> Dict[str, Any]:
    # Returns the underlying config dict itself (not a copy); caller
    # mutations are visible to this object.
    return self._configs
|
| 382 |
+
|
| 383 |
+
def get_upscaling_speed(self) -> float:
    # Falls back to DEFAULT_UPSCALING_SPEED when `upscaling_speed` is unset.
    return self.get_config("upscaling_speed", DEFAULT_UPSCALING_SPEED)
|
| 385 |
+
|
| 386 |
+
def get_max_concurrent_launches(self) -> int:
    # Currently a global constant, not configurable per cluster.
    return AUTOSCALER_MAX_CONCURRENT_LAUNCHES
|
| 388 |
+
|
| 389 |
+
def disable_node_updaters(self) -> bool:
    """Whether node updaters are disabled (defaults to True if unspecified)."""
    return self._configs.get("provider", {}).get(DISABLE_NODE_UPDATERS_KEY, True)
|
| 392 |
+
|
| 393 |
+
def get_idle_timeout_s(self) -> Optional[float]:
    """
    Return the configured idle timeout in seconds, or None when the
    `idle_timeout_minutes` field is absent from the config.
    """
    idle_minutes = self.get_config("idle_timeout_minutes", None)
    if idle_minutes is None:
        return None
    # Config expresses the timeout in minutes; convert to seconds.
    return idle_minutes * 60
|
| 399 |
+
|
| 400 |
+
def disable_launch_config_check(self) -> bool:
    """Whether the launch-config hash check is disabled (default True)."""
    return self.get_provider_config().get(DISABLE_LAUNCH_CONFIG_CHECK_KEY, True)
|
| 403 |
+
|
| 404 |
+
def get_instance_reconcile_config(self) -> InstanceReconcileConfig:
    # Always returns the default reconcile settings; nothing is read from
    # the cluster config yet.
    # TODO(rickyx): we need a way to customize these configs,
    # either extending the current ray-schema.json, or just use another
    # schema validation paths.
    return InstanceReconcileConfig()
|
| 409 |
+
|
| 410 |
+
def get_provider_config(self) -> Dict[str, Any]:
    # The raw `provider` section of the cluster config ({} when absent).
    return self._configs.get("provider", {})
|
| 412 |
+
|
| 413 |
+
def dump(self) -> str:
    # Serialize the full config as YAML using the safe dumper (plain data
    # types only; raises on arbitrary Python objects).
    return yaml.safe_dump(self._configs)
|
| 415 |
+
|
| 416 |
+
@property
def provider(self) -> Provider:
    """Map the config's `provider.type` string onto the Provider enum."""
    known_providers = {
        "local": Provider.LOCAL,
        "aws": Provider.AWS,
        "azure": Provider.AZURE,
        "gcp": Provider.GCP,
        "aliyun": Provider.ALIYUN,
        "kuberay": Provider.KUBERAY,
        "readonly": Provider.READ_ONLY,
    }
    provider_str = self._configs.get("provider", {}).get("type", "")
    # Missing or unrecognized provider strings map to UNKNOWN.
    return known_providers.get(provider_str, Provider.UNKNOWN)
|
| 435 |
+
|
| 436 |
+
@property
def runtime_hash(self) -> str:
    # Precomputed hash of the runtime configuration; set elsewhere
    # (presumably in __init__ — not visible in this chunk).
    return self._runtime_hash
|
| 439 |
+
|
| 440 |
+
@property
def file_mounts_contents_hash(self) -> str:
    # Precomputed hash of the file mounts' contents; set elsewhere
    # (presumably in __init__ — not visible in this chunk).
    return self._file_mounts_contents_hash
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
class FileConfigReader(IConfigReader):
    """Config reader backed by a cluster config YAML file on disk."""

    def __init__(self, config_file: str, skip_content_hash: bool = True) -> None:
        """
        Args:
            config_file: Path to the cluster config YAML file.
            skip_content_hash: Whether to skip file mounts/ray command
                hash calculation. Defaults to True.
        """
        self._config_file_path = Path(config_file).resolve()
        self._skip_content_hash = skip_content_hash
        self._cached_config = self._read()

    def _read(self) -> AutoscalingConfig:
        # Re-parse the YAML file from disk on every call.
        with open(self._config_file_path) as f:
            raw = yaml.safe_load(f.read())
        return AutoscalingConfig(raw, skip_content_hash=self._skip_content_hash)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """
        Returns:
            AutoscalingConfig: The most recently loaded autoscaling config.
        """
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """Reload the config file from disk and replace the cached config."""
        self._cached_config = self._read()
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
class KubeRayConfigReader(IConfigReader):
    """Config reader backed by a K8s RayCluster custom resource."""

    def __init__(self, config_producer: AutoscalingConfigProducer):
        self._config_producer = config_producer
        self._cached_config = self._generate_configs_from_k8s()

    def _generate_configs_from_k8s(self) -> AutoscalingConfig:
        # The producer builds a raw config dict from the RayCluster CR.
        raw_config = self._config_producer()
        return AutoscalingConfig(raw_config)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """
        Returns:
            AutoscalingConfig: The most recently generated autoscaling config.
        """
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """
        Re-read the configs from the K8s RayCluster CR.

        Hits the K8s API server on every call so that CR changes are
        picked up.
        """
        self._cached_config = self._generate_configs_from_k8s()
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
class ReadOnlyProviderConfigReader(IConfigReader):
    """A class that reads cluster config for a read-only provider.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users."""

    def __init__(self, gcs_address: str):
        import copy

        # Deep-copy the template: refresh_cached_autoscaling_config()
        # mutates self._configs in place (update()/pop()/key assignment),
        # and aliasing the module-level BASE_READONLY_CONFIG would corrupt
        # the shared constant for every other user of it.
        self._configs = copy.deepcopy(BASE_READONLY_CONFIG)
        self._gcs_client = GcsClient(address=gcs_address)

    def refresh_cached_autoscaling_config(self) -> None:
        """Refresh the node types in self._configs from the live GCS state.

        Mutates self._configs in place and returns nothing (the previous
        `-> AutoscalingConfig` annotation was inaccurate).
        """
        # Update the config with node types from GCS.
        ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)

        # Format each node type's config from the running nodes.
        available_node_types = {}

        head_node_type = None
        for node_state in ray_cluster_resource_state.node_states:
            # One synthetic node type per running node, keyed by node id.
            node_type = format_readonly_node_type(binary_to_hex(node_state.node_id))
            if is_head_node(node_state):
                head_node_type = node_type

            available_node_types[node_type] = {
                "resources": dict(node_state.total_resources),
                "min_workers": 0,
                # The head node is not a launchable worker.
                "max_workers": 0 if is_head_node(node_state) else 1,
                "node_config": {},
            }
        if available_node_types:
            self._configs["available_node_types"].update(available_node_types)
            self._configs["max_workers"] = len(available_node_types)
            assert head_node_type, "Head node type should be found."
            self._configs["head_node_type"] = head_node_type

        # Don't idle terminated nodes in read-only mode.
        self._configs.pop("idle_timeout_minutes", None)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        # Build a fresh AutoscalingConfig from the current (possibly
        # refreshed) raw configs; content hashing is skipped in
        # read-only mode.
        return AutoscalingConfig(self._configs, skip_content_hash=True)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from abc import ABC, abstractmethod
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler.v2.instance_manager.common import InstanceUtil
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
|
| 7 |
+
from ray.core.generated.instance_manager_pb2 import (
|
| 8 |
+
GetInstanceManagerStateReply,
|
| 9 |
+
GetInstanceManagerStateRequest,
|
| 10 |
+
Instance,
|
| 11 |
+
InstanceUpdateEvent,
|
| 12 |
+
NodeKind,
|
| 13 |
+
StatusCode,
|
| 14 |
+
UpdateInstanceManagerStateReply,
|
| 15 |
+
UpdateInstanceManagerStateRequest,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class InstanceUpdatedSubscriber(ABC):
    """Subscribers to instance status changes."""

    @abstractmethod
    def notify(self, events: List[InstanceUpdateEvent]) -> None:
        # Called with the list of update events that were applied after a
        # successful instance manager state update.
        pass
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class InstanceManager:
    """
    See `InstanceManagerService` in instance_manager.proto

    This handles updates to an instance, or inserts a new instance if
    it's an insert update. We should only be inserting new instances
    of the below statuses:
    1. ALLOCATED: For unmanaged instance not initialized by InstanceManager,
       e.g. head node
    2. QUEUED: For new instance being queued to launch.
    3. TERMINATING: For leaked cloud instance that needs to be terminated.

    For full status transitions, see:
    https://docs.google.com/document/d/1NzQjA8Mh-oMc-QxXOa529oneWCoA8sDiVoNkBqqDb4U/edit#heading=h.k9a1sp4qpqj4

    Not thread safe, should be used as a singleton.
    """

    def __init__(
        self,
        instance_storage: InstanceStorage,
        instance_status_update_subscribers: Optional[List[InstanceUpdatedSubscriber]],
    ):
        self._instance_storage = instance_storage
        # Subscribers are notified in order after each successful update.
        self._status_update_subscribers = instance_status_update_subscribers or []

    def update_instance_manager_state(
        self, request: UpdateInstanceManagerStateRequest
    ) -> UpdateInstanceManagerStateReply:
        """
        Updates the instance manager state.

        If there's any failure, no updates would be made and the reply
        would contain the latest version of the instance manager state,
        and the error info.

        Args:
            request: The request to update the instance manager state.

        Returns:
            The reply to the request.
        """
        # Handle updates
        ids_to_updates = {update.instance_id: update for update in request.updates}
        to_update_instances, version = self._instance_storage.get_instances(
            instance_ids=ids_to_updates.keys()
        )

        # A negative expected_version means "no version check requested".
        if request.expected_version >= 0 and request.expected_version != version:
            err_str = (
                f"Version mismatch: expected: {request.expected_version}, "
                f"actual: {version}"
            )
            logger.warning(err_str)
            return self._get_update_im_state_reply(
                StatusCode.VERSION_MISMATCH,
                version,
                err_str,
            )

        # Handle instances states update.
        to_upsert_instances = []
        for instance_id, update in ids_to_updates.items():
            if instance_id in to_update_instances:
                instance = self._update_instance(
                    to_update_instances[instance_id], update
                )
            else:
                instance = self._create_instance(update)

            to_upsert_instances.append(instance)

        # Updates the instance storage.
        result = self._instance_storage.batch_upsert_instances(
            updates=to_upsert_instances,
            expected_storage_version=version,
        )

        if not result.success:
            if result.version != version:
                # Storage was mutated concurrently between our read above
                # and this write.
                err_str = (
                    f"Version mismatch: expected: {version}, actual: {result.version}"
                )
                logger.warning(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.VERSION_MISMATCH, result.version, err_str
                )
            else:
                err_str = "Failed to update instance storage."
                logger.error(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.UNKNOWN_ERRORS, result.version, err_str
                )

        # Successful updates.
        for subscriber in self._status_update_subscribers:
            subscriber.notify(request.updates)

        return self._get_update_im_state_reply(StatusCode.OK, result.version)

    def get_instance_manager_state(
        self, request: GetInstanceManagerStateRequest
    ) -> GetInstanceManagerStateReply:
        """
        Gets the instance manager state.

        Args:
            request: The request to get the instance manager state.

        Returns:
            The reply to the request.
        """
        reply = GetInstanceManagerStateReply()
        instances, version = self._instance_storage.get_instances()
        reply.state.instances.extend(instances.values())
        reply.state.version = version
        reply.status.code = StatusCode.OK

        return reply

    #########################################
    # Private methods
    #########################################

    @staticmethod
    def _get_update_im_state_reply(
        status_code: StatusCode, version: int, error_message: str = ""
    ) -> UpdateInstanceManagerStateReply:
        """
        Returns a UpdateInstanceManagerStateReply with the given status code and
        version.

        Args:
            status_code: The status code.
            version: The version.
            error_message: The error message if any.

        Returns:
            The reply.
        """
        reply = UpdateInstanceManagerStateReply()
        reply.status.code = status_code
        reply.version = version
        if error_message:
            reply.status.message = error_message
        return reply

    @staticmethod
    def _apply_update(instance: Instance, update: InstanceUpdateEvent):
        """
        Apply status specific update to the instance.

        Statuses not listed below carry no extra payload and need no
        field copies here.

        Args:
            instance: The instance to update.
            update: The update to apply.
        """
        if update.new_instance_status == Instance.ALLOCATED:
            # (The cloud_instance_id assertion was previously duplicated
            # in this branch; it only needs to hold once.)
            assert (
                update.cloud_instance_id
            ), "ALLOCATED update must have cloud_instance_id"
            assert update.node_kind in [
                NodeKind.WORKER,
                NodeKind.HEAD,
            ], "ALLOCATED update must have node_kind as WORKER or HEAD"
            assert update.instance_type, "ALLOCATED update must have instance_type"
            instance.cloud_instance_id = update.cloud_instance_id
            instance.node_kind = update.node_kind
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.RAY_RUNNING:
            assert update.ray_node_id, "RAY_RUNNING update must have ray_node_id"
            instance.node_id = update.ray_node_id
        elif update.new_instance_status == Instance.REQUESTED:
            assert (
                update.launch_request_id
            ), "REQUESTED update must have launch_request_id"
            assert update.instance_type, "REQUESTED update must have instance_type"
            instance.launch_request_id = update.launch_request_id
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.TERMINATING:
            # Only validated here; no fields are copied for TERMINATING.
            assert (
                update.cloud_instance_id
            ), "TERMINATING update must have cloud instance id"

    @staticmethod
    def _create_instance(update: InstanceUpdateEvent) -> Instance:
        """
        Create a new instance from the given update.
        """
        assert update.upsert, "upsert must be true for creating new instance."

        assert update.new_instance_status in [
            # For unmanaged instance not initialized by InstanceManager,
            # e.g. head node
            Instance.ALLOCATED,
            # For new instance being queued to launch.
            Instance.QUEUED,
            # For leaked cloud instance that needs to be terminated.
            Instance.TERMINATING,
        ], (
            "Invalid status for new instance, must be one of "
            "[ALLOCATED, QUEUED, TERMINATING]"
        )

        # Create a new instance first for common fields.
        instance = InstanceUtil.new_instance(
            instance_id=update.instance_id,
            instance_type=update.instance_type,
            status=update.new_instance_status,
            details=update.details,
        )

        # Apply the status specific updates.
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        InstanceManager._apply_update(instance, update)
        return instance

    @staticmethod
    def _update_instance(instance: Instance, update: InstanceUpdateEvent) -> Instance:
        """
        Update the instance with the given update.

        Args:
            instance: The instance to update.
            update: The update to apply.

        Returns:
            The updated instance.
        """
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        # set_status validates (and performs) the status transition.
        assert InstanceUtil.set_status(instance, update.new_instance_status), (
            "Invalid status transition from "
            f"{Instance.InstanceStatus.Name(instance.status)} to "
            f"{Instance.InstanceStatus.Name(update.new_instance_status)}"
        )
        InstanceManager._apply_update(instance, update)

        return instance
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Dict, List, Optional, Set, Tuple
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler.v2.instance_manager.storage import Storage, StoreStatus
|
| 6 |
+
from ray.core.generated.instance_manager_pb2 import Instance
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class InstanceStorage:
    """Instance storage stores the states of instances in the storage."""

    def __init__(
        self,
        cluster_id: str,
        storage: Storage,
    ) -> None:
        self._storage = storage
        self._cluster_id = cluster_id
        # Namespace the table per cluster so multiple clusters may share
        # one storage backend.
        self._table_name = f"instance_table@{cluster_id}"

    def batch_upsert_instances(
        self,
        updates: List[Instance],
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert instances into the storage. If the instance already exists,
        it will be updated. Otherwise, it will be inserted. If the
        expected_storage_version is specified, the update will fail if the
        current storage version does not match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            updates: A list of instances to be upserted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        mutations = {}
        version = self._storage.get_version()
        # Fail fast on version mismatch before serializing anything.
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        for instance in updates:
            # Deep-copy so the caller's Instance objects are not mutated.
            instance = copy.deepcopy(instance)
            # The instance version is set to 0; it will be
            # populated from the storage entry's version on read.
            instance.version = 0
            mutations[instance.instance_id] = instance.SerializeToString()

        result, version = self._storage.batch_update(
            self._table_name, mutations, {}, expected_storage_version
        )

        return StoreStatus(result, version)

    def upsert_instance(
        self,
        instance: Instance,
        expected_instance_version: Optional[int] = None,
        # NOTE: the "verison" typo in this parameter name is kept as-is for
        # backward compatibility with keyword-argument callers.
        expected_storage_verison: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert an instance in the storage.
        If the expected_instance_version is specified, the update will fail
        if the current instance version does not match the expected version.
        Similarly, if the expected_storage_version is
        specified, the update will fail if the current storage version does not
        match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            instance: The instance to be updated.
            expected_instance_version: The expected instance version.
            expected_storage_verison: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        # Deep-copy so the caller's Instance object is not mutated.
        instance = copy.deepcopy(instance)
        # The instance version is set to 0; it will be
        # populated from the storage entry's version on read.
        instance.version = 0
        result, version = self._storage.update(
            self._table_name,
            key=instance.instance_id,
            value=instance.SerializeToString(),
            expected_entry_version=expected_instance_version,
            expected_storage_version=expected_storage_verison,
            insert_only=False,
        )

        return StoreStatus(result, version)

    def get_instances(
        self,
        instance_ids: Optional[List[str]] = None,
        status_filter: Optional[Set[int]] = None,
    ) -> Tuple[Dict[str, Instance], int]:
        """Get instances from the storage.

        Args:
            instance_ids: A list of instance ids to be retrieved. If empty or
                None, all instances will be retrieved.
            status_filter: Only instances with the specified status will be
                returned.

        Returns:
            Tuple[Dict[str, Instance], int]: A tuple of (instances, version).
                The instances is a dictionary of (instance_id, instance) pairs.
        """
        instance_ids = instance_ids or []
        status_filter = status_filter or set()
        pairs, version = self._storage.get(self._table_name, instance_ids)
        instances = {}
        for instance_id, (instance_data, entry_version) in pairs.items():
            instance = Instance()
            instance.ParseFromString(instance_data)
            # Reflect the storage entry's version on the returned instance.
            instance.version = entry_version
            if status_filter and instance.status not in status_filter:
                continue
            instances[instance_id] = instance
        return instances, version

    def batch_delete_instances(
        self, instance_ids: List[str], expected_storage_version: Optional[int] = None
    ) -> StoreStatus:
        """Delete instances from the storage. If the expected_storage_version
        is specified, the update will fail if the current storage version does
        not match the expected version.

        Args:
            instance_ids: A list of instance ids to be deleted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        version = self._storage.get_version()
        # Fail fast on version mismatch before touching the storage.
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        result = self._storage.batch_update(
            self._table_name, {}, instance_ids, expected_storage_version
        )
        return result
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py
ADDED
|
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import math
|
| 3 |
+
import time
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from queue import Queue
|
| 8 |
+
from typing import Any, Dict, List, Optional
|
| 9 |
+
|
| 10 |
+
from ray.autoscaler._private.constants import (
|
| 11 |
+
AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
|
| 12 |
+
AUTOSCALER_MAX_LAUNCH_BATCH,
|
| 13 |
+
)
|
| 14 |
+
from ray.autoscaler._private.util import hash_launch_conf
|
| 15 |
+
from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
|
| 16 |
+
from ray.autoscaler.tags import (
|
| 17 |
+
NODE_KIND_HEAD,
|
| 18 |
+
NODE_KIND_UNMANAGED,
|
| 19 |
+
NODE_KIND_WORKER,
|
| 20 |
+
STATUS_UNINITIALIZED,
|
| 21 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 22 |
+
TAG_RAY_LAUNCH_REQUEST,
|
| 23 |
+
TAG_RAY_NODE_KIND,
|
| 24 |
+
TAG_RAY_NODE_NAME,
|
| 25 |
+
TAG_RAY_NODE_STATUS,
|
| 26 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 27 |
+
)
|
| 28 |
+
from ray.autoscaler.v2.instance_manager.config import IConfigReader
|
| 29 |
+
from ray.autoscaler.v2.schema import NodeType
|
| 30 |
+
from ray.core.generated.instance_manager_pb2 import NodeKind
|
| 31 |
+
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
# Type Alias. This is a **unique identifier** for a cloud instance in the cluster.
|
| 35 |
+
# The provider should guarantee that this id is unique across the cluster,
|
| 36 |
+
# such that:
|
| 37 |
+
# - When a cloud instance is created and running, no other cloud instance in the
|
| 38 |
+
# cluster has the same id.
|
| 39 |
+
# - When a cloud instance is terminated, no other cloud instance in the cluster will
|
| 40 |
+
# be assigned the same id later.
|
| 41 |
+
CloudInstanceId = str
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class CloudInstance:
    """Metadata describing a single cloud instance in the cluster."""

    # Cluster-wide unique id of this instance (see CloudInstanceId).
    cloud_instance_id: CloudInstanceId
    # Ray node type this instance was launched as.
    node_type: NodeType
    # Whether this instance is the head node or a worker node.
    node_kind: NodeKind
    # True once the cloud provider reports the instance as running.
    is_running: bool
    # Id of the launch request that created this instance. May be None when
    # the cloud provider cannot attach per-instance metadata (e.g. it does
    # not support extra per-instance tags); the reconciler is expected to
    # handle instances without request ids.
    # TODO: make this a required field.
    request_id: Optional[str] = None
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class CloudInstanceProviderError(Exception):
    """Base class for errors raised by a cloud instance provider."""

    # When the error occurred, in nanoseconds since the epoch.
    timestamp_ns: int

    def __init__(self, msg, timestamp_ns) -> None:
        super().__init__(msg)
        self.timestamp_ns = timestamp_ns
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class LaunchNodeError(CloudInstanceProviderError):
    """Raised when launching a batch of cloud instances fails."""

    # Node type of the failed launch.
    node_type: NodeType
    # How many nodes failed to launch.
    count: int
    # Id of the update request the failure originated from.
    request_id: str

    def __init__(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        super().__init__(
            f"Failed to launch {count} nodes of type {node_type} with "
            f"request id {request_id}: {details}",
            timestamp_ns=timestamp_ns,
        )
        self.node_type = node_type
        self.count = count
        self.request_id = request_id
        if cause:
            # Chain the underlying provider exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        return (
            f"LaunchNodeError(node_type={self.node_type}, count={self.count}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class TerminateNodeError(CloudInstanceProviderError):
    """Raised when terminating a cloud instance fails."""

    # Cloud instance id of the node that failed to terminate.
    cloud_instance_id: CloudInstanceId
    # Id of the update request the failure originated from.
    request_id: str

    def __init__(
        self,
        cloud_instance_id: CloudInstanceId,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        super().__init__(
            f"Failed to terminate node {cloud_instance_id} with "
            f"request id {request_id}: {details}",
            timestamp_ns=timestamp_ns,
        )
        self.cloud_instance_id = cloud_instance_id
        self.request_id = request_id
        if cause:
            # Chain the underlying provider exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        return (
            f"TerminateNodeError(cloud_instance_id={self.cloud_instance_id}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class ICloudInstanceProvider(ABC):
    """Minimal interface a cloud instance provider must implement.

    Concrete providers (e.g. AWS) implement this interface to manage the
    cloud instances backing a cluster. It exposes four capabilities:

    - Launch new cloud instances.
    - Terminate existing running instances.
    - List the non-terminated cloud instances in the cluster.
    - Report errors that happened for previously submitted updates.

    Implementations are assumed to have these properties:

    1. Eventual consistency: a launch or terminate request may not be
       reflected in the provider's state immediately, but must be
       eventually.
    2. Asynchrony: update calls may return before the outcome of the
       request is known.
    3. Unique cloud instance ids: ids are unique across the cluster.
    4. Idempotent updates: the update APIs (launch, terminate) may use the
       request id to deduplicate repeated requests.

    Usage:
    ```
    provider: ICloudInstanceProvider = ...

    # Update the cluster with a desired shape.
    provider.launch(
        shape={
            "worker_nodes": 10,
            "ray_head": 1,
        },
        request_id="1",
    )

    # Get the non-terminated nodes of the cloud instance provider.
    running = provider.get_non_terminated()

    # Poll the errors
    errors = provider.poll_errors()

    # Terminate nodes.
    provider.terminate(
        ids=["cloud_instance_id_1", "cloud_instance_id_2"],
        request_id="2",
    )

    # Process the state of the provider.
    ...
    ```
    """

    @abstractmethod
    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Return all non-terminated cloud instances.

        Returns:
            A mapping from cloud instance id to the corresponding cloud
            instance.
        """
        ...

    @abstractmethod
    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """Asynchronously terminate the given cloud instances.

        Idempotent: re-issuing the same request id for the same instances
        is a no-op if they are already terminated or being terminated.

        Args:
            ids: The cloud instance ids to terminate.
            request_id: A unique id identifying this request.
        """
        ...

    @abstractmethod
    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """Asynchronously launch cloud instances.

        Args:
            shape: A map from node type to the number of nodes to launch.
            request_id: A unique id identifying this update request.
        """
        ...

    @abstractmethod
    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """Return and clear the errors accumulated since the last poll.

        Returns:
            The errors that happened since the last poll.
        """
        ...
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
@dataclass(frozen=True)
class CloudInstanceLaunchRequest:
    """Arguments for launching a batch of nodes."""

    # Node type to launch.
    node_type: NodeType
    # How many nodes to launch.
    count: int
    # Unique id identifying the request.
    request_id: str
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
@dataclass(frozen=True)
class CloudInstanceTerminateRequest:
    """Arguments for terminating a node."""

    # Cloud instance id of the node to terminate.
    cloud_instance_id: CloudInstanceId
    # Unique id identifying the request.
    request_id: str
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
class NodeProviderAdapter(ICloudInstanceProvider):
    """
    Wraps a NodeProviderV1 as an ICloudInstanceProvider.

    TODO(rickyx):
    The current adapter right now consists of two sets of APIs:
    - v1: the old APIs that are used by the autoscaler, where
        we forward the calls to the NodeProviderV1.
    - v2: the new APIs that are used by the autoscaler v2, this is
        defined in the ICloudInstanceProvider interface.

    We should eventually remove the v1 APIs and only use the v2 APIs.
    It's currently left as a TODO since changing the v1 APIs would
    require a lot of changes in the cluster launcher codebase.
    """

    def __init__(
        self,
        v1_provider: NodeProviderV1,
        config_reader: IConfigReader,
        max_launch_batch_per_type: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
    ) -> None:
        """
        Args:
            v1_provider: The v1 node provider to wrap.
            config_reader: The config reader to read the autoscaling config.
            max_launch_batch_per_type: The maximum number of nodes to launch per
                node type in a single batch.
            max_concurrent_launches: The maximum number of concurrent launches.
        """

        super().__init__()
        self._v1_provider = v1_provider
        self._config_reader = config_reader
        # Single-threaded executor so launch/terminate requests run
        # asynchronously but are processed in submission order.
        self._main_executor = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="ray::NodeProviderAdapter"
        )

        # v1 legacy rate limiting on the node provider launch calls:
        # cap concurrency by limiting both the batch size and the number of
        # launcher threads.
        self._max_launch_batch_per_type = max_launch_batch_per_type
        max_batches = math.ceil(
            max_concurrent_launches / float(max_launch_batch_per_type)
        )
        self._node_launcher_executors = ThreadPoolExecutor(
            max_workers=max_batches,
            thread_name_prefix="ray::NodeLauncherPool",
        )

        # Thread-safe queue buffering errors raised in the executor threads
        # until the next poll_errors() call.
        self._errors_queue = Queue()

    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Return all managed, non-terminated cloud instances.

        Nodes without a known ray node-kind tag (unmanaged nodes) are
        filtered out.

        Raises:
            ValueError: If a node carries an unrecognized node-kind tag.
        """
        nodes = {}

        cloud_instance_ids = self._v1_non_terminated_nodes({})
        # Filter out nodes that are not running.
        # This is efficient since the provider is expected to cache the
        # running status of the nodes.
        for cloud_instance_id in cloud_instance_ids:
            node_tags = self._v1_node_tags(cloud_instance_id)
            node_kind_tag = node_tags.get(TAG_RAY_NODE_KIND, NODE_KIND_UNMANAGED)
            if node_kind_tag == NODE_KIND_UNMANAGED:
                # Filter out unmanaged nodes.
                continue
            elif node_kind_tag == NODE_KIND_WORKER:
                node_kind = NodeKind.WORKER
            elif node_kind_tag == NODE_KIND_HEAD:
                node_kind = NodeKind.HEAD
            else:
                raise ValueError(f"Invalid node kind: {node_kind_tag}")

            nodes[cloud_instance_id] = CloudInstance(
                cloud_instance_id=cloud_instance_id,
                node_type=node_tags.get(TAG_RAY_USER_NODE_TYPE, ""),
                is_running=self._v1_is_running(cloud_instance_id),
                request_id=node_tags.get(TAG_RAY_LAUNCH_REQUEST, ""),
                node_kind=node_kind,
            )

        return nodes

    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """Drain and return errors accumulated since the last poll."""
        errors = []
        while not self._errors_queue.empty():
            errors.append(self._errors_queue.get_nowait())
        return errors

    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """Asynchronously launch nodes; see ICloudInstanceProvider.launch."""
        self._main_executor.submit(self._do_launch, shape, request_id)

    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """Asynchronously terminate nodes; see ICloudInstanceProvider.terminate."""
        self._main_executor.submit(self._do_terminate, ids, request_id)

    ###########################################
    # Private APIs
    ###########################################

    def _do_launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """
        Launch the cloud instances by calling into the v1 base node provider.

        Args:
            shape: The requested to launch node type and number of nodes.
            request_id: The request id that identifies the request.
        """
        for node_type, count in shape.items():
            # Keep submitting the launch requests to the launch pool in batches
            # of at most _max_launch_batch_per_type nodes each.
            while count > 0:
                to_launch = min(count, self._max_launch_batch_per_type)
                self._node_launcher_executors.submit(
                    self._launch_nodes_by_type,
                    node_type,
                    to_launch,
                    request_id,
                )
                count -= to_launch

    def _do_terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """
        Terminate the cloud instances by calling into the v1 base node provider.

        If errors happen during the termination, the errors will be put into the
        errors queue.

        Args:
            ids: The cloud instance ids to terminate.
            request_id: The request id that identifies the request.
        """

        try:
            self._v1_terminate_nodes(ids)
        except Exception as e:
            # Surface one error per instance so each failed termination can be
            # tracked individually; `cause=e` chains the provider exception.
            for cloud_instance_id in ids:
                self._errors_queue.put(
                    TerminateNodeError(
                        cloud_instance_id,
                        request_id,
                        time.time_ns(),
                        cause=e,
                    )
                )

    def _launch_nodes_by_type(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
    ) -> None:
        """
        Launch nodes of the given node type.

        Any failure (including an invalid node type) is converted into a
        LaunchNodeError and pushed onto the errors queue rather than raised.

        Args:
            node_type: The node type to launch.
            count: Number of nodes to launch.
            request_id: A unique id that identifies the request.
        """
        try:
            # Resolve the per-node-type launch configuration; raises if the
            # node type is invalid.
            config = self._config_reader.get_cached_autoscaling_config()
            launch_config = config.get_cloud_node_config(node_type)
            resources = config.get_node_resources(node_type)
            labels = config.get_node_labels(node_type)

            # This is to be compatible with the v1 node launcher.
            # See more in https://github.com/ray-project/ray/blob/6f5a189bc463e52c51a70f8aea41fb2950b443e8/python/ray/autoscaler/_private/node_launcher.py#L78-L85 # noqa
            # TODO: this should be synced with what's stored in the IM, it should
            # probably be made as a metadata field in the cloud instance. This is
            # another incompatibility with KubeRay.
            launch_hash = hash_launch_conf(launch_config, config.get_config("auth", {}))
            node_tags = {
                TAG_RAY_NODE_NAME: "ray-{}-worker".format(
                    config.get_config("cluster_name", "")
                ),
                TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
                TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
                TAG_RAY_LAUNCH_CONFIG: launch_hash,
                TAG_RAY_LAUNCH_REQUEST: request_id,
                TAG_RAY_USER_NODE_TYPE: node_type,
            }

            logger.info("Launching %s nodes of type %s.", count, node_type)
            self._v1_provider.create_node_with_resources_and_labels(
                launch_config, node_tags, count, resources, labels
            )
            logger.info("Launched %s nodes of type %s.", count, node_type)
        except Exception as e:
            # `cause=e` chains the underlying exception on the error.
            self._errors_queue.put(
                LaunchNodeError(node_type, count, request_id, time.time_ns(), cause=e)
            )

    ###########################################
    # V1 Legacy APIs
    ###########################################
    # Below are the necessary legacy APIs from the V1 node provider.
    # These are needed as of now to provide the needed features
    # for V2 node provider.
    # The goal is to eventually remove these APIs and only use the
    # V2 APIs by modifying the individual node provider to inherit
    # from ICloudInstanceProvider.

    def _v1_terminate_nodes(
        self, ids: List[CloudInstanceId]
    ) -> Optional[Dict[str, Any]]:
        # Forward to the wrapped v1 provider.
        return self._v1_provider.terminate_nodes(ids)

    def _v1_non_terminated_nodes(
        self, tag_filters: Dict[str, str]
    ) -> List[CloudInstanceId]:
        # Forward to the wrapped v1 provider.
        return self._v1_provider.non_terminated_nodes(tag_filters)

    def _v1_is_running(self, node_id: CloudInstanceId) -> bool:
        # Forward to the wrapped v1 provider.
        return self._v1_provider.is_running(node_id)

    def _v1_post_process(self) -> None:
        # Forward to the wrapped v1 provider.
        self._v1_provider.post_process()

    def _v1_node_tags(self, node_id: CloudInstanceId) -> Dict[str, str]:
        # Forward to the wrapped v1 provider.
        return self._v1_provider.node_tags(node_id)

    def _v1_safe_to_scale(self) -> bool:
        # Forward to the wrapped v1 provider.
        return self._v1_provider.safe_to_scale()
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
import logging
|
| 3 |
+
import subprocess
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler._private.updater import NodeUpdater
|
| 6 |
+
from ray.autoscaler._private.util import with_envs, with_head_node_ip
|
| 7 |
+
from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
|
| 8 |
+
from ray.autoscaler.v2.instance_manager.config import AutoscalingConfig
|
| 9 |
+
from ray.core.generated.instance_manager_pb2 import Instance
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclasses.dataclass(frozen=True)
class RayInstallError:
    """Describes a failed ray installation on an instance."""

    # Instance manager's instance id.
    im_instance_id: str
    # Human-readable error details.
    details: str
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class RayInstaller(object):
    """
    RayInstaller is responsible for installing ray on the target instance.

    It drives the v1 NodeUpdater to run the configured initialization,
    setup, and `ray start` commands on the instance over the provider's
    command runner.
    """

    def __init__(
        self,
        provider: NodeProviderV1,
        config: AutoscalingConfig,
        process_runner=subprocess,
    ) -> None:
        # v1 node provider used by NodeUpdater to reach the instance.
        self._provider = provider
        # Autoscaling config supplying commands, auth, mounts, etc.
        self._config = config
        # Injectable process runner (subprocess-compatible) for testing.
        self._process_runner = process_runner

    def install_ray(self, instance: Instance, head_node_ip: str) -> bool:
        """
        Install ray on the target instance synchronously.

        Returns True on success, False if the updater failed (the updater
        handles its own error reporting).

        TODO:(rickyx): This runs in another thread, and errors are silently
        ignored. We should propagate the error to the main thread.
        """
        # Resolve the per-node-type commands and docker settings from config.
        setup_commands = self._config.get_worker_setup_commands(instance.instance_type)
        ray_start_commands = self._config.get_worker_start_ray_commands()
        docker_config = self._config.get_docker_config(instance.instance_type)

        logger.info(
            f"Creating new (spawn_updater) updater thread for node"
            f" {instance.cloud_instance_id}."
        )
        provider_instance_type_name = self._config.get_provider_instance_type(
            instance.instance_type
        )
        # NodeUpdater performs the actual ssh/docker work: file mounts,
        # initialization, setup, and finally the ray start commands.
        updater = NodeUpdater(
            node_id=instance.instance_id,
            provider_config=self._config.get_config("provider"),
            provider=self._provider,
            auth_config=self._config.get_config("auth"),
            cluster_name=self._config.get_config("cluster_name"),
            file_mounts=self._config.get_config("file_mounts"),
            # Initialization/setup commands get the head node ip substituted in.
            initialization_commands=with_head_node_ip(
                self._config.get_initialization_commands(instance.instance_type),
                head_node_ip,
            ),
            setup_commands=with_head_node_ip(setup_commands, head_node_ip),
            # This will prepend envs to the begin of the ray start commands, e.g.
            # `RAY_HEAD_IP=<head_node_ip> \
            # RAY_CLOUD_INSTANCE_ID=<instance_id> \
            # ray start --head ...`
            # See src/ray/common/constants.h for ENV name definitions.
            ray_start_commands=with_envs(
                ray_start_commands,
                {
                    "RAY_HEAD_IP": head_node_ip,
                    "RAY_CLOUD_INSTANCE_ID": instance.instance_id,
                    "RAY_NODE_TYPE_NAME": instance.instance_type,
                    "RAY_CLOUD_INSTANCE_TYPE_NAME": provider_instance_type_name,
                },
            ),
            runtime_hash=self._config.runtime_hash,
            file_mounts_contents_hash=self._config.file_mounts_contents_hash,
            # This installer only handles worker nodes.
            is_head_node=False,
            cluster_synced_files=self._config.get_config("cluster_synced_files"),
            rsync_options={
                "rsync_exclude": self._config.get_config("rsync_exclude"),
                "rsync_filter": self._config.get_config("rsync_filter"),
            },
            use_internal_ip=True,
            docker_config=docker_config,
            node_resources=self._config.get_node_resources(instance.instance_type),
            node_labels=self._config.get_node_labels(instance.instance_type),
            process_runner=self._process_runner,
        )
        try:
            updater.run()
        except Exception:
            # Errors have already been handled/reported by NodeUpdater.run();
            # just signal failure to the caller.
            return False
        return True
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py
ADDED
|
@@ -0,0 +1,1565 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import math
|
| 3 |
+
import time
|
| 4 |
+
import uuid
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
from typing import Dict, List, Optional, Set, Tuple
|
| 7 |
+
|
| 8 |
+
from ray._private.utils import binary_to_hex
|
| 9 |
+
from ray.autoscaler.v2.instance_manager.common import InstanceUtil
|
| 10 |
+
from ray.autoscaler.v2.instance_manager.config import (
|
| 11 |
+
AutoscalingConfig,
|
| 12 |
+
InstanceReconcileConfig,
|
| 13 |
+
Provider,
|
| 14 |
+
)
|
| 15 |
+
from ray.autoscaler.v2.instance_manager.instance_manager import InstanceManager
|
| 16 |
+
from ray.autoscaler.v2.instance_manager.node_provider import (
|
| 17 |
+
CloudInstance,
|
| 18 |
+
CloudInstanceId,
|
| 19 |
+
CloudInstanceProviderError,
|
| 20 |
+
ICloudInstanceProvider,
|
| 21 |
+
LaunchNodeError,
|
| 22 |
+
TerminateNodeError,
|
| 23 |
+
)
|
| 24 |
+
from ray.autoscaler.v2.instance_manager.ray_installer import RayInstallError
|
| 25 |
+
from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError
|
| 26 |
+
from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
|
| 27 |
+
from ray.autoscaler.v2.scheduler import IResourceScheduler, SchedulingRequest
|
| 28 |
+
from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
|
| 29 |
+
from ray.autoscaler.v2.sdk import is_head_node
|
| 30 |
+
from ray.core.generated.autoscaler_pb2 import (
|
| 31 |
+
AutoscalingState,
|
| 32 |
+
ClusterResourceState,
|
| 33 |
+
FailedInstanceRequest,
|
| 34 |
+
NodeState,
|
| 35 |
+
NodeStatus,
|
| 36 |
+
PendingInstance,
|
| 37 |
+
PendingInstanceRequest,
|
| 38 |
+
)
|
| 39 |
+
from ray.core.generated.instance_manager_pb2 import GetInstanceManagerStateRequest
|
| 40 |
+
from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
|
| 41 |
+
from ray.core.generated.instance_manager_pb2 import (
|
| 42 |
+
InstanceUpdateEvent as IMInstanceUpdateEvent,
|
| 43 |
+
)
|
| 44 |
+
from ray.core.generated.instance_manager_pb2 import (
|
| 45 |
+
NodeKind,
|
| 46 |
+
StatusCode,
|
| 47 |
+
UpdateInstanceManagerStateRequest,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
logger = logging.getLogger(__name__)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Reconciler:
|
| 54 |
+
"""
|
| 55 |
+
A singleton class that reconciles the instance states of the instance manager
|
| 56 |
+
for autoscaler.
|
| 57 |
+
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
@staticmethod
|
| 61 |
+
def reconcile(
|
| 62 |
+
instance_manager: InstanceManager,
|
| 63 |
+
scheduler: IResourceScheduler,
|
| 64 |
+
cloud_provider: ICloudInstanceProvider,
|
| 65 |
+
ray_cluster_resource_state: ClusterResourceState,
|
| 66 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 67 |
+
autoscaling_config: AutoscalingConfig,
|
| 68 |
+
cloud_provider_errors: Optional[List[CloudInstanceProviderError]] = None,
|
| 69 |
+
ray_install_errors: Optional[List[RayInstallError]] = None,
|
| 70 |
+
ray_stop_errors: Optional[List[RayStopError]] = None,
|
| 71 |
+
metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
|
| 72 |
+
_logger: Optional[logging.Logger] = None,
|
| 73 |
+
) -> AutoscalingState:
|
| 74 |
+
"""
|
| 75 |
+
The reconcile method computes InstanceUpdateEvents for the instance manager
|
| 76 |
+
by:
|
| 77 |
+
|
| 78 |
+
1. Reconciling the instance manager's instances with external states like
|
| 79 |
+
the cloud provider's, the ray cluster's states, the ray installer's results.
|
| 80 |
+
It performs "passive" status transitions for the instances (where the status
|
| 81 |
+
transition should only be reflecting the external states of the cloud provider
|
| 82 |
+
and the ray cluster, and should not be actively changing them)
|
| 83 |
+
|
| 84 |
+
2. Stepping the instances to the active states by computing instance status
|
| 85 |
+
transitions that are needed and updating the instance manager's state.
|
| 86 |
+
These transitions should be "active" where the transitions have side effects
|
| 87 |
+
(through InstanceStatusSubscriber) to the cloud provider and the ray cluster.
|
| 88 |
+
|
| 89 |
+
Args:
|
| 90 |
+
instance_manager: The instance manager to reconcile.
|
| 91 |
+
ray_cluster_resource_state: The ray cluster's resource state.
|
| 92 |
+
non_terminated_cloud_instances: The non-terminated cloud instances from
|
| 93 |
+
the cloud provider.
|
| 94 |
+
cloud_provider_errors: The errors from the cloud provider.
|
| 95 |
+
ray_install_errors: The errors from RayInstaller.
|
| 96 |
+
ray_stop_errors: The errors from RayStopper.
|
| 97 |
+
metrics_reporter: The metric reporter to report the autoscaler metrics.
|
| 98 |
+
_logger: The logger (for testing).
|
| 99 |
+
|
| 100 |
+
"""
|
| 101 |
+
cloud_provider_errors = cloud_provider_errors or []
|
| 102 |
+
ray_install_errors = ray_install_errors or []
|
| 103 |
+
ray_stop_errors = ray_stop_errors or []
|
| 104 |
+
|
| 105 |
+
autoscaling_state = AutoscalingState()
|
| 106 |
+
autoscaling_state.last_seen_cluster_resource_state_version = (
|
| 107 |
+
ray_cluster_resource_state.cluster_resource_state_version
|
| 108 |
+
)
|
| 109 |
+
Reconciler._sync_from(
|
| 110 |
+
instance_manager=instance_manager,
|
| 111 |
+
ray_nodes=ray_cluster_resource_state.node_states,
|
| 112 |
+
non_terminated_cloud_instances=non_terminated_cloud_instances,
|
| 113 |
+
cloud_provider_errors=cloud_provider_errors,
|
| 114 |
+
ray_install_errors=ray_install_errors,
|
| 115 |
+
ray_stop_errors=ray_stop_errors,
|
| 116 |
+
autoscaling_config=autoscaling_config,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
Reconciler._step_next(
|
| 120 |
+
autoscaling_state=autoscaling_state,
|
| 121 |
+
instance_manager=instance_manager,
|
| 122 |
+
scheduler=scheduler,
|
| 123 |
+
cloud_provider=cloud_provider,
|
| 124 |
+
ray_cluster_resource_state=ray_cluster_resource_state,
|
| 125 |
+
non_terminated_cloud_instances=non_terminated_cloud_instances,
|
| 126 |
+
autoscaling_config=autoscaling_config,
|
| 127 |
+
_logger=_logger,
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
Reconciler._report_metrics(
|
| 131 |
+
instance_manager=instance_manager,
|
| 132 |
+
autoscaling_config=autoscaling_config,
|
| 133 |
+
metrics_reporter=metrics_reporter,
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
return autoscaling_state
|
| 137 |
+
|
| 138 |
+
@staticmethod
|
| 139 |
+
def _sync_from(
|
| 140 |
+
instance_manager: InstanceManager,
|
| 141 |
+
ray_nodes: List[NodeState],
|
| 142 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 143 |
+
cloud_provider_errors: List[CloudInstanceProviderError],
|
| 144 |
+
ray_install_errors: List[RayInstallError],
|
| 145 |
+
ray_stop_errors: List[RayStopError],
|
| 146 |
+
autoscaling_config: AutoscalingConfig,
|
| 147 |
+
):
|
| 148 |
+
"""
|
| 149 |
+
Reconcile the instance states of the instance manager from external states like
|
| 150 |
+
the cloud provider's, the ray cluster's states, the ray installer's results,
|
| 151 |
+
etc.
|
| 152 |
+
|
| 153 |
+
For each instance, we try to figure out if we need to transition the instance
|
| 154 |
+
status to a new status, and if so, what the new status should be.
|
| 155 |
+
|
| 156 |
+
These transitions should be purely "passive", meaning they should only be
|
| 157 |
+
reflecting the external states of the cloud provider and the ray cluster,
|
| 158 |
+
and should not be actively changing the states of the cloud provider or the ray
|
| 159 |
+
cluster.
|
| 160 |
+
|
| 161 |
+
More specifically, we will reconcile status transitions for:
|
| 162 |
+
1. QUEUED/REQUESTED -> ALLOCATED:
|
| 163 |
+
When a instance with launch request id (indicating a previous launch
|
| 164 |
+
request was made) could be assigned to an unassigned cloud instance
|
| 165 |
+
of the same instance type.
|
| 166 |
+
2. REQUESTED -> ALLOCATION_FAILED:
|
| 167 |
+
When there's an error from the cloud provider for launch failure so
|
| 168 |
+
that the instance becomes ALLOCATION_FAILED.
|
| 169 |
+
3. * -> RAY_RUNNING:
|
| 170 |
+
When a ray node on a cloud instance joins the ray cluster, we will
|
| 171 |
+
transition the instance to RAY_RUNNING.
|
| 172 |
+
4. * -> TERMINATED:
|
| 173 |
+
When the cloud instance is already terminated, we will transition the
|
| 174 |
+
instance to TERMINATED.
|
| 175 |
+
5. TERMINATING -> TERMINATION_FAILED:
|
| 176 |
+
When there's an error from the cloud provider for termination failure.
|
| 177 |
+
6. * -> RAY_STOPPED:
|
| 178 |
+
When ray was stopped on the cloud instance, we will transition the
|
| 179 |
+
instance to RAY_STOPPED.
|
| 180 |
+
7. * -> RAY_INSTALL_FAILED:
|
| 181 |
+
When there's an error from RayInstaller.
|
| 182 |
+
8. RAY_STOP_REQUESTED -> RAY_RUNNING:
|
| 183 |
+
When requested to stop ray, but failed to stop/drain the ray node
|
| 184 |
+
(e.g. idle termination drain rejected by the node).
|
| 185 |
+
|
| 186 |
+
Args:
|
| 187 |
+
instance_manager: The instance manager to reconcile.
|
| 188 |
+
ray_nodes: The ray cluster's states of ray nodes.
|
| 189 |
+
non_terminated_cloud_instances: The non-terminated cloud instances from
|
| 190 |
+
the cloud provider.
|
| 191 |
+
cloud_provider_errors: The errors from the cloud provider.
|
| 192 |
+
ray_install_errors: The errors from RayInstaller.
|
| 193 |
+
ray_stop_errors: The errors from RayStopper.
|
| 194 |
+
|
| 195 |
+
"""
|
| 196 |
+
|
| 197 |
+
# Handle 1 & 2 for cloud instance allocation.
|
| 198 |
+
Reconciler._handle_cloud_instance_allocation(
|
| 199 |
+
instance_manager,
|
| 200 |
+
non_terminated_cloud_instances,
|
| 201 |
+
cloud_provider_errors,
|
| 202 |
+
)
|
| 203 |
+
Reconciler._handle_cloud_instance_terminated(
|
| 204 |
+
instance_manager, non_terminated_cloud_instances
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
Reconciler._handle_cloud_instance_termination_errors(
|
| 208 |
+
instance_manager, cloud_provider_errors
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
Reconciler._handle_extra_cloud_instances(
|
| 212 |
+
instance_manager, non_terminated_cloud_instances, ray_nodes
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
Reconciler._handle_ray_status_transition(
|
| 216 |
+
instance_manager, ray_nodes, autoscaling_config
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
Reconciler._handle_ray_install_failed(instance_manager, ray_install_errors)
|
| 220 |
+
|
| 221 |
+
Reconciler._handle_ray_stop_failed(instance_manager, ray_stop_errors, ray_nodes)
|
| 222 |
+
|
| 223 |
+
@staticmethod
|
| 224 |
+
def _step_next(
|
| 225 |
+
autoscaling_state: AutoscalingState,
|
| 226 |
+
instance_manager: InstanceManager,
|
| 227 |
+
scheduler: IResourceScheduler,
|
| 228 |
+
cloud_provider: ICloudInstanceProvider,
|
| 229 |
+
ray_cluster_resource_state: ClusterResourceState,
|
| 230 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 231 |
+
autoscaling_config: AutoscalingConfig,
|
| 232 |
+
_logger: Optional[logging.Logger] = None,
|
| 233 |
+
):
|
| 234 |
+
"""
|
| 235 |
+
Step the reconciler to the next state by computing instance status transitions
|
| 236 |
+
that are needed and updating the instance manager's state.
|
| 237 |
+
|
| 238 |
+
Specifically, we will:
|
| 239 |
+
1. Shut down leak cloud instances
|
| 240 |
+
Leaked cloud instances that are not managed by the instance manager.
|
| 241 |
+
2. Terminating instances with ray stopped or ray install failure.
|
| 242 |
+
3. Scale down the cluster:
|
| 243 |
+
(* -> RAY_STOP_REQUESTED/TERMINATING)
|
| 244 |
+
b. Extra cloud due to max nodes config.
|
| 245 |
+
c. Cloud instances with outdated configs.
|
| 246 |
+
4. Scale up the cluster:
|
| 247 |
+
(new QUEUED)
|
| 248 |
+
Create new instances based on the IResourceScheduler's decision for
|
| 249 |
+
scaling up.
|
| 250 |
+
5. Request cloud provider to launch new instances.
|
| 251 |
+
(QUEUED -> REQUESTED)
|
| 252 |
+
6. Install ray
|
| 253 |
+
(ALLOCATED -> RAY_INSTALLING)
|
| 254 |
+
When ray could be installed and launched.
|
| 255 |
+
7. Handle any stuck instances with timeouts.
|
| 256 |
+
|
| 257 |
+
Args:
|
| 258 |
+
instance_manager: The instance manager to reconcile.
|
| 259 |
+
scheduler: The resource scheduler to make scaling decisions.
|
| 260 |
+
ray_cluster_resource_state: The ray cluster's resource state.
|
| 261 |
+
non_terminated_cloud_instances: The non-terminated cloud instances from
|
| 262 |
+
the cloud provider.
|
| 263 |
+
autoscaling_config: The autoscaling config.
|
| 264 |
+
_logger: The logger (for testing).
|
| 265 |
+
|
| 266 |
+
"""
|
| 267 |
+
|
| 268 |
+
Reconciler._handle_stuck_instances(
|
| 269 |
+
instance_manager=instance_manager,
|
| 270 |
+
reconcile_config=autoscaling_config.get_instance_reconcile_config(),
|
| 271 |
+
_logger=_logger or logger,
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
Reconciler._scale_cluster(
|
| 275 |
+
autoscaling_state=autoscaling_state,
|
| 276 |
+
instance_manager=instance_manager,
|
| 277 |
+
ray_state=ray_cluster_resource_state,
|
| 278 |
+
scheduler=scheduler,
|
| 279 |
+
autoscaling_config=autoscaling_config,
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
Reconciler._handle_instances_launch(
|
| 283 |
+
instance_manager=instance_manager, autoscaling_config=autoscaling_config
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
Reconciler._terminate_instances(instance_manager=instance_manager)
|
| 287 |
+
if not autoscaling_config.disable_node_updaters():
|
| 288 |
+
Reconciler._install_ray(
|
| 289 |
+
instance_manager=instance_manager,
|
| 290 |
+
non_terminated_cloud_instances=non_terminated_cloud_instances,
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
Reconciler._fill_autoscaling_state(
|
| 294 |
+
instance_manager=instance_manager, autoscaling_state=autoscaling_state
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
#######################################################
|
| 298 |
+
# Utility methods for reconciling instance states.
|
| 299 |
+
#######################################################
|
| 300 |
+
|
| 301 |
+
@staticmethod
|
| 302 |
+
def _handle_cloud_instance_allocation(
|
| 303 |
+
instance_manager: InstanceManager,
|
| 304 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 305 |
+
cloud_provider_errors: List[CloudInstanceProviderError],
|
| 306 |
+
):
|
| 307 |
+
im_instances, version = Reconciler._get_im_instances(instance_manager)
|
| 308 |
+
updates = {}
|
| 309 |
+
|
| 310 |
+
# Compute intermediate states.
|
| 311 |
+
|
| 312 |
+
instances_with_launch_requests: List[IMInstance] = []
|
| 313 |
+
for instance in im_instances:
|
| 314 |
+
if instance.status != IMInstance.REQUESTED:
|
| 315 |
+
continue
|
| 316 |
+
|
| 317 |
+
assert (
|
| 318 |
+
instance.launch_request_id
|
| 319 |
+
), "Instance in REQUESTED status should have launch_request_id set."
|
| 320 |
+
instances_with_launch_requests.append(instance)
|
| 321 |
+
|
| 322 |
+
assigned_cloud_instance_ids: Set[CloudInstanceId] = {
|
| 323 |
+
instance.cloud_instance_id for instance in im_instances
|
| 324 |
+
}
|
| 325 |
+
launch_errors: Dict[str, LaunchNodeError] = {
|
| 326 |
+
error.request_id: error
|
| 327 |
+
for error in cloud_provider_errors
|
| 328 |
+
if isinstance(error, LaunchNodeError)
|
| 329 |
+
}
|
| 330 |
+
unassigned_cloud_instances_by_type: Dict[
|
| 331 |
+
str, List[CloudInstance]
|
| 332 |
+
] = defaultdict(list)
|
| 333 |
+
|
| 334 |
+
for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items():
|
| 335 |
+
if cloud_instance_id not in assigned_cloud_instance_ids:
|
| 336 |
+
unassigned_cloud_instances_by_type[cloud_instance.node_type].append(
|
| 337 |
+
cloud_instance
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
# Sort the request instance by the increasing request time.
|
| 341 |
+
instances_with_launch_requests.sort(
|
| 342 |
+
key=lambda instance: InstanceUtil.get_status_transition_times_ns(
|
| 343 |
+
instance, IMInstance.REQUESTED
|
| 344 |
+
)
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
# For each instance, try to allocate or fail the allocation.
|
| 348 |
+
for instance in instances_with_launch_requests:
|
| 349 |
+
# Try allocate or fail with errors.
|
| 350 |
+
update_event = Reconciler._try_resolve_pending_allocation(
|
| 351 |
+
instance, unassigned_cloud_instances_by_type, launch_errors
|
| 352 |
+
)
|
| 353 |
+
if not update_event:
|
| 354 |
+
continue
|
| 355 |
+
|
| 356 |
+
updates[instance.instance_id] = update_event
|
| 357 |
+
|
| 358 |
+
# Update the instance manager for the events.
|
| 359 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 360 |
+
|
| 361 |
+
@staticmethod
|
| 362 |
+
def _try_resolve_pending_allocation(
|
| 363 |
+
im_instance: IMInstance,
|
| 364 |
+
unassigned_cloud_instances_by_type: Dict[str, List[CloudInstance]],
|
| 365 |
+
launch_errors: Dict[str, LaunchNodeError],
|
| 366 |
+
) -> Optional[IMInstanceUpdateEvent]:
|
| 367 |
+
"""
|
| 368 |
+
Allocate, or fail the cloud instance allocation for the instance.
|
| 369 |
+
|
| 370 |
+
Args:
|
| 371 |
+
im_instance: The instance to allocate or fail.
|
| 372 |
+
unassigned_cloud_instances_by_type: The unassigned cloud instances by type.
|
| 373 |
+
launch_errors: The launch errors from the cloud provider.
|
| 374 |
+
|
| 375 |
+
Returns:
|
| 376 |
+
Instance update to ALLOCATED: if there's a matching unassigned cloud
|
| 377 |
+
instance with the same type.
|
| 378 |
+
Instance update to ALLOCATION_FAILED: if the instance allocation failed
|
| 379 |
+
with errors.
|
| 380 |
+
None: if there's no update.
|
| 381 |
+
|
| 382 |
+
"""
|
| 383 |
+
unassigned_cloud_instance = None
|
| 384 |
+
|
| 385 |
+
# Try to allocate an unassigned cloud instance.
|
| 386 |
+
# TODO(rickyx): We could also look at the launch request id
|
| 387 |
+
# on the cloud node and the im instance later once all node providers
|
| 388 |
+
# support request id. For now, we only look at the instance type.
|
| 389 |
+
if len(unassigned_cloud_instances_by_type.get(im_instance.instance_type, [])):
|
| 390 |
+
unassigned_cloud_instance = unassigned_cloud_instances_by_type[
|
| 391 |
+
im_instance.instance_type
|
| 392 |
+
].pop()
|
| 393 |
+
|
| 394 |
+
if unassigned_cloud_instance:
|
| 395 |
+
return IMInstanceUpdateEvent(
|
| 396 |
+
instance_id=im_instance.instance_id,
|
| 397 |
+
new_instance_status=IMInstance.ALLOCATED,
|
| 398 |
+
cloud_instance_id=unassigned_cloud_instance.cloud_instance_id,
|
| 399 |
+
node_kind=unassigned_cloud_instance.node_kind,
|
| 400 |
+
instance_type=unassigned_cloud_instance.node_type,
|
| 401 |
+
details=(
|
| 402 |
+
"allocated unassigned cloud instance "
|
| 403 |
+
f"{unassigned_cloud_instance.cloud_instance_id}"
|
| 404 |
+
),
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
# If there's a launch error, transition to ALLOCATION_FAILED.
|
| 408 |
+
launch_error = launch_errors.get(im_instance.launch_request_id)
|
| 409 |
+
if launch_error and launch_error.node_type == im_instance.instance_type:
|
| 410 |
+
return IMInstanceUpdateEvent(
|
| 411 |
+
instance_id=im_instance.instance_id,
|
| 412 |
+
new_instance_status=IMInstance.ALLOCATION_FAILED,
|
| 413 |
+
details=f"launch failed with {str(launch_error)}",
|
| 414 |
+
)
|
| 415 |
+
# No update.
|
| 416 |
+
return None
|
| 417 |
+
|
| 418 |
+
@staticmethod
|
| 419 |
+
def _handle_ray_stop_failed(
|
| 420 |
+
instance_manager: InstanceManager,
|
| 421 |
+
ray_stop_errors: List[RayStopError],
|
| 422 |
+
ray_nodes: List[NodeState],
|
| 423 |
+
):
|
| 424 |
+
"""
|
| 425 |
+
The instance requested to stop ray, but failed to stop/drain the ray node.
|
| 426 |
+
E.g. connection errors, idle termination drain rejected by the node.
|
| 427 |
+
|
| 428 |
+
We will transition the instance back to RAY_RUNNING.
|
| 429 |
+
|
| 430 |
+
Args:
|
| 431 |
+
instance_manager: The instance manager to reconcile.
|
| 432 |
+
ray_stop_errors: The errors from RayStopper.
|
| 433 |
+
|
| 434 |
+
"""
|
| 435 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 436 |
+
updates = {}
|
| 437 |
+
|
| 438 |
+
ray_stop_errors_by_instance_id = {
|
| 439 |
+
error.im_instance_id: error for error in ray_stop_errors
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
ray_nodes_by_ray_node_id = {binary_to_hex(n.node_id): n for n in ray_nodes}
|
| 443 |
+
|
| 444 |
+
ray_stop_requested_instances = {
|
| 445 |
+
instance.instance_id: instance
|
| 446 |
+
for instance in instances
|
| 447 |
+
if instance.status == IMInstance.RAY_STOP_REQUESTED
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
for instance_id, instance in ray_stop_requested_instances.items():
|
| 451 |
+
stop_error = ray_stop_errors_by_instance_id.get(instance_id)
|
| 452 |
+
if not stop_error:
|
| 453 |
+
continue
|
| 454 |
+
|
| 455 |
+
assert instance.node_id
|
| 456 |
+
ray_node = ray_nodes_by_ray_node_id.get(instance.node_id)
|
| 457 |
+
assert ray_node is not None and ray_node.status in [
|
| 458 |
+
NodeStatus.RUNNING,
|
| 459 |
+
NodeStatus.IDLE,
|
| 460 |
+
], (
|
| 461 |
+
"There should be a running ray node for instance with ray stop "
|
| 462 |
+
"requested failed."
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
updates[instance_id] = IMInstanceUpdateEvent(
|
| 466 |
+
instance_id=instance_id,
|
| 467 |
+
new_instance_status=IMInstance.RAY_RUNNING,
|
| 468 |
+
details="failed to stop/drain ray",
|
| 469 |
+
ray_node_id=instance.node_id,
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 473 |
+
|
| 474 |
+
@staticmethod
|
| 475 |
+
def _handle_ray_install_failed(
|
| 476 |
+
instance_manager: InstanceManager, ray_install_errors: List[RayInstallError]
|
| 477 |
+
):
|
| 478 |
+
|
| 479 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 480 |
+
updates = {}
|
| 481 |
+
|
| 482 |
+
# Get all instances with RAY_INSTALLING status.
|
| 483 |
+
instances_with_ray_installing = {
|
| 484 |
+
instance.instance_id: instance
|
| 485 |
+
for instance in instances
|
| 486 |
+
if instance.status == IMInstance.RAY_INSTALLING
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
install_errors = {error.im_instance_id: error for error in ray_install_errors}
|
| 490 |
+
|
| 491 |
+
# For each instance with RAY_INSTALLING status, check if there's any
|
| 492 |
+
# install error.
|
| 493 |
+
for instance_id, instance in instances_with_ray_installing.items():
|
| 494 |
+
install_error = install_errors.get(instance_id)
|
| 495 |
+
if install_error:
|
| 496 |
+
updates[instance_id] = IMInstanceUpdateEvent(
|
| 497 |
+
instance_id=instance_id,
|
| 498 |
+
new_instance_status=IMInstance.RAY_INSTALL_FAILED,
|
| 499 |
+
details=(
|
| 500 |
+
f"failed to install ray with errors: {install_error.details}"
|
| 501 |
+
),
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
# Update the instance manager for the events.
|
| 505 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 506 |
+
|
| 507 |
+
@staticmethod
|
| 508 |
+
def _handle_cloud_instance_terminated(
|
| 509 |
+
instance_manager: InstanceManager,
|
| 510 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 511 |
+
):
|
| 512 |
+
"""
|
| 513 |
+
For any IM (instance manager) instance with a cloud node id, if the mapped
|
| 514 |
+
cloud instance is no longer running, transition the instance to TERMINATED.
|
| 515 |
+
|
| 516 |
+
Args:
|
| 517 |
+
instance_manager: The instance manager to reconcile.
|
| 518 |
+
non_terminated_cloud_instances: The non-terminated cloud instances from
|
| 519 |
+
the cloud provider.
|
| 520 |
+
"""
|
| 521 |
+
updates = {}
|
| 522 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 523 |
+
|
| 524 |
+
non_terminated_instances_with_cloud_instance_assigned = {
|
| 525 |
+
instance.cloud_instance_id: instance
|
| 526 |
+
for instance in instances
|
| 527 |
+
if instance.cloud_instance_id and instance.status != IMInstance.TERMINATED
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
for (
|
| 531 |
+
cloud_instance_id,
|
| 532 |
+
instance,
|
| 533 |
+
) in non_terminated_instances_with_cloud_instance_assigned.items():
|
| 534 |
+
if cloud_instance_id in non_terminated_cloud_instances.keys():
|
| 535 |
+
# The cloud instance is still running.
|
| 536 |
+
continue
|
| 537 |
+
|
| 538 |
+
# The cloud instance is terminated.
|
| 539 |
+
updates[instance.instance_id] = IMInstanceUpdateEvent(
|
| 540 |
+
instance_id=instance.instance_id,
|
| 541 |
+
new_instance_status=IMInstance.TERMINATED,
|
| 542 |
+
details=f"cloud instance {cloud_instance_id} no longer found",
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 546 |
+
|
| 547 |
+
@staticmethod
|
| 548 |
+
def _handle_cloud_instance_termination_errors(
|
| 549 |
+
instance_manager: InstanceManager,
|
| 550 |
+
cloud_provider_errors: List[CloudInstanceProviderError],
|
| 551 |
+
):
|
| 552 |
+
"""
|
| 553 |
+
If any TERMINATING instances have termination errors, transition the instance to
|
| 554 |
+
TERMINATION_FAILED.
|
| 555 |
+
|
| 556 |
+
We will retry the termination for the TERMINATION_FAILED instances in the next
|
| 557 |
+
reconciler step.
|
| 558 |
+
|
| 559 |
+
Args:
|
| 560 |
+
instance_manager: The instance manager to reconcile.
|
| 561 |
+
cloud_provider_errors: The errors from the cloud provider.
|
| 562 |
+
|
| 563 |
+
"""
|
| 564 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 565 |
+
updates = {}
|
| 566 |
+
|
| 567 |
+
termination_errors = {
|
| 568 |
+
error.cloud_instance_id: error
|
| 569 |
+
for error in cloud_provider_errors
|
| 570 |
+
if isinstance(error, TerminateNodeError)
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
terminating_instances_by_cloud_instance_id = {
|
| 574 |
+
instance.cloud_instance_id: instance
|
| 575 |
+
for instance in instances
|
| 576 |
+
if instance.status == IMInstance.TERMINATING
|
| 577 |
+
}
|
| 578 |
+
|
| 579 |
+
for cloud_instance_id, failure in termination_errors.items():
|
| 580 |
+
instance = terminating_instances_by_cloud_instance_id.get(cloud_instance_id)
|
| 581 |
+
if not instance:
|
| 582 |
+
# The instance is no longer in TERMINATING status.
|
| 583 |
+
continue
|
| 584 |
+
|
| 585 |
+
updates[instance.instance_id] = IMInstanceUpdateEvent(
|
| 586 |
+
instance_id=instance.instance_id,
|
| 587 |
+
new_instance_status=IMInstance.TERMINATION_FAILED,
|
| 588 |
+
details=f"termination failed: {str(failure)}",
|
| 589 |
+
)
|
| 590 |
+
|
| 591 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 592 |
+
|
| 593 |
+
@staticmethod
|
| 594 |
+
def _get_im_instances(
|
| 595 |
+
instance_manager: InstanceManager,
|
| 596 |
+
) -> Tuple[List[IMInstance], int]:
|
| 597 |
+
reply = instance_manager.get_instance_manager_state(
|
| 598 |
+
request=GetInstanceManagerStateRequest()
|
| 599 |
+
)
|
| 600 |
+
assert reply.status.code == StatusCode.OK
|
| 601 |
+
im_state = reply.state
|
| 602 |
+
return im_state.instances, im_state.version
|
| 603 |
+
|
| 604 |
+
@staticmethod
|
| 605 |
+
def _update_instance_manager(
|
| 606 |
+
instance_manager: InstanceManager,
|
| 607 |
+
version: int,
|
| 608 |
+
updates: Dict[str, IMInstanceUpdateEvent],
|
| 609 |
+
) -> None:
|
| 610 |
+
if not updates:
|
| 611 |
+
return
|
| 612 |
+
|
| 613 |
+
updates = list(updates.values()) or []
|
| 614 |
+
|
| 615 |
+
reply = instance_manager.update_instance_manager_state(
|
| 616 |
+
request=UpdateInstanceManagerStateRequest(
|
| 617 |
+
expected_version=version,
|
| 618 |
+
updates=updates,
|
| 619 |
+
)
|
| 620 |
+
)
|
| 621 |
+
# TODO: While it's possible that a version mismatch
|
| 622 |
+
# happens, or some other failures could happen. But given
|
| 623 |
+
# the current implementation:
|
| 624 |
+
# 1. There's only 1 writer (the reconciler) for updating the instance
|
| 625 |
+
# manager states, so there shouldn't be version mismatch.
|
| 626 |
+
# 2. Any failures in one reconciler step should be caught at a higher
|
| 627 |
+
# level and be retried in the next reconciler step. If the IM
|
| 628 |
+
# fails to be updated, we don't have sufficient info to handle it
|
| 629 |
+
# here.
|
| 630 |
+
assert (
|
| 631 |
+
reply.status.code == StatusCode.OK
|
| 632 |
+
), f"Failed to update instance manager: {reply}"
|
| 633 |
+
|
| 634 |
+
    @staticmethod
    def _handle_ray_status_transition(
        instance_manager: InstanceManager,
        ray_nodes: List[NodeState],
        autoscaling_config: AutoscalingConfig,
    ):
        """
        Handle the ray status transition for the instance manager.

        If a new ray node running on the instance, transition it to RAY_RUNNING.
        If a ray node stopped, transition it to RAY_STOPPED.
        If a ray node is draining, transition it to RAY_STOPPING.

        Args:
            instance_manager: The instance manager to reconcile.
            ray_nodes: The ray cluster's states of ray nodes.
            autoscaling_config: The autoscaling config; used here to
                special-case the read-only provider.
        """
        instances, version = Reconciler._get_im_instances(instance_manager)
        updates = {}

        # Index IM instances by cloud instance id; instances without one
        # cannot be matched to a ray node and are skipped.
        im_instances_by_cloud_instance_id = {
            i.cloud_instance_id: i for i in instances if i.cloud_instance_id
        }
        ray_nodes_by_cloud_instance_id = {}
        for n in ray_nodes:
            if n.instance_id:
                ray_nodes_by_cloud_instance_id[n.instance_id] = n
            else:
                if autoscaling_config.provider == Provider.READ_ONLY:
                    # We will use the node id as the cloud instance id for read-only
                    # provider.
                    ray_nodes_by_cloud_instance_id[binary_to_hex(n.node_id)] = n
                else:
                    # This should only happen to a ray node that's not managed by us.
                    logger.warning(
                        f"Ray node {binary_to_hex(n.node_id)} has no instance id. "
                        "This only happens to a ray node not managed by autoscaler. "
                        "If not, please file a bug at "
                        "https://github.com/ray-project/ray"
                    )

        for cloud_instance_id, ray_node in ray_nodes_by_cloud_instance_id.items():
            assert cloud_instance_id in im_instances_by_cloud_instance_id, (
                f"Ray node {binary_to_hex(ray_node.node_id)} has no matching "
                f"instance with cloud instance id={cloud_instance_id}. We should "
                "not see a ray node with cloud instance id not found in IM since "
                "we have reconciled all cloud instances, and ray nodes by now."
            )

            im_instance = im_instances_by_cloud_instance_id[cloud_instance_id]
            reconciled_im_status = Reconciler._reconciled_im_status_from_ray_status(
                ray_node.status, im_instance.status
            )

            # Only emit an update when the reconciled status actually differs
            # from the instance's current status.
            if reconciled_im_status != im_instance.status:
                updates[im_instance.instance_id] = IMInstanceUpdateEvent(
                    instance_id=im_instance.instance_id,
                    new_instance_status=reconciled_im_status,
                    details=(
                        f"ray node {binary_to_hex(ray_node.node_id)} is "
                        f"{NodeStatus.Name(ray_node.status)}"
                    ),
                    ray_node_id=binary_to_hex(ray_node.node_id),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 700 |
+
|
| 701 |
+
@staticmethod
|
| 702 |
+
def _reconciled_im_status_from_ray_status(
|
| 703 |
+
ray_status: NodeStatus, cur_im_status: IMInstance.InstanceStatus
|
| 704 |
+
) -> "IMInstance.InstanceStatus":
|
| 705 |
+
"""
|
| 706 |
+
Reconcile the instance status from the ray node status.
|
| 707 |
+
Args:
|
| 708 |
+
ray_status: the current ray node status.
|
| 709 |
+
cur_im_status: the current IM instance status.
|
| 710 |
+
Returns:
|
| 711 |
+
The reconciled IM instance status
|
| 712 |
+
|
| 713 |
+
Raises:
|
| 714 |
+
ValueError: If the ray status is unknown.
|
| 715 |
+
"""
|
| 716 |
+
reconciled_im_status = None
|
| 717 |
+
if ray_status in [NodeStatus.RUNNING, NodeStatus.IDLE]:
|
| 718 |
+
reconciled_im_status = IMInstance.RAY_RUNNING
|
| 719 |
+
elif ray_status == NodeStatus.DEAD:
|
| 720 |
+
reconciled_im_status = IMInstance.RAY_STOPPED
|
| 721 |
+
elif ray_status == NodeStatus.DRAINING:
|
| 722 |
+
reconciled_im_status = IMInstance.RAY_STOPPING
|
| 723 |
+
else:
|
| 724 |
+
raise ValueError(f"Unknown ray status: {ray_status}")
|
| 725 |
+
|
| 726 |
+
if (
|
| 727 |
+
cur_im_status == reconciled_im_status
|
| 728 |
+
or cur_im_status
|
| 729 |
+
in InstanceUtil.get_reachable_statuses(reconciled_im_status)
|
| 730 |
+
):
|
| 731 |
+
# No need to reconcile if the instance is already in the reconciled status
|
| 732 |
+
# or has already transitioned beyond it.
|
| 733 |
+
return cur_im_status
|
| 734 |
+
|
| 735 |
+
return reconciled_im_status
|
| 736 |
+
|
| 737 |
+
@staticmethod
|
| 738 |
+
def _handle_instances_launch(
|
| 739 |
+
instance_manager: InstanceManager, autoscaling_config: AutoscalingConfig
|
| 740 |
+
):
|
| 741 |
+
|
| 742 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 743 |
+
|
| 744 |
+
queued_instances = []
|
| 745 |
+
requested_instances = []
|
| 746 |
+
allocated_instances = []
|
| 747 |
+
|
| 748 |
+
for instance in instances:
|
| 749 |
+
if instance.status == IMInstance.QUEUED:
|
| 750 |
+
queued_instances.append(instance)
|
| 751 |
+
elif instance.status == IMInstance.REQUESTED:
|
| 752 |
+
requested_instances.append(instance)
|
| 753 |
+
elif instance.cloud_instance_id:
|
| 754 |
+
allocated_instances.append(instance)
|
| 755 |
+
|
| 756 |
+
if not queued_instances:
|
| 757 |
+
# No QUEUED instances
|
| 758 |
+
return
|
| 759 |
+
|
| 760 |
+
to_launch = Reconciler._compute_to_launch(
|
| 761 |
+
queued_instances,
|
| 762 |
+
requested_instances,
|
| 763 |
+
allocated_instances,
|
| 764 |
+
autoscaling_config.get_upscaling_speed(),
|
| 765 |
+
autoscaling_config.get_max_concurrent_launches(),
|
| 766 |
+
)
|
| 767 |
+
|
| 768 |
+
# Transition the instances to REQUESTED for instance launcher to
|
| 769 |
+
# launch them.
|
| 770 |
+
updates = {}
|
| 771 |
+
new_launch_request_id = str(uuid.uuid4())
|
| 772 |
+
for instance_type, instances in to_launch.items():
|
| 773 |
+
for instance in instances:
|
| 774 |
+
# Reuse launch request id for any QUEUED instances that have been
|
| 775 |
+
# requested before due to retry.
|
| 776 |
+
launch_request_id = (
|
| 777 |
+
new_launch_request_id
|
| 778 |
+
if len(instance.launch_request_id) == 0
|
| 779 |
+
else instance.launch_request_id
|
| 780 |
+
)
|
| 781 |
+
updates[instance.instance_id] = IMInstanceUpdateEvent(
|
| 782 |
+
instance_id=instance.instance_id,
|
| 783 |
+
new_instance_status=IMInstance.REQUESTED,
|
| 784 |
+
launch_request_id=launch_request_id,
|
| 785 |
+
instance_type=instance_type,
|
| 786 |
+
details=(
|
| 787 |
+
f"requested to launch {instance_type} with request id "
|
| 788 |
+
f"{launch_request_id}"
|
| 789 |
+
),
|
| 790 |
+
)
|
| 791 |
+
|
| 792 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 793 |
+
|
| 794 |
+
@staticmethod
|
| 795 |
+
def _compute_to_launch(
|
| 796 |
+
queued_instances: List[IMInstance],
|
| 797 |
+
requested_instances: List[IMInstance],
|
| 798 |
+
allocated_instances: List[IMInstance],
|
| 799 |
+
upscaling_speed: float,
|
| 800 |
+
max_concurrent_launches: int,
|
| 801 |
+
) -> Dict[NodeType, List[IMInstance]]:
|
| 802 |
+
def _group_by_type(instances):
|
| 803 |
+
instances_by_type = defaultdict(list)
|
| 804 |
+
for instance in instances:
|
| 805 |
+
instances_by_type[instance.instance_type].append(instance)
|
| 806 |
+
return instances_by_type
|
| 807 |
+
|
| 808 |
+
# Sort the instances by the time they were queued.
|
| 809 |
+
def _sort_by_earliest_queued(instance: IMInstance) -> List[int]:
|
| 810 |
+
queue_times = InstanceUtil.get_status_transition_times_ns(
|
| 811 |
+
instance, IMInstance.QUEUED
|
| 812 |
+
)
|
| 813 |
+
return sorted(queue_times)
|
| 814 |
+
|
| 815 |
+
queued_instances_by_type = _group_by_type(queued_instances)
|
| 816 |
+
requested_instances_by_type = _group_by_type(requested_instances)
|
| 817 |
+
allocated_instances_by_type = _group_by_type(allocated_instances)
|
| 818 |
+
|
| 819 |
+
total_num_requested_to_launch = len(requested_instances)
|
| 820 |
+
all_to_launch: Dict[NodeType : List[IMInstance]] = defaultdict(list)
|
| 821 |
+
|
| 822 |
+
for (
|
| 823 |
+
instance_type,
|
| 824 |
+
queued_instances_for_type,
|
| 825 |
+
) in queued_instances_by_type.items():
|
| 826 |
+
requested_instances_for_type = requested_instances_by_type.get(
|
| 827 |
+
instance_type, []
|
| 828 |
+
)
|
| 829 |
+
allocated_instances_for_type = allocated_instances_by_type.get(
|
| 830 |
+
instance_type, []
|
| 831 |
+
)
|
| 832 |
+
|
| 833 |
+
num_desired_to_upscale = max(
|
| 834 |
+
1,
|
| 835 |
+
math.ceil(
|
| 836 |
+
upscaling_speed
|
| 837 |
+
* (
|
| 838 |
+
len(requested_instances_for_type)
|
| 839 |
+
+ len(allocated_instances_for_type)
|
| 840 |
+
)
|
| 841 |
+
),
|
| 842 |
+
)
|
| 843 |
+
|
| 844 |
+
# Enforce global limit, at most we can launch `max_concurrent_launches`
|
| 845 |
+
num_to_launch = min(
|
| 846 |
+
max_concurrent_launches - total_num_requested_to_launch,
|
| 847 |
+
num_desired_to_upscale,
|
| 848 |
+
)
|
| 849 |
+
|
| 850 |
+
# Cap both ends 0 <= num_to_launch <= num_queued
|
| 851 |
+
num_to_launch = max(0, num_to_launch)
|
| 852 |
+
num_to_launch = min(len(queued_instances_for_type), num_to_launch)
|
| 853 |
+
|
| 854 |
+
to_launch = sorted(queued_instances_for_type, key=_sort_by_earliest_queued)[
|
| 855 |
+
:num_to_launch
|
| 856 |
+
]
|
| 857 |
+
|
| 858 |
+
all_to_launch[instance_type].extend(to_launch)
|
| 859 |
+
total_num_requested_to_launch += num_to_launch
|
| 860 |
+
|
| 861 |
+
return all_to_launch
|
| 862 |
+
|
| 863 |
+
    @staticmethod
    def _handle_stuck_instances(
        instance_manager: InstanceManager,
        reconcile_config: InstanceReconcileConfig,
        _logger: logging.Logger,
    ):
        """
        Handle stuck instances with timeouts.

        Instances could be stuck in the following status and needs to be updated:
        - REQUESTED: cloud provider is slow/fails to launch instances.
        - ALLOCATED: ray fails to be started on the instance.
        - RAY_INSTALLING: ray fails to be installed on the instance.
        - TERMINATING: cloud provider is slow/fails to terminate instances.

        Instances could be in the following status which could be unbounded or
        transient, and we don't have a timeout mechanism to handle them. We would
        warn if they are stuck for too long:
        - RAY_STOPPING: ray taking time to drain.
        - QUEUED: cloud provider is slow to launch instances, resulting in long
            queue.

        Reconciler should handle below statuses, if not, could be a slow
        reconciliation loop or a bug:
        - RAY_INSTALL_FAILED
        - RAY_STOPPED
        - TERMINATION_FAILED


        Args:
            instance_manager: The instance manager to reconcile.
            reconcile_config: The instance reconcile config.
            _logger: The logger to log the warning messages. It's used for testing.

        """
        instances, version = Reconciler._get_im_instances(instance_manager)

        # Bucket instances by status so each stuck-status policy below only
        # scans its relevant group.
        instances_by_status = defaultdict(list)
        for instance in instances:
            instances_by_status[instance.status].append(instance)

        im_updates = {}

        # Fail or retry the cloud instance allocation if it's stuck
        # in the REQUESTED state.
        for instance in instances_by_status[IMInstance.REQUESTED]:
            update = Reconciler._handle_stuck_requested_instance(
                instance,
                reconcile_config.request_status_timeout_s,
                reconcile_config.max_num_retry_request_to_allocate,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Leaked ALLOCATED instances should be terminated.
        # This usually happens when ray fails to be started on the instance, so
        # it's unable to be RAY_RUNNING after a long time.
        for instance in instances_by_status[IMInstance.ALLOCATED]:
            assert (
                instance.cloud_instance_id
            ), "cloud instance id should be set on ALLOCATED instance"
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.allocate_status_timeout_s,
                new_status=IMInstance.TERMINATING,
                cloud_instance_id=instance.cloud_instance_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Fail the installation if it's stuck in RAY_INSTALLING for too long.
        # If RAY_INSTALLING is stuck for too long, it's likely that the instance
        # is not able to install ray, so we should also fail the installation.
        for instance in instances_by_status[IMInstance.RAY_INSTALLING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_install_status_timeout_s,
                new_status=IMInstance.RAY_INSTALL_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to terminate the instance, but it doesn't terminate (disappear
        # from the cloud provider) after a long time, we fail the termination.
        # This will trigger another attempt to terminate the instance.
        for instance in instances_by_status[IMInstance.TERMINATING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.terminating_status_timeout_s,
                new_status=IMInstance.TERMINATION_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to stop ray on the instance, but it doesn't stop after a long
        # time, we will transition it back to RAY_RUNNING as the stop/drain somehow
        # failed. If it had succeeded, we should have transitioned it to RAY_STOPPING
        # or RAY_STOPPED.
        for instance in instances_by_status[IMInstance.RAY_STOP_REQUESTED]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_stop_requested_status_timeout_s,
                new_status=IMInstance.RAY_RUNNING,
                ray_node_id=instance.node_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # These statues could be unbounded or transient, and we don't have a timeout
        # mechanism to handle them. We only warn if they are stuck for too long.
        for status in [
            # Ray taking time to drain. We could also have a timeout when Drain protocol
            # supports timeout.
            IMInstance.RAY_STOPPING,
            # These should just be transient, we will terminate instances with this
            # status in the next reconciler step.
            IMInstance.RAY_INSTALL_FAILED,
            IMInstance.RAY_STOPPED,
            IMInstance.TERMINATION_FAILED,
            # Instances could be in the QUEUED status for a long time if the cloud
            # provider is slow to launch instances.
            IMInstance.QUEUED,
        ]:
            Reconciler._warn_stuck_instances(
                instances_by_status[status],
                status=status,
                warn_interval_s=reconcile_config.transient_status_warn_interval_s,
                logger=_logger,
            )

        Reconciler._update_instance_manager(instance_manager, version, im_updates)
|
| 994 |
+
|
| 995 |
+
@staticmethod
|
| 996 |
+
def _warn_stuck_instances(
|
| 997 |
+
instances: List[IMInstance],
|
| 998 |
+
status: IMInstance.InstanceStatus,
|
| 999 |
+
warn_interval_s: int,
|
| 1000 |
+
logger: logging.Logger,
|
| 1001 |
+
):
|
| 1002 |
+
"""Warn if any instance is stuck in a transient/unbounded status for too
|
| 1003 |
+
long.
|
| 1004 |
+
"""
|
| 1005 |
+
for instance in instances:
|
| 1006 |
+
status_times_ns = InstanceUtil.get_status_transition_times_ns(
|
| 1007 |
+
instance, select_instance_status=status
|
| 1008 |
+
)
|
| 1009 |
+
assert len(status_times_ns) >= 1
|
| 1010 |
+
status_time_ns = sorted(status_times_ns)[-1]
|
| 1011 |
+
|
| 1012 |
+
if time.time_ns() - status_time_ns > warn_interval_s * 1e9:
|
| 1013 |
+
logger.warning(
|
| 1014 |
+
"Instance {}({}) is stuck in {} for {} seconds.".format(
|
| 1015 |
+
instance.instance_id,
|
| 1016 |
+
IMInstance.InstanceStatus.Name(instance.status),
|
| 1017 |
+
IMInstance.InstanceStatus.Name(status),
|
| 1018 |
+
(time.time_ns() - status_time_ns) // 1e9,
|
| 1019 |
+
)
|
| 1020 |
+
)
|
| 1021 |
+
|
| 1022 |
+
@staticmethod
|
| 1023 |
+
def _is_head_node_running(instance_manager: InstanceManager) -> bool:
|
| 1024 |
+
"""
|
| 1025 |
+
Check if the head node is running and ready.
|
| 1026 |
+
|
| 1027 |
+
If we scale up the cluster before head node is running,
|
| 1028 |
+
it would cause issues when launching the worker nodes.
|
| 1029 |
+
|
| 1030 |
+
There are corner cases when the GCS is up (so the ray cluster resource
|
| 1031 |
+
state is retrievable from the GCS), but the head node's raylet is not
|
| 1032 |
+
running so the head node is missing from the reported nodes. This happens
|
| 1033 |
+
when the head node is still starting up, or the raylet is not running
|
| 1034 |
+
due to some issues, and this would yield false.
|
| 1035 |
+
|
| 1036 |
+
Args:
|
| 1037 |
+
instance_manager: The instance manager to reconcile.
|
| 1038 |
+
|
| 1039 |
+
Returns:
|
| 1040 |
+
True if the head node is running and ready, False otherwise.
|
| 1041 |
+
"""
|
| 1042 |
+
|
| 1043 |
+
im_instances, _ = Reconciler._get_im_instances(instance_manager)
|
| 1044 |
+
|
| 1045 |
+
for instance in im_instances:
|
| 1046 |
+
if instance.node_kind == NodeKind.HEAD:
|
| 1047 |
+
if instance.status == IMInstance.RAY_RUNNING:
|
| 1048 |
+
return True
|
| 1049 |
+
return False
|
| 1050 |
+
|
| 1051 |
+
    @staticmethod
    def _scale_cluster(
        autoscaling_state: AutoscalingState,
        instance_manager: InstanceManager,
        ray_state: ClusterResourceState,
        scheduler: IResourceScheduler,
        autoscaling_config: AutoscalingConfig,
    ) -> None:
        """
        Scale the cluster based on the resource state and the resource scheduler's
        decision:

        - It launches new instances if needed.
        - It terminates extra ray nodes if they should be shut down (preemption
          or idle termination)

        Args:
            autoscaling_state: The autoscaling state to reconcile.
            instance_manager: The instance manager to reconcile.
            ray_state: The ray cluster's resource state.
            scheduler: The resource scheduler to make scaling decisions.
            autoscaling_config: The autoscaling config.

        """

        # Get the current instance states.
        im_instances, version = Reconciler._get_im_instances(instance_manager)

        autoscaler_instances = []
        ray_nodes_by_id = {
            binary_to_hex(node.node_id): node for node in ray_state.node_states
        }

        # Pair each IM instance with its ray node (if any) for the scheduler.
        for im_instance in im_instances:
            ray_node = ray_nodes_by_id.get(im_instance.node_id)
            autoscaler_instances.append(
                AutoscalerInstance(
                    ray_node=ray_node,
                    im_instance=im_instance,
                    cloud_instance_id=(
                        im_instance.cloud_instance_id
                        if im_instance.cloud_instance_id
                        else None
                    ),
                )
            )

        # TODO(rickyx): We should probably name it as "Planner" or "Scaler"
        # or "ClusterScaler"
        sched_request = SchedulingRequest(
            node_type_configs=autoscaling_config.get_node_type_configs(),
            max_num_nodes=autoscaling_config.get_max_num_nodes(),
            resource_requests=ray_state.pending_resource_requests,
            gang_resource_requests=ray_state.pending_gang_resource_requests,
            cluster_resource_constraints=ray_state.cluster_resource_constraints,
            current_instances=autoscaler_instances,
            idle_timeout_s=autoscaling_config.get_idle_timeout_s(),
            disable_launch_config_check=(
                autoscaling_config.disable_launch_config_check()
            ),
        )

        # Ask scheduler for updates to the cluster shape.
        reply = scheduler.schedule(sched_request)

        # Populate the autoscaling state.
        # NOTE: this happens before the early returns below, so infeasibility
        # info is reported even when no scaling updates are applied.
        autoscaling_state.infeasible_resource_requests.extend(
            reply.infeasible_resource_requests
        )
        autoscaling_state.infeasible_gang_resource_requests.extend(
            reply.infeasible_gang_resource_requests
        )
        autoscaling_state.infeasible_cluster_resource_constraints.extend(
            reply.infeasible_cluster_resource_constraints
        )

        if not Reconciler._is_head_node_running(instance_manager):
            # We shouldn't be scaling the cluster until the head node is ready.
            # This could happen when the head node (i.e. the raylet) is still
            # pending registration even though GCS is up.
            # We will wait until the head node is running and ready to avoid
            # scaling the cluster from min worker nodes constraint.
            return

        if autoscaling_config.provider == Provider.READ_ONLY:
            # We shouldn't be scaling the cluster if the provider is read-only.
            return

        # Scale the clusters if needed.
        to_launch = reply.to_launch
        to_terminate = reply.to_terminate
        updates = {}
        # Add terminating instances.
        for terminate_request in to_terminate:
            instance_id = terminate_request.instance_id
            updates[terminate_request.instance_id] = IMInstanceUpdateEvent(
                instance_id=instance_id,
                new_instance_status=IMInstance.RAY_STOP_REQUESTED,
                termination_request=terminate_request,
                details=f"draining ray: {terminate_request.details}",
            )

        # Add new instances.
        for launch_request in to_launch:
            for _ in range(launch_request.count):
                # Each new instance gets a fresh random id and starts QUEUED.
                instance_id = InstanceUtil.random_instance_id()
                updates[instance_id] = IMInstanceUpdateEvent(
                    instance_id=instance_id,
                    new_instance_status=IMInstance.QUEUED,
                    instance_type=launch_request.instance_type,
                    upsert=True,
                    details=(
                        f"queuing new instance of {launch_request.instance_type} "
                        "from scheduler"
                    ),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 1169 |
+
|
| 1170 |
+
@staticmethod
|
| 1171 |
+
def _terminate_instances(instance_manager: InstanceManager):
|
| 1172 |
+
"""
|
| 1173 |
+
Terminate instances with the below statuses:
|
| 1174 |
+
- RAY_STOPPED: ray was stopped on the cloud instance.
|
| 1175 |
+
- RAY_INSTALL_FAILED: ray installation failed on the cloud instance,
|
| 1176 |
+
we will not retry.
|
| 1177 |
+
- TERMINATION_FAILED: cloud provider failed to terminate the instance
|
| 1178 |
+
or timeout for termination happened, we will retry again.
|
| 1179 |
+
|
| 1180 |
+
Args:
|
| 1181 |
+
instance_manager: The instance manager to reconcile.
|
| 1182 |
+
"""
|
| 1183 |
+
|
| 1184 |
+
im_instances, version = Reconciler._get_im_instances(instance_manager)
|
| 1185 |
+
updates = {}
|
| 1186 |
+
for instance in im_instances:
|
| 1187 |
+
if instance.status not in [
|
| 1188 |
+
IMInstance.RAY_STOPPED,
|
| 1189 |
+
IMInstance.RAY_INSTALL_FAILED,
|
| 1190 |
+
IMInstance.TERMINATION_FAILED,
|
| 1191 |
+
]:
|
| 1192 |
+
continue
|
| 1193 |
+
|
| 1194 |
+
# Terminate the instance.
|
| 1195 |
+
updates[instance.instance_id] = IMInstanceUpdateEvent(
|
| 1196 |
+
instance_id=instance.instance_id,
|
| 1197 |
+
new_instance_status=IMInstance.TERMINATING,
|
| 1198 |
+
cloud_instance_id=instance.cloud_instance_id,
|
| 1199 |
+
details="terminating instance from "
|
| 1200 |
+
f"{IMInstance.InstanceStatus.Name(instance.status)}",
|
| 1201 |
+
)
|
| 1202 |
+
|
| 1203 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 1204 |
+
|
| 1205 |
+
@staticmethod
|
| 1206 |
+
def _install_ray(
|
| 1207 |
+
instance_manager: InstanceManager,
|
| 1208 |
+
non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
|
| 1209 |
+
) -> None:
|
| 1210 |
+
"""
|
| 1211 |
+
Install ray on the allocated instances when it's ready (cloud instance
|
| 1212 |
+
should be running)
|
| 1213 |
+
|
| 1214 |
+
This is needed if ray installation needs to be performed by
|
| 1215 |
+
the instance manager.
|
| 1216 |
+
|
| 1217 |
+
Args:
|
| 1218 |
+
instance_manager: The instance manager to reconcile.
|
| 1219 |
+
"""
|
| 1220 |
+
im_instances, version = Reconciler._get_im_instances(instance_manager)
|
| 1221 |
+
updates = {}
|
| 1222 |
+
for instance in im_instances:
|
| 1223 |
+
if instance.status != IMInstance.ALLOCATED:
|
| 1224 |
+
continue
|
| 1225 |
+
|
| 1226 |
+
if instance.node_kind == NodeKind.HEAD:
|
| 1227 |
+
# Skip head node.
|
| 1228 |
+
continue
|
| 1229 |
+
|
| 1230 |
+
cloud_instance = non_terminated_cloud_instances.get(
|
| 1231 |
+
instance.cloud_instance_id
|
| 1232 |
+
)
|
| 1233 |
+
|
| 1234 |
+
assert cloud_instance, (
|
| 1235 |
+
f"Cloud instance {instance.cloud_instance_id} is not found "
|
| 1236 |
+
"in non_terminated_cloud_instances."
|
| 1237 |
+
)
|
| 1238 |
+
|
| 1239 |
+
if not cloud_instance.is_running:
|
| 1240 |
+
# It might still be pending (e.g. setting up ssh)
|
| 1241 |
+
continue
|
| 1242 |
+
|
| 1243 |
+
# Install ray on the running cloud instance
|
| 1244 |
+
updates[instance.instance_id] = IMInstanceUpdateEvent(
|
| 1245 |
+
instance_id=instance.instance_id,
|
| 1246 |
+
new_instance_status=IMInstance.RAY_INSTALLING,
|
| 1247 |
+
details="installing ray",
|
| 1248 |
+
)
|
| 1249 |
+
|
| 1250 |
+
Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 1251 |
+
|
| 1252 |
+
@staticmethod
|
| 1253 |
+
def _fill_autoscaling_state(
|
| 1254 |
+
instance_manager: InstanceManager,
|
| 1255 |
+
autoscaling_state: AutoscalingState,
|
| 1256 |
+
) -> None:
|
| 1257 |
+
|
| 1258 |
+
# Use the IM instance version for the autoscaler_state_version
|
| 1259 |
+
instances, version = Reconciler._get_im_instances(instance_manager)
|
| 1260 |
+
autoscaling_state.autoscaler_state_version = version
|
| 1261 |
+
|
| 1262 |
+
# Group instances by status
|
| 1263 |
+
instances_by_status = defaultdict(list)
|
| 1264 |
+
for instance in instances:
|
| 1265 |
+
instances_by_status[instance.status].append(instance)
|
| 1266 |
+
|
| 1267 |
+
# Pending instance requests
|
| 1268 |
+
instances_by_launch_request = defaultdict(list)
|
| 1269 |
+
queued_instances = []
|
| 1270 |
+
for instance in (
|
| 1271 |
+
instances_by_status[IMInstance.REQUESTED]
|
| 1272 |
+
+ instances_by_status[IMInstance.QUEUED]
|
| 1273 |
+
):
|
| 1274 |
+
if instance.launch_request_id:
|
| 1275 |
+
instances_by_launch_request[instance.launch_request_id].append(instance)
|
| 1276 |
+
else:
|
| 1277 |
+
queued_instances.append(instance)
|
| 1278 |
+
|
| 1279 |
+
for _, instances in instances_by_launch_request.items():
|
| 1280 |
+
num_instances_by_type = defaultdict(int)
|
| 1281 |
+
for instance in instances:
|
| 1282 |
+
num_instances_by_type[instance.instance_type] += 1
|
| 1283 |
+
|
| 1284 |
+
# All instances with same request id should have the same
|
| 1285 |
+
# request time.
|
| 1286 |
+
request_update = InstanceUtil.get_last_status_transition(
|
| 1287 |
+
instances[0], IMInstance.REQUESTED
|
| 1288 |
+
)
|
| 1289 |
+
request_time_ns = request_update.timestamp_ns if request_update else 0
|
| 1290 |
+
|
| 1291 |
+
for instance_type, count in num_instances_by_type.items():
|
| 1292 |
+
autoscaling_state.pending_instance_requests.append(
|
| 1293 |
+
PendingInstanceRequest(
|
| 1294 |
+
ray_node_type_name=instance_type,
|
| 1295 |
+
count=int(count),
|
| 1296 |
+
request_ts=int(request_time_ns // 1e9),
|
| 1297 |
+
)
|
| 1298 |
+
)
|
| 1299 |
+
|
| 1300 |
+
# Pending instances
|
| 1301 |
+
for instance in (
|
| 1302 |
+
instances_by_status[IMInstance.ALLOCATED]
|
| 1303 |
+
+ instances_by_status[IMInstance.RAY_INSTALLING]
|
| 1304 |
+
):
|
| 1305 |
+
|
| 1306 |
+
status_history = sorted(
|
| 1307 |
+
instance.status_history, key=lambda x: x.timestamp_ns, reverse=True
|
| 1308 |
+
)
|
| 1309 |
+
autoscaling_state.pending_instances.append(
|
| 1310 |
+
PendingInstance(
|
| 1311 |
+
instance_id=instance.instance_id,
|
| 1312 |
+
ray_node_type_name=instance.instance_type,
|
| 1313 |
+
details=status_history[0].details,
|
| 1314 |
+
)
|
| 1315 |
+
)
|
| 1316 |
+
|
| 1317 |
+
# Failed instance requests
|
| 1318 |
+
for instance in instances_by_status[IMInstance.ALLOCATION_FAILED]:
|
| 1319 |
+
request_status_update = InstanceUtil.get_last_status_transition(
|
| 1320 |
+
instance, IMInstance.REQUESTED
|
| 1321 |
+
)
|
| 1322 |
+
failed_status_update = InstanceUtil.get_last_status_transition(
|
| 1323 |
+
instance, IMInstance.ALLOCATION_FAILED
|
| 1324 |
+
)
|
| 1325 |
+
failed_time = (
|
| 1326 |
+
failed_status_update.timestamp_ns if failed_status_update else 0
|
| 1327 |
+
)
|
| 1328 |
+
request_time = (
|
| 1329 |
+
request_status_update.timestamp_ns if request_status_update else 0
|
| 1330 |
+
)
|
| 1331 |
+
autoscaling_state.failed_instance_requests.append(
|
| 1332 |
+
FailedInstanceRequest(
|
| 1333 |
+
ray_node_type_name=instance.instance_type,
|
| 1334 |
+
start_ts=int(request_time // 1e9),
|
| 1335 |
+
failed_ts=int(
|
| 1336 |
+
failed_time // 1e9,
|
| 1337 |
+
),
|
| 1338 |
+
reason=failed_status_update.details,
|
| 1339 |
+
count=1,
|
| 1340 |
+
)
|
| 1341 |
+
)
|
| 1342 |
+
|
| 1343 |
+
@staticmethod
|
| 1344 |
+
def _handle_stuck_requested_instance(
|
| 1345 |
+
instance: IMInstance, timeout_s: int, max_num_retry_request_to_allocate: int
|
| 1346 |
+
) -> Optional[IMInstanceUpdateEvent]:
|
| 1347 |
+
"""
|
| 1348 |
+
Fail the cloud instance allocation if it's stuck in the REQUESTED state.
|
| 1349 |
+
|
| 1350 |
+
Args:
|
| 1351 |
+
instance: The instance to handle.
|
| 1352 |
+
timeout_s: The timeout in seconds.
|
| 1353 |
+
max_num_retry_request_to_allocate: The maximum number of times an instance
|
| 1354 |
+
could be requested to allocate.
|
| 1355 |
+
|
| 1356 |
+
Returns:
|
| 1357 |
+
Instance update to ALLOCATION_FAILED: if the instance allocation failed
|
| 1358 |
+
with errors.
|
| 1359 |
+
None: if there's no update.
|
| 1360 |
+
|
| 1361 |
+
"""
|
| 1362 |
+
if not InstanceUtil.has_timeout(instance, timeout_s):
|
| 1363 |
+
# Not timeout yet, be patient.
|
| 1364 |
+
return None
|
| 1365 |
+
|
| 1366 |
+
all_request_times_ns = sorted(
|
| 1367 |
+
InstanceUtil.get_status_transition_times_ns(
|
| 1368 |
+
instance, select_instance_status=IMInstance.REQUESTED
|
| 1369 |
+
)
|
| 1370 |
+
)
|
| 1371 |
+
|
| 1372 |
+
# Fail the allocation if we have tried too many times.
|
| 1373 |
+
if len(all_request_times_ns) > max_num_retry_request_to_allocate:
|
| 1374 |
+
return IMInstanceUpdateEvent(
|
| 1375 |
+
instance_id=instance.instance_id,
|
| 1376 |
+
new_instance_status=IMInstance.ALLOCATION_FAILED,
|
| 1377 |
+
details=(
|
| 1378 |
+
"failed to allocate cloud instance after "
|
| 1379 |
+
f"{len(all_request_times_ns)} attempts > "
|
| 1380 |
+
f"max_num_retry_request_to_allocate={max_num_retry_request_to_allocate}" # noqa
|
| 1381 |
+
),
|
| 1382 |
+
)
|
| 1383 |
+
|
| 1384 |
+
# Retry the allocation if we could by transitioning to QUEUED again.
|
| 1385 |
+
return IMInstanceUpdateEvent(
|
| 1386 |
+
instance_id=instance.instance_id,
|
| 1387 |
+
new_instance_status=IMInstance.QUEUED,
|
| 1388 |
+
details=f"queue again to launch after timeout={timeout_s}s",
|
| 1389 |
+
)
|
| 1390 |
+
|
| 1391 |
+
@staticmethod
def _handle_stuck_instance(
    instance: IMInstance,
    timeout_s: int,
    new_status: IMInstance.InstanceStatus,
    **update_kwargs: Dict,
) -> Optional[IMInstanceUpdateEvent]:
    """
    Fail the instance if it's stuck in the status for too long.

    Args:
        instance: The instance to handle.
        timeout_s: The timeout in seconds.
        new_status: The new status to transition to.
        update_kwargs: Extra keyword args forwarded to the InstanceUpdateEvent.

    Returns:
        Instance update to the new status: if the instance is stuck in the
            status for too long.
        None: if there's no update.
    """
    if InstanceUtil.has_timeout(instance, timeout_s):
        # Stuck past the deadline: emit the caller-chosen transition.
        return IMInstanceUpdateEvent(
            instance_id=instance.instance_id,
            new_instance_status=new_status,
            details=f"timeout={timeout_s}s at status "
            f"{IMInstance.InstanceStatus.Name(instance.status)}",
            **update_kwargs,
        )

    # Not timed out yet — leave the instance alone.
    return None
|
| 1424 |
+
|
| 1425 |
+
@staticmethod
def _handle_extra_cloud_instances(
    instance_manager: InstanceManager,
    non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
    ray_nodes: List[NodeState],
):
    """
    Adopt extra cloud instances (non-terminated per the cloud provider, or
    reported by Ray, but unknown to the instance manager) by creating new IM
    instances in the ALLOCATED status.

    Such instances could either be:
        1. Leaked instances incorrectly started by the cloud instance
           provider; these are eventually terminated if they never reach
           RAY_RUNNING (via stuck-instance reconciliation), or they join the
           ray cluster and get terminated on scale-down.
        2. Instances intentionally started by the cloud instance provider but
           not yet discovered by the instance manager, e.g.:
           a. A head node started before the autoscaler.
           b. Worker nodes started by the cloud provider upon user actions
              (e.g. KubeRay scaling the cluster on a config change).
        3. Ray nodes whose cloud instance id is unknown to the cloud
           provider, e.g. when Ray's view lags (cloud instance already
           terminated but the ray node is not dead yet).

    Args:
        instance_manager: The instance manager to reconcile.
        non_terminated_cloud_instances: The non-terminated cloud instances from
            the cloud provider.
        ray_nodes: The ray cluster's states of ray nodes.
    """
    # First adopt instances visible to the cloud provider, then those only
    # visible through Ray's node states.
    Reconciler._handle_extra_cloud_instances_from_cloud_provider(
        instance_manager, non_terminated_cloud_instances
    )
    Reconciler._handle_extra_cloud_instances_from_ray_nodes(
        instance_manager, ray_nodes
    )
|
| 1463 |
+
|
| 1464 |
+
@staticmethod
def _handle_extra_cloud_instances_from_cloud_provider(
    instance_manager: InstanceManager,
    non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
):
    """
    Create ALLOCATED IM instances for cloud instances that are running in the
    cloud provider but not yet managed by the instance manager.

    Args:
        instance_manager: The instance manager to reconcile.
        non_terminated_cloud_instances: The non-terminated cloud instances from
            the cloud provider.
    """
    im_instances, version = Reconciler._get_im_instances(instance_manager)
    known_cloud_ids = {
        im_instance.cloud_instance_id
        for im_instance in im_instances
        if im_instance.cloud_instance_id
    }

    # Every running cloud instance the IM doesn't know about is adopted as a
    # freshly ALLOCATED IM instance with a new random id.
    updates = {
        cloud_id: IMInstanceUpdateEvent(
            instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
            cloud_instance_id=cloud_id,
            new_instance_status=IMInstance.ALLOCATED,
            node_kind=cloud_instance.node_kind,
            instance_type=cloud_instance.node_type,
            details=(
                "allocated unmanaged cloud instance :"
                f"{cloud_instance.cloud_instance_id} "
                f"({NodeKind.Name(cloud_instance.node_kind)}) from cloud provider"
            ),
            upsert=True,
        )
        for cloud_id, cloud_instance in non_terminated_cloud_instances.items()
        if cloud_id not in known_cloud_ids
    }

    Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 1506 |
+
|
| 1507 |
+
@staticmethod
def _handle_extra_cloud_instances_from_ray_nodes(
    instance_manager: InstanceManager, ray_nodes: List[NodeState]
):
    """
    Create ALLOCATED IM instances for cloud instances that Ray reports but the
    instance manager does not yet track.

    Args:
        instance_manager: The instance manager to reconcile.
        ray_nodes: The ray cluster's states of ray nodes.
    """
    im_instances, version = Reconciler._get_im_instances(instance_manager)
    known_cloud_ids = {
        im_instance.cloud_instance_id
        for im_instance in im_instances
        if im_instance.cloud_instance_id
    }

    updates = {}
    for node in ray_nodes:
        cloud_id = node.instance_id
        # Skip nodes without a cloud instance id, and nodes already tracked.
        if not cloud_id or cloud_id in known_cloud_ids:
            continue

        kind = NodeKind.HEAD if is_head_node(node) else NodeKind.WORKER
        updates[cloud_id] = IMInstanceUpdateEvent(
            instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
            cloud_instance_id=cloud_id,
            new_instance_status=IMInstance.ALLOCATED,
            node_kind=kind,
            instance_type=node.ray_node_type_name,
            details=(
                "allocated unmanaged worker cloud instance from ray node: "
                f"{binary_to_hex(node.node_id)}"
            ),
            upsert=True,
        )

    Reconciler._update_instance_manager(instance_manager, version, updates)
|
| 1551 |
+
|
| 1552 |
+
@staticmethod
def _report_metrics(
    instance_manager: InstanceManager,
    autoscaling_config: AutoscalingConfig,
    metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
):
    """Push instance and resource metrics when a reporter is configured.

    A missing reporter makes this a no-op.
    """
    if not metrics_reporter:
        return

    im_instances, _ = Reconciler._get_im_instances(instance_manager)
    configs = autoscaling_config.get_node_type_configs()

    metrics_reporter.report_instances(im_instances, configs)
    metrics_reporter.report_resources(im_instances, configs)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
from abc import ABCMeta, abstractmethod
|
| 3 |
+
from collections import defaultdict, namedtuple
|
| 4 |
+
from threading import Lock
|
| 5 |
+
from typing import Dict, List, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
StoreStatus = namedtuple("StoreStatus", ["success", "version"])
|
| 8 |
+
VersionedValue = namedtuple("VersionedValue", ["value", "version"])
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Storage(metaclass=ABCMeta):
    """Interface for a storage backend that stores the state of nodes in the cluster.

    The storage is thread-safe.

    The storage is versioned, which means that each successful state change to the
    storage will bump the version number. The version number can be used to
    implement optimistic concurrency control.

    Each entry in the storage table is also versioned. The version number of an entry
    is the last version number when the entry is updated.
    """

    @abstractmethod
    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Batch update the storage table. This method is atomic.

        Args:
            table: The name of the table.
            mutation: A dictionary of key-value pairs to be updated.
            deletion: A list of keys to be deleted.
            expected_storage_version: The expected storage version. The
                update will fail if the version does not match the
                current storage version.

        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version).
                Otherwise, returns (False, current_version).
        """
        raise NotImplementedError("batch_update() has to be implemented")

    @abstractmethod
    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry in the storage table.

        NOTE(review): concrete implementations may accept additional optional
        parameters (e.g. an expected storage version) beyond this interface —
        confirm against the implementations in this module.

        Args:
            table: The name of the table.
            key: The key of the entry.
            value: The value of the entry.
            expected_entry_version: The expected version of the entry.
                The update will fail if the version does not match the current
                version of the entry.
            insert_only: If True, the update will
                fail if the entry already exists.
        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version). Otherwise,
                returns (False, current_version).
        """
        raise NotImplementedError("update() has to be implemented")

    @abstractmethod
    def get_all(self, table: str) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get every entry in the storage table.

        Args:
            table: The name of the table.

        Returns:
            A tuple of (entries, storage_version), where entries maps each key
            to a (value, entry_version) pair and storage_version is the current
            storage version.
        """
        raise NotImplementedError("get_all() has to be implemented")

    @abstractmethod
    def get(
        self, table: str, keys: List[str]
    ) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get a list of entries from the storage table.

        Args:
            table: The name of the table.
            keys: A list of keys to be retrieved. If the list is empty,
                all entries in the table will be returned.

        Returns:
            Tuple[Dict[str, VersionedValue], int]: A tuple of
                (entries, storage_version). The entries is a dictionary of
                (key, (value, entry_version)) pairs. The entry_version is the
                version of the entry when it was last updated. The
                storage_version is the current storage version.
        """
        raise NotImplementedError("get() has to be implemented")

    @abstractmethod
    def get_version(self) -> int:
        """Get the current storage version.

        Returns:
            int: The current storage version.
        """
        raise NotImplementedError("get_version() has to be implemented")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class InMemoryStorage(Storage):
    """An in-memory implementation of the Storage interface.

    This implementation is not durable: all state is lost when the process
    exits. A single lock guards every mutation and read, making the storage
    thread-safe.
    """

    def __init__(self):
        # Monotonically increasing storage version; bumped on every
        # successful mutation.
        self._version = 0
        # table name -> {key: VersionedValue(value, entry_version)}
        self._tables = defaultdict(dict)
        self._lock = Lock()

    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_version: Optional[int] = None,
    ) -> StoreStatus:
        """Atomically apply a set of upserts and deletions to *table*.

        NOTE(review): the parameter is named ``expected_version`` here but
        ``expected_storage_version`` on the Storage ABC — keyword callers of
        one spelling break on the other; confirm which spelling callers use.

        Returns:
            StoreStatus(success, version): (True, new_version) on success,
            (False, current_version) on a version mismatch.
        """
        mutation = mutation if mutation else {}
        deletion = deletion if deletion else []
        with self._lock:
            if expected_version is not None and expected_version != self._version:
                return StoreStatus(False, self._version)
            self._version += 1
            # Every mutated entry is stamped with the new storage version.
            self._tables[table].update(
                {
                    key: VersionedValue(value, self._version)
                    for key, value in mutation.items()
                }
            )
            for deleted_key in deletion:
                # Deleting a missing key is a no-op rather than an error.
                self._tables[table].pop(deleted_key, None)
            return StoreStatus(True, self._version)

    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        expected_storage_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry, optionally guarded by entry/storage versions.

        Returns:
            StoreStatus(success, version): (True, new_version) on success,
            (False, current_version) when any precondition fails.
        """
        with self._lock:
            if (
                expected_storage_version is not None
                and expected_storage_version != self._version
            ):
                return StoreStatus(False, self._version)
            if insert_only and key in self._tables[table]:
                return StoreStatus(False, self._version)
            # Missing entries report version -1 so an explicit
            # expected_entry_version can never accidentally match them.
            _, version = self._tables[table].get(key, (None, -1))
            if expected_entry_version is not None and expected_entry_version != version:
                return StoreStatus(False, self._version)
            self._version += 1
            self._tables[table][key] = VersionedValue(value, self._version)
            return StoreStatus(True, self._version)

    def get_all(self, table: str) -> Tuple[Dict[str, VersionedValue], int]:
        """Return a deep copy of every entry in *table* plus the storage version."""
        with self._lock:
            return (copy.deepcopy(self._tables[table]), self._version)

    def get(self, table: str, keys: List[str]) -> Tuple[Dict[str, VersionedValue], int]:
        """Return the entries for *keys* (all entries if *keys* is empty).

        Missing keys are silently omitted from the result.
        """
        if not keys:
            return self.get_all(table)
        with self._lock:
            entries = self._tables.get(table, {})
            result = {key: entries[key] for key in keys if key in entries}
            # BUGFIX: this previously returned StoreStatus(result, version),
            # misusing a namedtuple whose fields are (success, version) for a
            # (data, version) pair. Return a plain tuple, consistent with
            # get_all() and the declared return type.
            return (result, self._version)

    def get_version(self) -> int:
        """Return the current storage version.

        Takes the lock for consistency with the other accessors.
        """
        with self._lock:
            return self._version
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
|
| 5 |
+
from ray.autoscaler.v2.instance_manager.common import InstanceUtil
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
|
| 7 |
+
from ray.autoscaler.v2.schema import NodeType
|
| 8 |
+
from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AutoscalerMetricsReporter:
    """Publishes instance-manager state as autoscaler Prometheus metrics."""

    def __init__(self, prom_metrics: AutoscalerPrometheusMetrics) -> None:
        self._prom_metrics = prom_metrics

    def report_instances(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_nodes: Nodes that are launching/pending ray start
            - active_nodes: Active nodes (nodes running ray)
            - recently_failed_nodes: Nodes that are being terminated.
            - stopped_nodes: Nodes that are terminated.
        """
        # Map of instance type to a dict of status to count.
        # BUGFIX: the annotation previously read Dict[NodeType : Dict[str, int]],
        # which is a slice expression, not a two-parameter generic; the typing
        # syntax requires a comma.
        status_count_by_type: Dict[NodeType, Dict[str, int]] = {}
        # Initialize every configured node type so zero counts are reported.
        for instance_type in node_type_configs.keys():
            status_count_by_type[instance_type] = {
                "pending": 0,
                "running": 0,
                "terminating": 0,
                "terminated": 0,
            }

        for instance in instances:
            # NOTE(review): assumes every instance.instance_type appears in
            # node_type_configs; an unknown type would raise KeyError here —
            # confirm with callers.
            if InstanceUtil.is_ray_pending(instance.status):
                status_count_by_type[instance.instance_type]["pending"] += 1
            elif InstanceUtil.is_ray_running(instance.status):
                status_count_by_type[instance.instance_type]["running"] += 1
            elif instance.status == IMInstance.TERMINATING:
                status_count_by_type[instance.instance_type]["terminating"] += 1
            elif instance.status == IMInstance.TERMINATED:
                status_count_by_type[instance.instance_type]["terminated"] += 1

        for instance_type, status_count in status_count_by_type.items():
            self._prom_metrics.pending_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["pending"])

            self._prom_metrics.active_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["running"])

            self._prom_metrics.recently_failed_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["terminating"])

            # NOTE(review): stopped_nodes is inc()-ed (not set), presumably a
            # cumulative Counter rather than a Gauge — confirm against the
            # prom_metrics definitions.
            self._prom_metrics.stopped_nodes.inc(status_count["terminated"])

    def report_resources(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_resources: Pending resources
            - cluster_resources: Cluster resources (resources running on the cluster)
        """
        pending_resources = defaultdict(float)
        cluster_resources = defaultdict(float)

        def _add_resources(resource_map, node_type_configs, node_type, count):
            # Accumulate `count` copies of the node type's resources.
            node_resources = node_type_configs[node_type].resources
            for resource_name, resource_value in node_resources.items():
                resource_map[resource_name] += resource_value * count

        for instance in instances:
            if InstanceUtil.is_ray_pending(instance.status):
                _add_resources(
                    pending_resources, node_type_configs, instance.instance_type, 1
                )
            elif InstanceUtil.is_ray_running(instance.status):
                _add_resources(
                    cluster_resources, node_type_configs, instance.instance_type, 1
                )

        for resource_name, resource_value in pending_resources.items():
            self._prom_metrics.pending_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)

        for resource_name, resource_value in cluster_resources.items():
            self._prom_metrics.cluster_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Autoscaler monitoring loop daemon.
|
| 2 |
+
|
| 3 |
+
See autoscaler._private/monitor.py for the legacy implementation. All the legacy flags
|
| 4 |
+
are supported here, but the new implementation uses the new autoscaler v2.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
import time
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
import ray
|
| 15 |
+
import ray._private.ray_constants as ray_constants
|
| 16 |
+
import ray._private.utils
|
| 17 |
+
from ray._private.event.event_logger import get_event_logger
|
| 18 |
+
from ray._private.ray_logging import setup_component_logger
|
| 19 |
+
from ray._private.usage.usage_lib import record_extra_usage_tag
|
| 20 |
+
from ray._private.worker import SCRIPT_MODE
|
| 21 |
+
from ray._raylet import GcsClient
|
| 22 |
+
from ray.autoscaler._private.constants import (
|
| 23 |
+
AUTOSCALER_METRIC_PORT,
|
| 24 |
+
AUTOSCALER_UPDATE_INTERVAL_S,
|
| 25 |
+
)
|
| 26 |
+
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
|
| 27 |
+
from ray.autoscaler.v2.autoscaler import Autoscaler
|
| 28 |
+
from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
|
| 29 |
+
from ray.autoscaler.v2.instance_manager.config import (
|
| 30 |
+
FileConfigReader,
|
| 31 |
+
IConfigReader,
|
| 32 |
+
ReadOnlyProviderConfigReader,
|
| 33 |
+
)
|
| 34 |
+
from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
|
| 35 |
+
from ray.core.generated.autoscaler_pb2 import AutoscalingState
|
| 36 |
+
from ray.core.generated.event_pb2 import Event as RayEvent
|
| 37 |
+
from ray.core.generated.usage_pb2 import TagKey
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
import prometheus_client
|
| 41 |
+
except ImportError:
|
| 42 |
+
prometheus_client = None
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
logger = logging.getLogger(__name__)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class AutoscalerMonitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    TODO:
    We should also handle autoscaler failures properly in the future.
    Right now, we don't restart autoscaler if it fails (internal reconciliation
    however, should not fail the autoscaler process).
    With the Reconciler able to handle extra cloud instances, we could in fact
    recover the autoscaler process from reconciliation.
    """

    def __init__(
        self,
        address: str,
        config_reader: IConfigReader,
        log_dir: Optional[str] = None,
        monitor_ip: Optional[str] = None,
    ):
        """Wire up GCS clients, logging, metrics, and the Autoscaler.

        Args:
            address: The GCS address ("ip:port") to connect to.
            config_reader: Source of the autoscaling configuration.
            log_dir: Directory for event logs; event logging is disabled
                when not given (or when the event logger fails to start).
            monitor_ip: IP of this monitor process; when given, the metrics
                address is published to the GCS KV store and a Prometheus
                HTTP server is started.
        """
        # Record v2 usage (we do this as early as possible to capture usage)
        record_autoscaler_v2_usage(GcsClient(address))

        self.gcs_address = address
        worker = ray._private.worker.global_worker
        # TODO: eventually plumb ClusterID through to here
        self.gcs_client = GcsClient(address=self.gcs_address)

        if monitor_ip:
            # Advertise the metrics endpoint so other components can scrape it.
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            self.gcs_client.internal_kv_put(
                b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
            )
        self._session_name = self._get_session_name(self.gcs_client)
        logger.info(f"session_name: {self._session_name}")
        worker.set_mode(SCRIPT_MODE)
        head_node_ip = self.gcs_address.split(":")[0]

        self.autoscaler = None
        if log_dir:
            # Event logging is best-effort: any failure to create the logger
            # simply disables events rather than aborting the monitor.
            try:
                ray_event_logger = get_event_logger(
                    RayEvent.SourceType.AUTOSCALER, log_dir
                )
                self.event_logger = AutoscalerEventLogger(ray_event_logger)
            except Exception:
                self.event_logger = None
        else:
            self.event_logger = None

        prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
        self.metric_reporter = AutoscalerMetricsReporter(prom_metrics)

        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT
                    )
                )
                # Bind to loopback only when the head node itself is loopback.
                kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    registry=prom_metrics.registry,
                    **kwargs,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server."
                )
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        self.autoscaler = Autoscaler(
            session_name=self._session_name,
            config_reader=config_reader,
            gcs_client=self.gcs_client,
            event_logger=self.event_logger,
            metrics_reporter=self.metric_reporter,
        )

    @staticmethod
    def _get_session_name(gcs_client: GcsClient) -> Optional[str]:
        """Obtain the session name from the GCS.

        If the GCS doesn't respond, session name is considered None.
        In this case, the metrics reported from the monitor won't have
        the correct session name.
        """
        session_name = gcs_client.internal_kv_get(
            b"session_name",
            ray_constants.KV_NAMESPACE_SESSION,
            timeout=10,
        )

        if session_name:
            # KV values are bytes; decode to str for use as a metrics label.
            session_name = session_name.decode()

        return session_name

    @staticmethod
    def _report_autoscaling_state(
        gcs_client: GcsClient, autoscaling_state: AutoscalingState
    ):
        """Report the autoscaling state to the GCS.

        Best-effort: failures are logged but never propagated.
        """
        try:
            gcs_client.report_autoscaling_state(autoscaling_state.SerializeToString())
        except Exception:
            logger.exception("Error reporting autoscaling state to GCS.")

    def _run(self):
        """Run the monitor loop.

        Loops forever: trigger an autoscaler update, push the resulting state
        to the GCS, then sleep for the configured interval.
        """

        while True:
            autoscaling_state = self.autoscaler.update_autoscaling_state()
            if autoscaling_state:
                # report autoscaling state
                self._report_autoscaling_state(self.gcs_client, autoscaling_state)
            else:
                logger.warning("No autoscaling state to report.")

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def run(self):
        """Entry point: run the monitor loop, logging and re-raising failures."""
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            raise
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None:
|
| 188 |
+
"""
|
| 189 |
+
Record usage for autoscaler v2.
|
| 190 |
+
"""
|
| 191 |
+
try:
|
| 192 |
+
record_extra_usage_tag(TagKey.AUTOSCALER_VERSION, "v2", gcs_client)
|
| 193 |
+
except Exception:
|
| 194 |
+
logger.exception("Error recording usage for autoscaler v2.")
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
if __name__ == "__main__":
|
| 198 |
+
parser = argparse.ArgumentParser(
|
| 199 |
+
description=("Parse GCS server for the monitor to connect to.")
|
| 200 |
+
)
|
| 201 |
+
parser.add_argument(
|
| 202 |
+
"--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
|
| 203 |
+
)
|
| 204 |
+
parser.add_argument(
|
| 205 |
+
"--autoscaling-config",
|
| 206 |
+
required=False,
|
| 207 |
+
type=str,
|
| 208 |
+
help="the path to the autoscaling config file",
|
| 209 |
+
)
|
| 210 |
+
parser.add_argument(
|
| 211 |
+
"--logging-level",
|
| 212 |
+
required=False,
|
| 213 |
+
type=str,
|
| 214 |
+
default=ray_constants.LOGGER_LEVEL,
|
| 215 |
+
choices=ray_constants.LOGGER_LEVEL_CHOICES,
|
| 216 |
+
help=ray_constants.LOGGER_LEVEL_HELP,
|
| 217 |
+
)
|
| 218 |
+
parser.add_argument(
|
| 219 |
+
"--logging-format",
|
| 220 |
+
required=False,
|
| 221 |
+
type=str,
|
| 222 |
+
default=ray_constants.LOGGER_FORMAT,
|
| 223 |
+
help=ray_constants.LOGGER_FORMAT_HELP,
|
| 224 |
+
)
|
| 225 |
+
parser.add_argument(
|
| 226 |
+
"--logging-filename",
|
| 227 |
+
required=False,
|
| 228 |
+
type=str,
|
| 229 |
+
default=ray_constants.MONITOR_LOG_FILE_NAME,
|
| 230 |
+
help="Specify the name of log file, "
|
| 231 |
+
"log to stdout if set empty, default is "
|
| 232 |
+
f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
|
| 233 |
+
)
|
| 234 |
+
parser.add_argument(
|
| 235 |
+
"--logs-dir",
|
| 236 |
+
required=True,
|
| 237 |
+
type=str,
|
| 238 |
+
help="Specify the path of the temporary directory used by Ray processes.",
|
| 239 |
+
)
|
| 240 |
+
parser.add_argument(
|
| 241 |
+
"--logging-rotate-bytes",
|
| 242 |
+
required=False,
|
| 243 |
+
type=int,
|
| 244 |
+
default=ray_constants.LOGGING_ROTATE_BYTES,
|
| 245 |
+
help="Specify the max bytes for rotating "
|
| 246 |
+
"log file, default is "
|
| 247 |
+
f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
|
| 248 |
+
)
|
| 249 |
+
parser.add_argument(
|
| 250 |
+
"--logging-rotate-backup-count",
|
| 251 |
+
required=False,
|
| 252 |
+
type=int,
|
| 253 |
+
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
| 254 |
+
help="Specify the backup count of rotated log file, default is "
|
| 255 |
+
f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
|
| 256 |
+
)
|
| 257 |
+
parser.add_argument(
|
| 258 |
+
"--monitor-ip",
|
| 259 |
+
required=False,
|
| 260 |
+
type=str,
|
| 261 |
+
default=None,
|
| 262 |
+
help="The IP address of the machine hosting the monitor process.",
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
args = parser.parse_args()
|
| 266 |
+
setup_component_logger(
|
| 267 |
+
logging_level=args.logging_level,
|
| 268 |
+
logging_format=args.logging_format,
|
| 269 |
+
log_dir=args.logs_dir,
|
| 270 |
+
filename=args.logging_filename,
|
| 271 |
+
max_bytes=args.logging_rotate_bytes,
|
| 272 |
+
backup_count=args.logging_rotate_backup_count,
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
logger.info(
|
| 276 |
+
f"Starting autoscaler v2 monitor using ray installation: {ray.__file__}"
|
| 277 |
+
)
|
| 278 |
+
logger.info(f"Ray version: {ray.__version__}")
|
| 279 |
+
logger.info(f"Ray commit: {ray.__commit__}")
|
| 280 |
+
logger.info(f"AutoscalerMonitor started with command: {sys.argv}")
|
| 281 |
+
|
| 282 |
+
gcs_address = args.gcs_address
|
| 283 |
+
if gcs_address is None:
|
| 284 |
+
raise ValueError("--gcs-address must be set!")
|
| 285 |
+
|
| 286 |
+
if not args.autoscaling_config:
|
| 287 |
+
logger.info("No autoscaling config provided: use read only node provider.")
|
| 288 |
+
config_reader = ReadOnlyProviderConfigReader(gcs_address)
|
| 289 |
+
else:
|
| 290 |
+
autoscaling_config = os.path.expanduser(args.autoscaling_config)
|
| 291 |
+
config_reader = FileConfigReader(
|
| 292 |
+
config_file=autoscaling_config, skip_content_hash=True
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
monitor = AutoscalerMonitor(
|
| 296 |
+
gcs_address,
|
| 297 |
+
config_reader,
|
| 298 |
+
log_dir=args.logs_dir,
|
| 299 |
+
monitor_ip=args.monitor_ip,
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
monitor.run()
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py
ADDED
|
@@ -0,0 +1,1642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
import uuid
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from typing import Dict, List, Optional, Tuple
|
| 10 |
+
|
| 11 |
+
from ray._private.protobuf_compat import message_to_dict
|
| 12 |
+
from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
|
| 13 |
+
from ray.autoscaler._private.resource_demand_scheduler import (
|
| 14 |
+
UtilizationScore,
|
| 15 |
+
_fits,
|
| 16 |
+
_inplace_subtract,
|
| 17 |
+
)
|
| 18 |
+
from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
|
| 19 |
+
from ray.autoscaler.v2.instance_manager.common import InstanceUtil
|
| 20 |
+
from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
|
| 21 |
+
from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
|
| 22 |
+
from ray.autoscaler.v2.utils import ProtobufUtil, ResourceRequestUtil
|
| 23 |
+
from ray.core.generated.autoscaler_pb2 import (
|
| 24 |
+
ClusterResourceConstraint,
|
| 25 |
+
GangResourceRequest,
|
| 26 |
+
ResourceRequest,
|
| 27 |
+
ResourceRequestByCount,
|
| 28 |
+
)
|
| 29 |
+
from ray.core.generated.instance_manager_pb2 import (
|
| 30 |
+
Instance,
|
| 31 |
+
LaunchRequest,
|
| 32 |
+
NodeKind,
|
| 33 |
+
TerminationRequest,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# ============= Resource Scheduling Service API =======================
|
| 37 |
+
#
|
| 38 |
+
# ResourceSchedulerService is a service that schedules resource bundles
|
| 39 |
+
# to nodes. It's used by the autoscaler to schedule resource bundles
|
| 40 |
+
# to determine the desired cluster size to satisfy the current resource
|
| 41 |
+
# demands.
|
| 42 |
+
#
|
| 43 |
+
logger = logging.getLogger(__name__)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
|
| 47 |
+
class SchedulingRequest:
|
| 48 |
+
# If outdated node check through launch config is disabled.
|
| 49 |
+
disable_launch_config_check: bool
|
| 50 |
+
# Available node type configs
|
| 51 |
+
node_type_configs: Dict[NodeType, NodeTypeConfig] = field(default_factory=dict)
|
| 52 |
+
# Max number of worker nodes.
|
| 53 |
+
max_num_nodes: Optional[int] = None
|
| 54 |
+
# Idle timeout in seconds.
|
| 55 |
+
idle_timeout_s: Optional[float] = None
|
| 56 |
+
# TODO: This prob could be refactored into the ClusterStatus data class later.
|
| 57 |
+
# The current ray resource requests.
|
| 58 |
+
resource_requests: List[ResourceRequestByCount] = field(default_factory=list)
|
| 59 |
+
# The Gang resource requests.
|
| 60 |
+
gang_resource_requests: List[GangResourceRequest] = field(default_factory=list)
|
| 61 |
+
# cluster resource constraints.
|
| 62 |
+
cluster_resource_constraints: List[ClusterResourceConstraint] = field(
|
| 63 |
+
default_factory=list
|
| 64 |
+
)
|
| 65 |
+
# The current instances.
|
| 66 |
+
current_instances: List[AutoscalerInstance] = field(default_factory=list)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
|
| 70 |
+
class SchedulingReply:
|
| 71 |
+
# Instances to launch.
|
| 72 |
+
to_launch: List[LaunchRequest] = field(default_factory=list)
|
| 73 |
+
# To terminate.
|
| 74 |
+
to_terminate: List[TerminationRequest] = field(default_factory=list)
|
| 75 |
+
# The infeasible resource bundles.
|
| 76 |
+
infeasible_resource_requests: List[ResourceRequest] = field(default_factory=list)
|
| 77 |
+
# The infeasible gang resource bundles.
|
| 78 |
+
infeasible_gang_resource_requests: List[GangResourceRequest] = field(
|
| 79 |
+
default_factory=list
|
| 80 |
+
)
|
| 81 |
+
# The infeasible cluster resource constraints.
|
| 82 |
+
infeasible_cluster_resource_constraints: List[ClusterResourceConstraint] = field(
|
| 83 |
+
default_factory=list
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class IResourceScheduler(ABC):
|
| 88 |
+
"""
|
| 89 |
+
Interface for a resource scheduler.
|
| 90 |
+
|
| 91 |
+
Implements the `instance_manager.proto ResourceSchedulerService` interface.
|
| 92 |
+
"""
|
| 93 |
+
|
| 94 |
+
@abstractmethod
|
| 95 |
+
def schedule(self, request: SchedulingRequest) -> SchedulingReply:
|
| 96 |
+
"""
|
| 97 |
+
Given the resource requests and the current cluster state, calculate the
|
| 98 |
+
target cluster shape by trying to schedule the resource requests on the
|
| 99 |
+
nodes.
|
| 100 |
+
"""
|
| 101 |
+
pass
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class SchedulingNodeStatus(Enum):
|
| 105 |
+
"""
|
| 106 |
+
The status of a scheduling node (`SchedulingNode`)
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
# The node is added by the ResourceDemandScheduler.
|
| 110 |
+
TO_LAUNCH = "TO_LAUNCH"
|
| 111 |
+
# The node is pending, i.e. there's already an autoscaler instance being launched
|
| 112 |
+
# The node is schedulable. It could be running ray or pending to run ray. Either
|
| 113 |
+
# Way, it should be able to accept new resource requests/resource constraints.
|
| 114 |
+
SCHEDULABLE = "SCHEDULABLE"
|
| 115 |
+
# The node is to be terminated by the ResourceDemandScheduler
|
| 116 |
+
TO_TERMINATE = "TO_TERMINATE"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class ResourceRequestSource(Enum):
|
| 120 |
+
"""
|
| 121 |
+
The source of the resource request.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
# The resource request is from demand, e.g. ray tasks/actors,
|
| 125 |
+
# placement groups, etc.
|
| 126 |
+
PENDING_DEMAND = "PENDING_DEMAND"
|
| 127 |
+
# The resource request is from the cluster resource constraints, i.e.
|
| 128 |
+
# from ray.autoscaler.sdk.request_resources().
|
| 129 |
+
CLUSTER_RESOURCE_CONSTRAINT = "CLUSTER_RESOURCE_CONSTRAINT"
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
|
| 133 |
+
class SchedulingNode:
|
| 134 |
+
"""
|
| 135 |
+
A abstraction of a node that can be scheduled on by the resource scheduler.
|
| 136 |
+
|
| 137 |
+
A scheduling node is expected to be used as:
|
| 138 |
+
|
| 139 |
+
node = SchedulingNode.new(instance, node_configs)
|
| 140 |
+
remaining, score = node.try_schedule(requests)
|
| 141 |
+
|
| 142 |
+
.... do something with the score ....
|
| 143 |
+
|
| 144 |
+
NOTE:
|
| 145 |
+
One could also extend the scheduling behavior by overriding `try_schedule`
|
| 146 |
+
"""
|
| 147 |
+
|
| 148 |
+
# Node type name.
|
| 149 |
+
node_type: NodeType
|
| 150 |
+
# Status
|
| 151 |
+
status: SchedulingNodeStatus
|
| 152 |
+
# Resource requests scheduled on this nodes for different sources.
|
| 153 |
+
sched_requests: Dict[ResourceRequestSource, List[ResourceRequest]] = field(
|
| 154 |
+
default_factory=lambda: defaultdict(list)
|
| 155 |
+
)
|
| 156 |
+
# Available resources for different sources of requests.
|
| 157 |
+
available_resources_for_sched: Dict[
|
| 158 |
+
ResourceRequestSource, Dict[str, float]
|
| 159 |
+
] = field(default_factory=dict)
|
| 160 |
+
# The node's current resource capacity.
|
| 161 |
+
total_resources: Dict[str, float] = field(default_factory=dict)
|
| 162 |
+
# Node's labels, including static or dynamic labels.
|
| 163 |
+
labels: Dict[str, str] = field(default_factory=dict)
|
| 164 |
+
# Observability descriptive message for why the node was launched in the
|
| 165 |
+
# first place.
|
| 166 |
+
launch_reason: Optional[str] = None
|
| 167 |
+
# Termination request, none when the node is not being terminated.
|
| 168 |
+
termination_request: Optional[TerminationRequest] = None
|
| 169 |
+
# The instance id of the IM(Instance Manager) instance. None if the node
|
| 170 |
+
# is not yet in IM.
|
| 171 |
+
im_instance_id: Optional[str] = None
|
| 172 |
+
# The ray node id of the ray node. None if the node is not included in
|
| 173 |
+
# ray cluster's GCS report yet (not running ray yet).
|
| 174 |
+
ray_node_id: Optional[str] = None
|
| 175 |
+
# Idle duration in ms. Default not idle.
|
| 176 |
+
idle_duration_ms: int = 0
|
| 177 |
+
# Launch config hash.
|
| 178 |
+
launch_config_hash: Optional[str] = None
|
| 179 |
+
# node kind.
|
| 180 |
+
node_kind: NodeKind = NodeKind.WORKER
|
| 181 |
+
|
| 182 |
+
def __init__(
|
| 183 |
+
self,
|
| 184 |
+
node_type: NodeType,
|
| 185 |
+
total_resources: Dict[str, float],
|
| 186 |
+
available_resources: Dict[str, float],
|
| 187 |
+
labels: Dict[str, str],
|
| 188 |
+
status: SchedulingNodeStatus,
|
| 189 |
+
im_instance_id: str = "",
|
| 190 |
+
ray_node_id: str = "",
|
| 191 |
+
idle_duration_ms: int = 0,
|
| 192 |
+
launch_config_hash: str = "",
|
| 193 |
+
node_kind: NodeKind = NodeKind.WORKER,
|
| 194 |
+
termination_request: Optional[TerminationRequest] = None,
|
| 195 |
+
):
|
| 196 |
+
self.node_type = node_type
|
| 197 |
+
self.total_resources = total_resources
|
| 198 |
+
self.available_resources_for_sched = {
|
| 199 |
+
ResourceRequestSource.PENDING_DEMAND: dict(available_resources),
|
| 200 |
+
ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: dict(total_resources),
|
| 201 |
+
}
|
| 202 |
+
self.sched_requests = {
|
| 203 |
+
ResourceRequestSource.PENDING_DEMAND: [],
|
| 204 |
+
ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: [],
|
| 205 |
+
}
|
| 206 |
+
self.labels = labels
|
| 207 |
+
self.status = status
|
| 208 |
+
self.im_instance_id = im_instance_id
|
| 209 |
+
self.ray_node_id = ray_node_id
|
| 210 |
+
self.idle_duration_ms = idle_duration_ms
|
| 211 |
+
self.launch_config_hash = launch_config_hash
|
| 212 |
+
self.node_kind = node_kind
|
| 213 |
+
self.termination_request = termination_request
|
| 214 |
+
|
| 215 |
+
def get_available_resources(self, resource_request_source: ResourceRequestSource):
|
| 216 |
+
"""Get the available resources for the given resource request source."""
|
| 217 |
+
return self.available_resources_for_sched[resource_request_source]
|
| 218 |
+
|
| 219 |
+
def get_sched_requests(self, resource_request_source: ResourceRequestSource):
|
| 220 |
+
"""Get the resource requests for the given resource request source."""
|
| 221 |
+
return self.sched_requests[resource_request_source]
|
| 222 |
+
|
| 223 |
+
def add_sched_request(
|
| 224 |
+
self,
|
| 225 |
+
request: ResourceRequest,
|
| 226 |
+
resource_request_source: ResourceRequestSource,
|
| 227 |
+
):
|
| 228 |
+
"""
|
| 229 |
+
Add the resource requests to the node.
|
| 230 |
+
|
| 231 |
+
Args:
|
| 232 |
+
request: The resource request to be added.
|
| 233 |
+
resource_request_source: The source of the resource request.
|
| 234 |
+
"""
|
| 235 |
+
self.sched_requests[resource_request_source].append(request)
|
| 236 |
+
|
| 237 |
+
@staticmethod
|
| 238 |
+
def new(
|
| 239 |
+
instance: AutoscalerInstance,
|
| 240 |
+
node_type_configs: Dict[NodeType, NodeTypeConfig],
|
| 241 |
+
disable_launch_config_check: bool,
|
| 242 |
+
) -> Optional["SchedulingNode"]:
|
| 243 |
+
"""
|
| 244 |
+
Create a new scheduling node from an autoscaler instance.
|
| 245 |
+
|
| 246 |
+
It creates:
|
| 247 |
+
- None if the instance is not schedulable by IM.
|
| 248 |
+
- A schedulable node if the instance is running ray or pending to run ray,
|
| 249 |
+
so it should be considered in the scheduling process.
|
| 250 |
+
|
| 251 |
+
Args:
|
| 252 |
+
instance: The instance.
|
| 253 |
+
node_type_configs: The node type configs.
|
| 254 |
+
disable_launch_config_check: If outdated node check through launch config is
|
| 255 |
+
disabled.
|
| 256 |
+
|
| 257 |
+
"""
|
| 258 |
+
if not SchedulingNode.is_schedulable(instance):
|
| 259 |
+
return None
|
| 260 |
+
|
| 261 |
+
if instance.im_instance.status == Instance.RAY_RUNNING:
|
| 262 |
+
assert instance.ray_node is not None, (
|
| 263 |
+
"ray node should not be None "
|
| 264 |
+
f"when the instance is running ray: instance={instance}"
|
| 265 |
+
)
|
| 266 |
+
# An running ray node
|
| 267 |
+
return SchedulingNode(
|
| 268 |
+
node_type=instance.im_instance.instance_type,
|
| 269 |
+
total_resources=dict(instance.ray_node.total_resources),
|
| 270 |
+
# Available resources for scheduling requests of different
|
| 271 |
+
# sources.
|
| 272 |
+
available_resources=dict(instance.ray_node.available_resources),
|
| 273 |
+
# Use ray node's dynamic labels.
|
| 274 |
+
labels=dict(instance.ray_node.dynamic_labels),
|
| 275 |
+
status=SchedulingNodeStatus.SCHEDULABLE,
|
| 276 |
+
im_instance_id=instance.im_instance.instance_id,
|
| 277 |
+
ray_node_id=instance.im_instance.node_id,
|
| 278 |
+
idle_duration_ms=instance.ray_node.idle_duration_ms,
|
| 279 |
+
launch_config_hash=instance.im_instance.launch_config_hash,
|
| 280 |
+
node_kind=instance.im_instance.node_kind,
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
# This is an instance pending to run ray. Initialize a schedulable node
|
| 284 |
+
# from the node type config.
|
| 285 |
+
node_config = node_type_configs.get(instance.im_instance.instance_type, None)
|
| 286 |
+
if node_config is None:
|
| 287 |
+
if disable_launch_config_check:
|
| 288 |
+
# We are not terminating outdated nodes.
|
| 289 |
+
logger.info(
|
| 290 |
+
f"Node config for {instance.im_instance.instance_type} is missing, "
|
| 291 |
+
"but we are not terminating the outdated node because "
|
| 292 |
+
"`disable_launch_config_check` is True in "
|
| 293 |
+
"the autoscaler's provider config."
|
| 294 |
+
)
|
| 295 |
+
return None
|
| 296 |
+
|
| 297 |
+
# Configs might have been updated, and no more
|
| 298 |
+
# node_type_configs for this node type. We should terminate it.
|
| 299 |
+
return SchedulingNode(
|
| 300 |
+
node_type=instance.im_instance.instance_type,
|
| 301 |
+
total_resources={},
|
| 302 |
+
available_resources={},
|
| 303 |
+
labels={},
|
| 304 |
+
status=SchedulingNodeStatus.TO_TERMINATE,
|
| 305 |
+
im_instance_id=instance.im_instance.instance_id,
|
| 306 |
+
termination_request=TerminationRequest(
|
| 307 |
+
id=str(uuid.uuid4()),
|
| 308 |
+
instance_id=instance.im_instance.instance_id,
|
| 309 |
+
cause=TerminationRequest.Cause.OUTDATED,
|
| 310 |
+
instance_type=instance.im_instance.instance_type,
|
| 311 |
+
),
|
| 312 |
+
node_kind=NodeKind.WORKER,
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
return SchedulingNode.from_node_config(
|
| 316 |
+
node_config,
|
| 317 |
+
SchedulingNodeStatus.SCHEDULABLE,
|
| 318 |
+
node_kind=instance.im_instance.node_kind,
|
| 319 |
+
im_instance_id=instance.im_instance.instance_id,
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
@staticmethod
|
| 323 |
+
def is_schedulable(instance: AutoscalerInstance) -> bool:
|
| 324 |
+
"""
|
| 325 |
+
Check if the instance is schedulable by IM.
|
| 326 |
+
|
| 327 |
+
Args:
|
| 328 |
+
instance: The instance.
|
| 329 |
+
|
| 330 |
+
Returns:
|
| 331 |
+
True if the instance is schedulable by IM.
|
| 332 |
+
"""
|
| 333 |
+
if instance.im_instance is None:
|
| 334 |
+
# We will skip any instances that are not yet in IM which
|
| 335 |
+
# could be
|
| 336 |
+
# 1. an out-of-band ray node
|
| 337 |
+
# 2. an cloud instance running ray not yet discovered
|
| 338 |
+
# by the IM's Reconciler
|
| 339 |
+
# 3. an cloud instance already terminated but ray state
|
| 340 |
+
# still lagging behind.
|
| 341 |
+
#
|
| 342 |
+
# In all of these cases, the instance is not schedulable or
|
| 343 |
+
# shouldn't be managed by IM, so we don't consider them.
|
| 344 |
+
return False
|
| 345 |
+
|
| 346 |
+
# These are the statuses where there's a running ray node or
|
| 347 |
+
# could eventually run ray.
|
| 348 |
+
if InstanceUtil.is_ray_running_reachable(instance.im_instance.status):
|
| 349 |
+
return True
|
| 350 |
+
|
| 351 |
+
return False
|
| 352 |
+
|
| 353 |
+
@staticmethod
|
| 354 |
+
def from_node_config(
|
| 355 |
+
node_config: NodeTypeConfig,
|
| 356 |
+
status: SchedulingNodeStatus,
|
| 357 |
+
node_kind: NodeKind,
|
| 358 |
+
im_instance_id: Optional[str] = None,
|
| 359 |
+
) -> "SchedulingNode":
|
| 360 |
+
"""
|
| 361 |
+
Create a scheduling node from a node config.
|
| 362 |
+
|
| 363 |
+
Args:
|
| 364 |
+
node_config: The node config.
|
| 365 |
+
status: The status of the node.
|
| 366 |
+
node_kind: The node kind.
|
| 367 |
+
im_instance_id: The instance id of the im instance.
|
| 368 |
+
node_kind: The node kind.
|
| 369 |
+
"""
|
| 370 |
+
return SchedulingNode(
|
| 371 |
+
node_type=node_config.name,
|
| 372 |
+
total_resources=dict(node_config.resources),
|
| 373 |
+
available_resources=dict(node_config.resources),
|
| 374 |
+
labels=dict(node_config.labels),
|
| 375 |
+
status=status,
|
| 376 |
+
im_instance_id=im_instance_id,
|
| 377 |
+
node_kind=node_kind,
|
| 378 |
+
)
|
| 379 |
+
|
| 380 |
+
def __post_init__(self):
|
| 381 |
+
assert self.node_type, "node_type should be set"
|
| 382 |
+
|
| 383 |
+
def try_schedule(
|
| 384 |
+
self,
|
| 385 |
+
requests: List[ResourceRequest],
|
| 386 |
+
resource_request_source: ResourceRequestSource,
|
| 387 |
+
) -> Tuple[List[ResourceRequest], UtilizationScore]:
|
| 388 |
+
"""
|
| 389 |
+
Try to schedule the resource requests on this node.
|
| 390 |
+
|
| 391 |
+
This modifies the node's available resources if the requests are schedulable.
|
| 392 |
+
The requests are scheduled one by one in the sorted order, and no
|
| 393 |
+
backtracking is done.
|
| 394 |
+
|
| 395 |
+
Args:
|
| 396 |
+
requests: The resource requests to be scheduled.
|
| 397 |
+
resource_request_source: The source of the resource request, i.e.
|
| 398 |
+
pending demands from ray actors/tasks or cluster resource constraints.
|
| 399 |
+
|
| 400 |
+
Returns:
|
| 401 |
+
A tuple of:
|
| 402 |
+
- list of remaining requests that cannot be scheduled on this node.
|
| 403 |
+
- the utilization score for this node with respect to the current
|
| 404 |
+
resource requests being scheduled.
|
| 405 |
+
"""
|
| 406 |
+
# Track the resource requests that cannot be scheduled on this node.
|
| 407 |
+
unschedulable_requests = []
|
| 408 |
+
|
| 409 |
+
# Sort the requests and try schedule them one by one.
|
| 410 |
+
for r in requests:
|
| 411 |
+
if not self._try_schedule_one(r, resource_request_source):
|
| 412 |
+
unschedulable_requests.append(r)
|
| 413 |
+
|
| 414 |
+
score = self._compute_score(resource_request_source)
|
| 415 |
+
|
| 416 |
+
return unschedulable_requests, score
|
| 417 |
+
|
| 418 |
+
    def _compute_score(
        self, resource_request_source: ResourceRequestSource
    ) -> UtilizationScore:
        """
        Compute the utilization score for this node with respect to the current resource
        request being scheduled.

        A "higher" score means that this node is more suitable for scheduling the
        current scheduled resource requests.

        The score is a tuple of 4 values:
            1. Whether this node is a GPU node and the current resource request has
                GPU requirements:
                    0: if this node is a GPU node and the current resource request
                    placed onto the node has no GPU requirements.
                    1: if this node is not a GPU node or the current resource request
                    placed onto the node has GPU requirements.
            2. The number of resource types being scheduled.
            3. The minimum utilization rate across all resource types.
            4. The average utilization rate across all resource types.

        NOTE:
            This function is adapted from _resource_based_utilization_scorer from
            autoscaler v1.

        TODO(rickyx,jjyao): We should also consider node labels for
            scoring. For example, if a node has a label that matches the affinity
            label of the resource request, we should give it a higher score.

        TODO(rickyx): add pluggable scoring functions here.

        Returns:
            A utilization score for this node.
        """

        sched_requests = self.get_sched_requests(resource_request_source)
        available_resources = self.get_available_resources(resource_request_source)

        # Compute the number of resource types being scheduled.
        num_matching_resource_types = 0
        sched_resource_types = set()
        for req in sched_requests:
            for resource_name, v in req.resources_bundle.items():
                if v > 0:
                    # Only resource types actually demanded (positive amount)
                    # count toward the match score.
                    sched_resource_types.add(resource_name)

        for sched_resource_type in sched_resource_types:
            if sched_resource_type in self.total_resources:
                num_matching_resource_types += 1

        # Compute the utilization rate for each resource type
        util_by_resources = []
        for k, v in self.total_resources.items():
            if v == 0:
                # Skip any zero values.
                continue
            # NOTE: a resource present in total_resources but absent from
            # available_resources is skipped entirely (not treated as fully
            # utilized) — it then contributes nothing to the min/avg terms.
            if k in available_resources:
                util = (v - available_resources.get(k, 0)) / v
                assert util >= 0 and util <= 1, f"Invalid utilization: {util}"
                # Weight by capacity and cube the utilization so that large,
                # highly-utilized resources dominate the score.
                util_by_resources.append(v * (util**3))

        # Prefer not to launch a GPU node if there aren't any GPU requirements in the
        # resource bundle.
        gpu_ok = True
        if AUTOSCALER_CONSERVE_GPU_NODES:
            # TODO: we should also generalize this optimization for accelerators.
            # https://github.com/ray-project/ray/issues/43079
            is_gpu_node = self.total_resources.get("GPU", 0) > 0
            any_gpu_requests = any("GPU" in r.resources_bundle for r in sched_requests)
            if is_gpu_node and not any_gpu_requests:
                gpu_ok = False

        # Prioritize avoiding gpu nodes for non-gpu workloads first,
        # then prioritize matching multiple resource types,
        # then prioritize using all resources,
        # then prioritize overall balance of multiple resources.
        return (
            gpu_ok,
            num_matching_resource_types,
            min(util_by_resources) if util_by_resources else 0,
            float(sum(util_by_resources)) / len(util_by_resources)
            if util_by_resources
            else 0,
        )
+
def _try_schedule_one(
|
| 504 |
+
self, request: ResourceRequest, resource_request_source: ResourceRequestSource
|
| 505 |
+
) -> bool:
|
| 506 |
+
"""
|
| 507 |
+
Try to schedule one resource request on this node. The request could be from
|
| 508 |
+
various sources, specified by `resource_request_source`.
|
| 509 |
+
|
| 510 |
+
Args:
|
| 511 |
+
request: The resource request to be scheduled.
|
| 512 |
+
resource_request_source: The source of the resource request, i.e.
|
| 513 |
+
pending demands from ray actors/tasks or cluster resource constraints.
|
| 514 |
+
|
| 515 |
+
Returns:
|
| 516 |
+
True if the resource request is scheduled on this node.
|
| 517 |
+
"""
|
| 518 |
+
|
| 519 |
+
# Check if there's placement constraints that are not satisfied.
|
| 520 |
+
for constraint in request.placement_constraints:
|
| 521 |
+
if constraint.HasField("anti_affinity"):
|
| 522 |
+
anti_affinity = constraint.anti_affinity
|
| 523 |
+
if (
|
| 524 |
+
anti_affinity.label_name in self.labels
|
| 525 |
+
and anti_affinity.label_value
|
| 526 |
+
== self.labels[anti_affinity.label_name]
|
| 527 |
+
):
|
| 528 |
+
# The node already has a label that matches the anti-affinity
|
| 529 |
+
return False
|
| 530 |
+
|
| 531 |
+
# We don't need to check for affinity constraints here since
|
| 532 |
+
# we have already combined resource requests with the affinity
|
| 533 |
+
# constraints into the same request at `combine_requests_with_affinity`.
|
| 534 |
+
pass
|
| 535 |
+
|
| 536 |
+
available_resources_dict = self.get_available_resources(resource_request_source)
|
| 537 |
+
|
| 538 |
+
# Check if there's enough resources to schedule the request.
|
| 539 |
+
if not _fits(available_resources_dict, dict(request.resources_bundle)):
|
| 540 |
+
return False
|
| 541 |
+
|
| 542 |
+
# Schedule the request, update resources
|
| 543 |
+
_inplace_subtract(available_resources_dict, dict(request.resources_bundle))
|
| 544 |
+
|
| 545 |
+
# Add the request to the node.
|
| 546 |
+
self.add_sched_request(request, resource_request_source)
|
| 547 |
+
|
| 548 |
+
# Update the dynamic labels if there's any
|
| 549 |
+
for constraint in request.placement_constraints:
|
| 550 |
+
# We don't need to check for affinity constraints here since
|
| 551 |
+
# we have already combined resource requests with the affinity
|
| 552 |
+
# constraints into the same request at `combine_requests_with_affinity`.
|
| 553 |
+
# We don't need node labels for enforcing affinity.
|
| 554 |
+
if constraint.HasField("anti_affinity"):
|
| 555 |
+
anti_affinity = constraint.anti_affinity
|
| 556 |
+
self._add_label(anti_affinity.label_name, anti_affinity.label_value)
|
| 557 |
+
|
| 558 |
+
return True
|
| 559 |
+
|
| 560 |
+
def _add_label(self, label_name: str, label_value: str):
|
| 561 |
+
"""
|
| 562 |
+
Add a label to the node.
|
| 563 |
+
This assumes a label key can only have one value.
|
| 564 |
+
"""
|
| 565 |
+
assert (
|
| 566 |
+
self.labels.get(label_name) is None
|
| 567 |
+
or self.labels[label_name] == label_value
|
| 568 |
+
), (
|
| 569 |
+
f"Label {label_name} already exists with value "
|
| 570 |
+
f"{self.labels[label_name]}, cannot set to "
|
| 571 |
+
f"{label_value}"
|
| 572 |
+
)
|
| 573 |
+
self.labels[label_name] = label_value
|
| 574 |
+
|
| 575 |
+
    def __repr__(self) -> str:
        """Return a verbose, single-line debug representation of this node.

        Termination/scheduled requests are rendered via protobuf
        ``message_to_dict`` so the string is self-contained in logs.
        NOTE(review): the template contains stray ')' characters after
        ``launch_reason={launch_reason}`` and
        ``sched_requests_for_demand={...}`` — cosmetic only; confirm before
        fixing since log parsers may already match this exact shape.
        """
        return (
            "SchedulingNode(node_type={node_type}, "
            "node_kind={node_kind}, "
            "instance_id={instance_id},"
            "ray_node_id={ray_node_id},"
            "idle_duration_ms={idle_duration_ms},"
            "termination_request={termination_request},"
            "status={status}, "
            "total_resources={total_resources}, "
            "available_resources_for_demand={available_resources_for_demand}, "
            "available_resources_for_cluster_resource_constraints="
            "{available_resources_for_cluster_resource_constraints},"
            "labels={labels}, launch_reason={launch_reason}), "
            "sched_requests_for_demand={sched_requests_for_demand}), "
            "sched_requests_for_cluster_resource_constraints="
            "{sched_requests_for_cluster_resources_constraint})"
        ).format(
            node_type=self.node_type,
            node_kind=self.node_kind,
            instance_id=self.im_instance_id,
            ray_node_id=self.ray_node_id,
            idle_duration_ms=self.idle_duration_ms,
            termination_request=str(message_to_dict(self.termination_request))
            if self.termination_request
            else None,
            status=self.status,
            total_resources=self.total_resources,
            available_resources_for_demand=self.available_resources_for_sched[
                ResourceRequestSource.PENDING_DEMAND
            ],
            available_resources_for_cluster_resource_constraints=self.available_resources_for_sched[  # noqa
                ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
            ],
            labels=self.labels,
            launch_reason=self.launch_reason,
            sched_requests_for_demand="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[ResourceRequestSource.PENDING_DEMAND]
            ),
            sched_requests_for_cluster_resources_constraint="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[
                    ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
                ]
            ),
        )
| 623 |
+
|
| 624 |
+
class ResourceDemandScheduler(IResourceScheduler):
|
| 625 |
+
"""
|
| 626 |
+
A resource demand scheduler that schedules resource requests based on the
|
| 627 |
+
following rules:
|
| 628 |
+
1. Enforce the minimal count of nodes for each worker node type.
|
| 629 |
+
2. Enforce the cluster resource constraints.
|
| 630 |
+
3. Schedule the gang resource requests.
|
| 631 |
+
4. Schedule the tasks/actor resource requests
|
| 632 |
+
"""
|
| 633 |
+
|
| 634 |
+
    def __init__(self, event_logger: Optional[AutoscalerEventLogger] = None):
        """Create a scheduler.

        Args:
            event_logger: Optional sink for structured autoscaler events;
                when None, scheduling results are only reported through the
                module logger.
        """
        self._event_logger = event_logger
+
    @dataclass
    class ScheduleContext:
        """
        Encapsulates the context for processing one scheduling request.

        This exposes functions to read and write the scheduling nodes, to prevent
        accidental modification of the internal state.

        NOTE: a hand-written ``__init__`` is defined below; the dataclass
        machinery does not overwrite an explicitly defined ``__init__``, so
        the field defaults here only document the shape of the state.
        """

        # The node type configs for this scheduling request.
        _node_type_configs: Dict[NodeType, NodeTypeConfig]
        # If outdated node check through launch config is disabled.
        _disable_launch_config_check: bool
        # The max number of nodes for the entire cluster.
        _max_num_nodes: Optional[int] = None
        # The idle timeout in seconds.
        _idle_timeout_s: Optional[float] = None
        # The current schedulable nodes (including pending nodes and pending requests).
        _nodes: List[SchedulingNode] = field(default_factory=list)
        # The number of nodes by node types available for launching based on the max
        # number of workers in the config. This takes into account any pending/running
        # nodes.
        _node_type_available: Dict[NodeType, int] = field(default_factory=dict)

        def __init__(
            self,
            nodes: List[SchedulingNode],
            node_type_configs: Dict[NodeType, NodeTypeConfig],
            disable_launch_config_check: bool,
            max_num_nodes: Optional[int] = None,
            idle_timeout_s: Optional[float] = None,
        ):
            self._nodes = nodes
            self._node_type_configs = node_type_configs
            # Derived state: per-type launch headroom, recomputed on update().
            self._node_type_available = self._compute_available_node_types(
                nodes, node_type_configs
            )
            self._max_num_nodes = max_num_nodes
            self._idle_timeout_s = idle_timeout_s
            self._disable_launch_config_check = disable_launch_config_check

        @classmethod
        def from_schedule_request(
            cls, req: SchedulingRequest
        ) -> "ResourceDemandScheduler.ScheduleContext":
            """
            Create a schedule context from a schedule request.

            It will populate the context with the existing nodes and the available node
            types from the config.

            Args:
                req: The scheduling request. The caller should make sure the
                    request is valid.
            """

            nodes = []
            node_type_configs = req.node_type_configs

            # Initialize the scheduling nodes.
            for instance in req.current_instances:
                node = SchedulingNode.new(
                    instance, node_type_configs, req.disable_launch_config_check
                )
                # SchedulingNode.new may return None for instances that are
                # not schedulable; those are dropped here.
                if node:
                    nodes.append(node)

            return cls(
                nodes=nodes,
                node_type_configs=node_type_configs,
                disable_launch_config_check=req.disable_launch_config_check,
                max_num_nodes=req.max_num_nodes,
                idle_timeout_s=req.idle_timeout_s,
            )

        @staticmethod
        def _compute_available_node_types(
            nodes: List[SchedulingNode],
            node_type_configs: Dict[NodeType, NodeTypeConfig],
        ) -> Dict[NodeType, int]:
            """
            Compute the number of nodes by node types available for launching based on
            the max number of workers in the config.
            Args:
                nodes: The current existing nodes.
                node_type_configs: The node type configs.
            Returns:
                A dict of node types and the number of nodes available for launching.
                NOTE: a value may be negative when a type is currently
                over-provisioned relative to its max_worker_nodes.
            """
            node_type_available: Dict[NodeType, int] = defaultdict(int)
            node_type_existing: Dict[NodeType, int] = defaultdict(int)
            for node in nodes:
                node_type_existing[node.node_type] += 1

            for (
                node_type,
                node_type_config,
            ) in node_type_configs.items():
                node_type_available[
                    node_type
                ] = node_type_config.max_worker_nodes - node_type_existing.get(
                    node_type, 0
                )

            return node_type_available

        def get_nodes(self) -> List[SchedulingNode]:
            """
            Get the current nodes with filter.

            Returns:
                A list of nodes.  These are deep copies, so callers may
                mutate them freely and commit changes back via update().
            """
            nodes = copy.deepcopy(self._nodes)
            return nodes

        def get_node_type_available(self) -> Dict[NodeType, int]:
            # Deep-copied so callers cannot mutate internal headroom counts.
            return copy.deepcopy(self._node_type_available)

        def get_cluster_shape(self) -> Dict[NodeType, int]:
            # Shape counts only nodes that will remain in the cluster.
            cluster_shape = defaultdict(int)
            for node in self._nodes:
                if node.status == SchedulingNodeStatus.TO_TERMINATE:
                    # Skip the nodes that are to be terminated.
                    continue

                cluster_shape[node.node_type] += 1
            return cluster_shape

        def get_idle_timeout_s(self) -> Optional[float]:
            return self._idle_timeout_s

        def update(self, new_nodes: List[SchedulingNode]) -> None:
            """
            Update the context with the new nodes.

            Replaces the node list wholesale and recomputes per-type launch
            headroom from it.
            """
            self._nodes = new_nodes

            # Update the available node types.
            self._node_type_available = self._compute_available_node_types(
                self._nodes, self._node_type_configs
            )

        def get_max_num_nodes(self) -> Optional[int]:
            """
            Get the max number of nodes for the entire cluster.
            """
            return self._max_num_nodes

        def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
            return self._node_type_configs

        def __str__(self) -> str:
            return "ScheduleContext({} nodes, node_type_available={})".format(
                len(self._nodes), dict(self._node_type_available)
            )

        def get_launch_requests(self) -> List[LaunchRequest]:
            """
            Get the launch requests for the nodes that are to be launched.

            Nodes in TO_LAUNCH state are aggregated into one LaunchRequest
            per instance type.
            """
            launch_by_type = defaultdict(int)
            for node in self._nodes:
                if node.status == SchedulingNodeStatus.TO_LAUNCH:
                    launch_by_type[node.node_type] += 1

            launch_requests = []
            for instance_type, count in launch_by_type.items():
                launch_requests.append(
                    LaunchRequest(
                        instance_type=instance_type,
                        count=count,
                        id=str(uuid.uuid4()),
                        # NOTE(review): time_ns() // 1000 yields microseconds,
                        # not milliseconds — confirm the intended unit of
                        # request_ts_ms (// 1_000_000 would be ms).
                        request_ts_ms=time.time_ns() // 1000,
                    )
                )
            return launch_requests

        def get_terminate_requests(
            self,
        ) -> List[TerminationRequest]:
            """
            Get the terminate requests for the nodes that are to be terminated.
            """
            return [
                node.termination_request
                for node in self._nodes
                if node.termination_request is not None
            ]
+
    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """Process one scheduling request end to end.

        Pipeline (order matters): terminate outdated nodes, enforce
        min-workers per type, enforce max-workers per type, enforce the
        global max node count, enforce cluster resource constraints,
        schedule gang requests, schedule plain task/actor requests, and
        finally terminate idle nodes that are no longer needed.

        Args:
            request: The validated scheduling request.

        Returns:
            A SchedulingReply with launch/termination decisions and any
            infeasible requests/constraints.
        """
        logger.debug(
            "Scheduling for request: resource_request={}, gang_resource_request={}, "
            "cluster_constraint={}".format(
                ResourceRequestUtil.to_dict_list(request.resource_requests),
                ProtobufUtil.to_dict_list(request.gang_resource_requests),
                ProtobufUtil.to_dict_list(request.cluster_resource_constraints),
            )
        )

        ctx = ResourceDemandScheduler.ScheduleContext.from_schedule_request(request)

        # Enforce outdate nodes.
        ResourceDemandScheduler._terminate_outdated_nodes(ctx)

        # Enforce the minimal count of nodes for each worker node type.
        ResourceDemandScheduler._enforce_min_workers_per_type(ctx)

        # Enforce the max worker nodes count.
        ResourceDemandScheduler._enforce_max_workers_per_type(ctx)

        # Enforce the max worker nodes count globally.
        ResourceDemandScheduler._enforce_max_workers_global(ctx)

        # Enforce the cluster resource constraints.
        infeasible_constraints = ResourceDemandScheduler._enforce_resource_constraints(
            ctx, request.cluster_resource_constraints
        )

        # Schedule the gang resource requests.
        infeasible_gang_requests = (
            ResourceDemandScheduler._sched_gang_resource_requests(
                ctx, request.gang_resource_requests
            )
        )

        # Schedule the tasks/actor resource requests
        infeasible_requests = ResourceDemandScheduler._sched_resource_requests(
            ctx,
            ResourceRequestUtil.ungroup_by_count(request.resource_requests),
        )

        # Shutdown any idle nodes that's not needed (e.g. no resource constraints.
        # not needed by min_worker count, etc.)
        ResourceDemandScheduler._enforce_idle_termination(ctx)

        # Compute the number of nodes to launch.
        reply = SchedulingReply(
            infeasible_resource_requests=infeasible_requests,
            infeasible_gang_resource_requests=infeasible_gang_requests,
            infeasible_cluster_resource_constraints=infeasible_constraints,
            to_launch=ctx.get_launch_requests(),
            to_terminate=ctx.get_terminate_requests(),
        )

        if self._event_logger is not None:
            # Event logging is best-effort: a failure here must never abort
            # the scheduling round, hence the broad catch-and-log.
            try:
                self._event_logger.log_cluster_scheduling_update(
                    launch_requests=reply.to_launch,
                    terminate_requests=reply.to_terminate,
                    infeasible_requests=infeasible_requests,
                    infeasible_gang_requests=infeasible_gang_requests,
                    infeasible_cluster_resource_constraints=infeasible_constraints,
                    cluster_shape=ctx.get_cluster_shape(),
                    node_type_configs=ctx.get_node_type_configs(),
                )
            except Exception:
                logger.exception("Failed to emit event logs.")

        return reply
| 897 |
+
    @staticmethod
    def _enforce_max_workers_per_type(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for each node type.

        For each type with more non-terminating nodes than
        ``max_worker_nodes``, selects the excess for termination (the head
        node is never selected) and writes the combined node list back into
        the context.
        """

        # Get all the nodes by type
        all_nodes = ctx.get_nodes()

        non_terminating_nodes_by_type = defaultdict(list)
        terminating_nodes = []
        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes_by_type[node.node_type].append(node)

        # Step 1. Enforce the max number of workers for each node type.
        for node_type in non_terminating_nodes_by_type.keys():
            non_terminate_nodes_of_type = non_terminating_nodes_by_type[node_type]
            node_config = ctx.get_node_type_configs()[node_type]
            num_max_nodes_per_type = node_config.max_worker_nodes
            num_extra_nodes = len(non_terminate_nodes_of_type) - num_max_nodes_per_type

            if num_extra_nodes <= 0:
                # No extra nodes for this type, continue.
                continue

            # Terminate the nodes
            (
                to_terminate,
                remained_nodes,
            ) = ResourceDemandScheduler._select_nodes_to_terminate(
                non_terminate_nodes_of_type,
                num_extra_nodes,
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
                max_num_nodes_per_type=num_max_nodes_per_type,
            )

            non_terminating_nodes_by_type[node_type] = remained_nodes
            terminating_nodes.extend(to_terminate)

        non_terminating_nodes = []
        for nodes in non_terminating_nodes_by_type.values():
            non_terminating_nodes.extend(nodes)

        # Update the context
        # Sanity check: selection must only partition nodes, never drop any.
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes per type."

        ctx.update(terminating_nodes + non_terminating_nodes)

        if terminating_nodes:
            logger.debug(
                f"Terminating {len(terminating_nodes)} "
                "nodes for per node type max num node's constraints."
            )
+
    @staticmethod
    def _enforce_max_workers_global(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for the entire cluster.

        Counts only non-terminating nodes against the global limit; selected
        excess nodes are marked TO_TERMINATE and the context is updated.
        """
        all_nodes = ctx.get_nodes()

        terminating_nodes = []
        non_terminating_nodes = []

        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes.append(node)

        num_max_nodes = ctx.get_max_num_nodes()

        # NOTE(review): the truthiness check means max_num_nodes == 0 is
        # treated the same as None (no limit) — confirm that a hard limit of
        # zero is not a supported configuration.
        num_to_terminate = (
            max(len(non_terminating_nodes) - num_max_nodes, 0) if num_max_nodes else 0
        )

        if num_to_terminate <= 0:
            # No extra nodes needed to terminate.
            return

        # Terminate the nodes
        (
            to_terminate_nodes,
            non_terminating_nodes,
        ) = ResourceDemandScheduler._select_nodes_to_terminate(
            non_terminating_nodes,
            num_to_terminate,
            TerminationRequest.Cause.MAX_NUM_NODES,
            max_num_nodes=num_max_nodes,
        )

        assert len(to_terminate_nodes) == num_to_terminate, (
            "Terminating {} nodes, failed to terminate {} nodes to "
            "satisfy max_num_nodes={}".format(
                len(to_terminate_nodes),
                num_to_terminate - len(to_terminate_nodes),
                num_max_nodes,
            )
        )

        # Update the context
        terminating_nodes.extend(to_terminate_nodes)
        # Sanity check: selection must only partition nodes, never drop any.
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes."

        all_nodes = terminating_nodes + non_terminating_nodes
        ctx.update(all_nodes)
+
    @staticmethod
    def _select_nodes_to_terminate(
        nodes: List[SchedulingNode],
        num_to_terminate: int,
        cause: TerminationRequest.Cause,
        max_num_nodes: Optional[int] = None,
        max_num_nodes_per_type: Optional[int] = None,
    ) -> Tuple[List[SchedulingNode], List[SchedulingNode]]:
        """
        Select 'num_to_terminate' of nodes to be terminated
        from the 'nodes' list. It should never select a head node.

        NOTE: this sorts and pops from the caller's ``nodes`` list in place,
        and mutates the selected nodes (status + termination_request).

        Args:
            nodes: The nodes to be terminated.
            num_to_terminate: The number of nodes to be terminated.
            cause: The cause of the termination. Should be one of
                TerminationRequest.Cause.MAX_NUM_NODES or
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

            max_num_nodes: The max number of nodes for the entire cluster only
                used when the cause is TerminationRequest.Cause.MAX_NUM_NODES.
            max_num_nodes_per_type: The max number of nodes for each node type.
                Only used when the cause is
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

        Returns:
            A tuple of:
                - The terminated nodes.
                - The remained nodes.
        """

        # Sort the nodes for termination.
        nodes.sort(key=ResourceDemandScheduler._sort_nodes_for_termination)

        # Remove the head node from the list.
        head_node = None
        for i, node in enumerate(nodes):
            if node.node_kind == NodeKind.HEAD:
                # Remove the head node from the list.
                head_node = nodes.pop(i)
                break

        terminated_nodes, remained_nodes = (
            nodes[:num_to_terminate],
            # The head could be None if there's no head node being reported yet
            # from the ray cluster.
            nodes[num_to_terminate:] + ([head_node] if head_node else []),
        )

        assert cause in [
            TerminationRequest.Cause.MAX_NUM_NODES,
            TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
        ], "Other termination causes don't have to select nodes for termination."

        for node in terminated_nodes:
            node.status = SchedulingNodeStatus.TO_TERMINATE
            node.termination_request = TerminationRequest(
                id=str(uuid.uuid4()),
                instance_id=node.im_instance_id,
                ray_node_id=node.ray_node_id,
                cause=cause,
                instance_type=node.node_type,
                details=(
                    f"Terminating node due to {TerminationRequest.Cause.Name(cause)}: "
                    f"max_num_nodes={max_num_nodes}, "
                    f"max_num_nodes_per_type={max_num_nodes_per_type}"
                ),
            )
            if cause == TerminationRequest.Cause.MAX_NUM_NODES:
                node.termination_request.max_num_nodes = max_num_nodes
            elif cause == TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE:
                node.termination_request.max_num_nodes_per_type = max_num_nodes_per_type
            else:
                # Defensive: unreachable given the assert above (which is
                # stripped under -O), so keep the explicit error.
                raise ValueError("Unknown termination cause: {}".format(cause))

        return terminated_nodes, remained_nodes
+
@staticmethod
|
| 1093 |
+
def _sort_nodes_for_termination(node: SchedulingNode) -> Tuple:
|
| 1094 |
+
"""
|
| 1095 |
+
Sort the nodes for termination increasingly by:
|
| 1096 |
+
|
| 1097 |
+
1. First if ray hasn't been started yet
|
| 1098 |
+
2. Then if the nodes are idle
|
| 1099 |
+
3. Then with lower resources util nodes first.
|
| 1100 |
+
|
| 1101 |
+
Such that nodes sorted earlier will be terminated first.
|
| 1102 |
+
"""
|
| 1103 |
+
|
| 1104 |
+
running_ray = len(node.ray_node_id) > 0
|
| 1105 |
+
# Reverse the idle duration such that the nodes with the largest idle duration
|
| 1106 |
+
# will be terminated first.
|
| 1107 |
+
idle_dur = -1 * node.idle_duration_ms
|
| 1108 |
+
available_resources = node.get_available_resources(
|
| 1109 |
+
ResourceRequestSource.PENDING_DEMAND
|
| 1110 |
+
)
|
| 1111 |
+
|
| 1112 |
+
utils_per_resources = {}
|
| 1113 |
+
for resource, total in node.total_resources.items():
|
| 1114 |
+
if total <= 0:
|
| 1115 |
+
continue
|
| 1116 |
+
utils_per_resources[resource] = (
|
| 1117 |
+
total - available_resources.get(resource, 0)
|
| 1118 |
+
) / total
|
| 1119 |
+
|
| 1120 |
+
avg_util = (
|
| 1121 |
+
sum(utils_per_resources.values()) / len(utils_per_resources)
|
| 1122 |
+
if utils_per_resources
|
| 1123 |
+
else 0
|
| 1124 |
+
)
|
| 1125 |
+
|
| 1126 |
+
return (running_ray, idle_dur, avg_util)
|
| 1127 |
+
|
| 1128 |
+
@staticmethod
|
| 1129 |
+
def _enforce_min_workers_per_type(
|
| 1130 |
+
ctx: "ResourceDemandScheduler.ScheduleContext",
|
| 1131 |
+
) -> None:
|
| 1132 |
+
"""
|
| 1133 |
+
Enforce the minimal count of nodes for each worker node type.
|
| 1134 |
+
"""
|
| 1135 |
+
|
| 1136 |
+
# Count the existing nodes by type
|
| 1137 |
+
count_by_node_type = ctx.get_cluster_shape()
|
| 1138 |
+
|
| 1139 |
+
new_nodes = []
|
| 1140 |
+
# Launch new nodes to satisfy min count for each node type.
|
| 1141 |
+
for (
|
| 1142 |
+
node_type,
|
| 1143 |
+
node_type_config,
|
| 1144 |
+
) in ctx.get_node_type_configs().items():
|
| 1145 |
+
cur_count = count_by_node_type.get(node_type, 0)
|
| 1146 |
+
min_count = node_type_config.min_worker_nodes
|
| 1147 |
+
if cur_count < min_count:
|
| 1148 |
+
logger.info(
|
| 1149 |
+
f"Adding {min_count - cur_count} nodes to satisfy min count for "
|
| 1150 |
+
f"node type: {node_type}."
|
| 1151 |
+
)
|
| 1152 |
+
new_nodes.extend(
|
| 1153 |
+
[
|
| 1154 |
+
SchedulingNode.from_node_config(
|
| 1155 |
+
copy.deepcopy(node_type_config),
|
| 1156 |
+
status=SchedulingNodeStatus.TO_LAUNCH,
|
| 1157 |
+
node_kind=NodeKind.WORKER,
|
| 1158 |
+
)
|
| 1159 |
+
]
|
| 1160 |
+
* (min_count - cur_count)
|
| 1161 |
+
)
|
| 1162 |
+
# NOTE: we assume the aggregated number of min workers across all node types
|
| 1163 |
+
# should not exceed any globally enforced max_num_nodes
|
| 1164 |
+
|
| 1165 |
+
# Add the new nodes to the existing nodes and update the context.
|
| 1166 |
+
ctx.update(new_nodes + ctx.get_nodes())
|
| 1167 |
+
|
| 1168 |
+
    @staticmethod
    def _enforce_resource_constraints(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        constraints: List[ClusterResourceConstraint],
    ) -> List[ClusterResourceConstraint]:
        """
        Enforce the cluster resource constraints.

        Args:
            ctx: The schedule context.
            constraints: The cluster resource constraints.

        Returns:
            A list of infeasible constraints.

        Notes:
            It's different from the other scheduling functions since it doesn't actually
            schedule any resource requests. Instead, it asks if the cluster could be
            upscale to a certain shape to fulfill the constraints.
        """

        # NOTE: we currently only have 1 constraint from a cluster, but
        # we may have multiple in the future.
        assert len(constraints) <= 1, "Max 1 cluster resource constraint is supported."
        if len(constraints) == 0:
            # No cluster resource constraints - nothing needs to be done.
            return []

        constraint = constraints[0]
        # Flatten the requests for iterating through.
        requests = ResourceRequestUtil.ungroup_by_count(constraint.resource_requests)

        # Pass the empty nodes to schedule.
        scheduled_nodes, infeasible = ResourceDemandScheduler._try_schedule(
            ctx,
            requests,
            resource_request_source=ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT,
        )

        if infeasible:
            # Unable to satisfy the constraint.
            # All-or-nothing: the context is left untouched so no partial
            # upscaling happens for an unsatisfiable constraint.
            return [constraint]

        ctx.update(scheduled_nodes)
        return []
+
@staticmethod
|
| 1215 |
+
def _sched_resource_requests(
|
| 1216 |
+
ctx: "ResourceDemandScheduler.ScheduleContext",
|
| 1217 |
+
requests: List[ResourceRequest],
|
| 1218 |
+
) -> List[ResourceRequest]:
|
| 1219 |
+
"""
|
| 1220 |
+
Schedule the resource requests.
|
| 1221 |
+
|
| 1222 |
+
Args:
|
| 1223 |
+
ctx: The schedule context.
|
| 1224 |
+
requests_by_count: The resource requests.
|
| 1225 |
+
|
| 1226 |
+
Returns:
|
| 1227 |
+
A list of infeasible resource requests.
|
| 1228 |
+
"""
|
| 1229 |
+
nodes, infeasible = ResourceDemandScheduler._try_schedule(
|
| 1230 |
+
ctx, requests, resource_request_source=ResourceRequestSource.PENDING_DEMAND
|
| 1231 |
+
)
|
| 1232 |
+
|
| 1233 |
+
# Regardless if there's feasible, we will update the context for schedule nodes.
|
| 1234 |
+
ctx.update(nodes)
|
| 1235 |
+
|
| 1236 |
+
return infeasible
|
| 1237 |
+
|
| 1238 |
+
@staticmethod
|
| 1239 |
+
def _sched_gang_resource_requests(
|
| 1240 |
+
ctx: "ResourceDemandScheduler.ScheduleContext",
|
| 1241 |
+
gang_requests: List[GangResourceRequest],
|
| 1242 |
+
) -> List[GangResourceRequest]:
|
| 1243 |
+
"""
|
| 1244 |
+
Schedule the gang resource requests.
|
| 1245 |
+
|
| 1246 |
+
These requests should be scheduled atomically, i.e. either all of the resources
|
| 1247 |
+
requests in a gang request are scheduled or none of them are scheduled.
|
| 1248 |
+
|
| 1249 |
+
For now, the gang resource requests represent Ray's placement groups, while it
|
| 1250 |
+
could be more general in the future:
|
| 1251 |
+
- For STRICT_PACK placement group requests, we combine them into a single
|
| 1252 |
+
request and try to schedule them together.
|
| 1253 |
+
- For STRICT_SPREAD placement groups requests, they should be scheduled on
|
| 1254 |
+
different nodes by leveraging on the node labels that are associated with
|
| 1255 |
+
the placement group.
|
| 1256 |
+
If there are requests from rescheduling placement groups due to node
|
| 1257 |
+
failures, these requests should not be scheduled on nodes with requests
|
| 1258 |
+
from the same placement group.
|
| 1259 |
+
|
| 1260 |
+
|
| 1261 |
+
Args:
|
| 1262 |
+
ctx: The schedule context.
|
| 1263 |
+
gang_requests: The gang resource requests.
|
| 1264 |
+
|
| 1265 |
+
Returns:
|
| 1266 |
+
A list of infeasible gang resource requests.
|
| 1267 |
+
"""
|
| 1268 |
+
|
| 1269 |
+
def _sort_gang_resource_requests(req: GangResourceRequest) -> Tuple:
|
| 1270 |
+
"""
|
| 1271 |
+
Key function for sorting the gang resource request by:
|
| 1272 |
+
1. the number of placement constraints in the gang request.
|
| 1273 |
+
2. the number of resource requests in the gang request.
|
| 1274 |
+
"""
|
| 1275 |
+
total_placement_constraints = 0
|
| 1276 |
+
for resource_request in req.requests:
|
| 1277 |
+
total_placement_constraints += len(
|
| 1278 |
+
resource_request.placement_constraints
|
| 1279 |
+
)
|
| 1280 |
+
|
| 1281 |
+
return (total_placement_constraints, len(req.requests))
|
| 1282 |
+
|
| 1283 |
+
infeasible_gang_requests = []
|
| 1284 |
+
# Try fulfilling the gang requests one by one.
|
| 1285 |
+
for gang_req in sorted(
|
| 1286 |
+
gang_requests, key=_sort_gang_resource_requests, reverse=True
|
| 1287 |
+
):
|
| 1288 |
+
requests = gang_req.requests
|
| 1289 |
+
# Try to combine requests with affinity constraints into the same request.
|
| 1290 |
+
requests = ResourceRequestUtil.combine_requests_with_affinity(requests)
|
| 1291 |
+
|
| 1292 |
+
nodes, infeasible = ResourceDemandScheduler._try_schedule(
|
| 1293 |
+
ctx, requests, ResourceRequestSource.PENDING_DEMAND
|
| 1294 |
+
)
|
| 1295 |
+
|
| 1296 |
+
if infeasible:
|
| 1297 |
+
# Unable to satisfy the constraint. We will skip the gang request.
|
| 1298 |
+
# Don't update the context.
|
| 1299 |
+
infeasible_gang_requests.append(gang_req)
|
| 1300 |
+
continue
|
| 1301 |
+
|
| 1302 |
+
# We are able to satisfy the constraint and thus update the context.
|
| 1303 |
+
ctx.update(nodes)
|
| 1304 |
+
|
| 1305 |
+
return infeasible_gang_requests
|
| 1306 |
+
|
| 1307 |
+
    @staticmethod
    def _try_schedule(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        requests_to_sched: List[ResourceRequest],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[List[SchedulingNode], List[ResourceRequest]]:
        """
        Try to schedule the resource requests on the current context.

        It tries to schedule the requests on the existing nodes first, and
        then try to schedule the requests on new nodes if possible.

        Args:
            requests_to_sched: The resource requests to be scheduled.
            ctx: The current scheduling context.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource
                constraints.

        Returns:
            - List of scheduled nodes to that have part or all of the requests
                scheduled.
            - List of infeasible requests remained that cannot be scheduled.
        """
        # First sort the requests so the binpacking below is deterministic.
        def _sort_resource_request(req: ResourceRequest) -> Tuple:
            """
            Sort the resource requests by:
                1. The length of it's placement constraints.
                2. The number of resources it requests.
                3. The values of resources it requests.
                4. lexicographically for each resource (for stable ordering)

            This is a legacy sorting function for the autoscaler's binpacking
            algo - we do this so that we could have a deterministic scheduling
            results with reasonable fragmentation.
            """
            return (
                len(req.placement_constraints),
                len(req.resources_bundle.values()),
                sum(req.resources_bundle.values()),
                sorted(req.resources_bundle.items()),
            )

        # Largest/most-constrained requests first.
        requests_to_sched = sorted(
            requests_to_sched, key=_sort_resource_request, reverse=True
        )

        existing_nodes = ctx.get_nodes()
        node_type_available = ctx.get_node_type_available()

        # A list of nodes that are either:
        #   1. existing nodes in the cluster. or
        #   2. new nodes that are launched to satisfy the resource requests.
        target_nodes = []

        # Phase 1: try scheduling resource requests with existing nodes first.
        # Each iteration picks the best-scoring node, which is moved from
        # existing_nodes into target_nodes by _sched_best_node.
        while len(requests_to_sched) > 0 and len(existing_nodes) > 0:
            (
                best_node,
                requests_to_sched,
                existing_nodes,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, existing_nodes, resource_request_source
            )
            if best_node is None:
                # No existing nodes can schedule any more requests.
                break

            target_nodes.append(best_node)

        # If there's any existing nodes left, we will add to the target nodes
        # (they simply received no requests).
        target_nodes.extend(existing_nodes)

        # Phase 2: try scheduling resource requests with new nodes.
        # Seed one launch candidate per node type that still has quota.
        node_pools = [
            SchedulingNode.from_node_config(
                ctx.get_node_type_configs()[node_type],
                status=SchedulingNodeStatus.TO_LAUNCH,
                node_kind=NodeKind.WORKER,
            )
            for node_type, num_available in node_type_available.items()
            if num_available > 0
        ]
        while len(requests_to_sched) > 0 and len(node_pools) > 0:
            # Stop launching when the global max number of nodes is reached.
            max_num_nodes = ctx.get_max_num_nodes()
            if max_num_nodes is not None and len(target_nodes) >= max_num_nodes:
                logger.debug(
                    "Max number of nodes reached: {}, "
                    "cannot launch more nodes.".format(max_num_nodes)
                )
                break

            (
                best_node,
                requests_to_sched,
                node_pools,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, node_pools, resource_request_source
            )
            if best_node is None:
                break

            target_nodes.append(best_node)
            # Update the node pool if a node with the same node type of the
            # added node can be launched (keep at most one candidate of each
            # type in the pool at a time).
            node_type_available[best_node.node_type] -= 1
            if node_type_available[best_node.node_type] > 0:
                node_pools.append(
                    SchedulingNode.from_node_config(
                        ctx.get_node_type_configs()[best_node.node_type],
                        status=SchedulingNodeStatus.TO_LAUNCH,
                        node_kind=NodeKind.WORKER,
                    )
                )

        return target_nodes, requests_to_sched
|
| 1426 |
+
    @staticmethod
    def _sched_best_node(
        requests: List[ResourceRequest],
        nodes: List[SchedulingNode],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[SchedulingNode, List[ResourceRequest], List[SchedulingNode]]:
        """
        Schedule the requests on the best node.
        A simple greedy algorithm is used to schedule the requests:
            1. Try to schedule the requests on each node.
            2. Sort the nodes by a score
            3. Return the node with the highest score.

        The highest score node is updated with the scheduled requests, and the node is
        removed from the node list.

        Args:
            requests: The resource requests to be scheduled.
            nodes: The node candidates to be scheduled on. The nodes will be updated
                after the scheduling attempt, i.e. the node that is scheduled will be
                removed from the list.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource constraints.

        Returns:
            best_node: The best node to schedule the requests.
            infeasible: The infeasible requests that cannot be scheduled on the best
                node.
            nodes: Remaining nodes after the best node is removed.
        """
        results = []

        # A temporary data class to store the scheduling result.
        @dataclass
        class ScheduleResult:
            # The node candidate after a scheduling attempt.
            node: SchedulingNode
            # The infeasible resource requests that are not scheduled.
            infeasible_requests: List[ResourceRequest]
            # The index of the node in the original node list.
            idx: int
            # the score of the scheduling node to compare with others.
            score: UtilizationScore

        # Deep-copy so try_schedule's in-place mutations do not leak into the
        # caller's node objects; only the winning copy is returned.
        nodes_copy = copy.deepcopy(nodes)

        # Iterate through each node and modify the node's available resources
        # if the requests are schedulable.
        for idx, node in enumerate(nodes_copy):
            remaining, score = node.try_schedule(requests, resource_request_source)

            if len(remaining) == len(requests):
                # The node cannot schedule any of the requests.
                continue

            results.append(ScheduleResult(node, remaining, idx, score))

        # No nodes can schedule any of the requests.
        if len(results) == 0:
            logger.debug(
                "No nodes can schedule the requests: {}, for nodes: {}".format(
                    ResourceRequestUtil.to_dict_list(requests), nodes
                )
            )
            return None, requests, nodes

        # Sort the results by score (highest first).
        results = sorted(results, key=lambda r: r.score, reverse=True)
        best_result = results[0]

        # Remove the best node from the (original, unmutated) nodes list; the
        # mutated copy is what gets returned as best_node.
        nodes.pop(best_result.idx)
        logger.debug(
            "Best node: {}, score: {}, remaining requests: {}".format(
                best_result.node,
                best_result.score,
                ResourceRequestUtil.to_dict_list(best_result.infeasible_requests),
            )
        )
        return best_result.node, best_result.infeasible_requests, nodes
|
| 1507 |
+
@staticmethod
|
| 1508 |
+
def _terminate_outdated_nodes(
|
| 1509 |
+
ctx: "ResourceDemandScheduler.ScheduleContext",
|
| 1510 |
+
) -> None:
|
| 1511 |
+
"""
|
| 1512 |
+
Terminate the nodes that are outdated, i.e. the node type config has been
|
| 1513 |
+
updated or the node's launch config hash is outdated.
|
| 1514 |
+
|
| 1515 |
+
Args:
|
| 1516 |
+
ctx: The schedule context.
|
| 1517 |
+
"""
|
| 1518 |
+
nodes = ctx.get_nodes()
|
| 1519 |
+
|
| 1520 |
+
if ctx._disable_launch_config_check:
|
| 1521 |
+
# Outdated nodes check through launch config check is disabled.
|
| 1522 |
+
return
|
| 1523 |
+
|
| 1524 |
+
for node in nodes:
|
| 1525 |
+
if node.status != SchedulingNodeStatus.SCHEDULABLE:
|
| 1526 |
+
# We don't need to care about the non-running nodes.
|
| 1527 |
+
continue
|
| 1528 |
+
|
| 1529 |
+
if node.node_kind == NodeKind.HEAD:
|
| 1530 |
+
# We should not be terminating the head node even if it's outdated.
|
| 1531 |
+
logger.warning(
|
| 1532 |
+
f"Head node {node.im_instance_id}(ray={node.ray_node_id}) is "
|
| 1533 |
+
"outdated with node config changes. "
|
| 1534 |
+
"Please check the node's config or restart the cluster or restart "
|
| 1535 |
+
"the head node. Autoscaler is not able to shutdown the outdated "
|
| 1536 |
+
"head node"
|
| 1537 |
+
)
|
| 1538 |
+
continue
|
| 1539 |
+
node_type = node.node_type
|
| 1540 |
+
node_type_config = ctx.get_node_type_configs().get(node_type)
|
| 1541 |
+
if node_type_config is None or (
|
| 1542 |
+
node_type_config.launch_config_hash
|
| 1543 |
+
and node_type_config.launch_config_hash != node.launch_config_hash
|
| 1544 |
+
):
|
| 1545 |
+
# The node type config has been updated, and the node's launch config
|
| 1546 |
+
# hash is outdated.
|
| 1547 |
+
node.status = SchedulingNodeStatus.TO_TERMINATE
|
| 1548 |
+
node.termination_request = TerminationRequest(
|
| 1549 |
+
id=str(time.time_ns()),
|
| 1550 |
+
instance_id=node.im_instance_id,
|
| 1551 |
+
ray_node_id=node.ray_node_id,
|
| 1552 |
+
instance_type=node.node_type,
|
| 1553 |
+
cause=TerminationRequest.Cause.OUTDATED,
|
| 1554 |
+
details=f"node from {node.node_type} has outdated config",
|
| 1555 |
+
)
|
| 1556 |
+
|
| 1557 |
+
ctx.update(nodes)
|
| 1558 |
+
|
| 1559 |
+
@staticmethod
|
| 1560 |
+
def _enforce_idle_termination(
|
| 1561 |
+
ctx: "ResourceDemandScheduler.ScheduleContext",
|
| 1562 |
+
) -> None:
|
| 1563 |
+
"""
|
| 1564 |
+
Enforce the idle termination for the nodes that are not needed by the cluster
|
| 1565 |
+
resource constraints and idle for too long.
|
| 1566 |
+
|
| 1567 |
+
Args:
|
| 1568 |
+
ctx: The schedule context.
|
| 1569 |
+
"""
|
| 1570 |
+
count_by_node_type = ctx.get_cluster_shape()
|
| 1571 |
+
node_type_configs = ctx.get_node_type_configs()
|
| 1572 |
+
terminate_nodes_by_type: Dict[NodeType, int] = defaultdict(int)
|
| 1573 |
+
|
| 1574 |
+
nodes = ctx.get_nodes()
|
| 1575 |
+
s_to_ms = 1000
|
| 1576 |
+
for node in nodes:
|
| 1577 |
+
if node.status != SchedulingNodeStatus.SCHEDULABLE:
|
| 1578 |
+
# We don't need to care about the non-running nodes.
|
| 1579 |
+
continue
|
| 1580 |
+
|
| 1581 |
+
if node.node_kind == NodeKind.HEAD:
|
| 1582 |
+
# The head node is not subject to idle termination.
|
| 1583 |
+
continue
|
| 1584 |
+
|
| 1585 |
+
idle_timeout_s = ctx.get_idle_timeout_s()
|
| 1586 |
+
# Override the scheduler idle_timeout_s if set for this node_type.
|
| 1587 |
+
node_type = node.node_type
|
| 1588 |
+
if node_type in node_type_configs:
|
| 1589 |
+
if node_type_configs[node_type].idle_timeout_s is not None:
|
| 1590 |
+
idle_timeout_s = node_type_configs[node_type].idle_timeout_s
|
| 1591 |
+
if idle_timeout_s is None:
|
| 1592 |
+
# No idle timeout is set, skip the idle termination.
|
| 1593 |
+
continue
|
| 1594 |
+
|
| 1595 |
+
if node.idle_duration_ms <= idle_timeout_s * s_to_ms:
|
| 1596 |
+
# The node is not idle for too long, skip it.
|
| 1597 |
+
continue
|
| 1598 |
+
|
| 1599 |
+
if node.sched_requests[ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT]:
|
| 1600 |
+
# The node is needed by the resource constraints.
|
| 1601 |
+
# Skip it.
|
| 1602 |
+
if node.idle_duration_ms > ctx.get_idle_timeout_s() * s_to_ms:
|
| 1603 |
+
logger.debug(
|
| 1604 |
+
"Node {} (idle for {} secs) is needed by the cluster resource "
|
| 1605 |
+
"constraints, skip idle termination.".format(
|
| 1606 |
+
node.ray_node_id, node.idle_duration_ms / s_to_ms
|
| 1607 |
+
)
|
| 1608 |
+
)
|
| 1609 |
+
continue
|
| 1610 |
+
|
| 1611 |
+
# Honor the min_worker_nodes setting for the node type.
|
| 1612 |
+
min_count = 0
|
| 1613 |
+
if node_type in node_type_configs:
|
| 1614 |
+
min_count = node_type_configs[node_type].min_worker_nodes
|
| 1615 |
+
if (
|
| 1616 |
+
count_by_node_type.get(node_type, 0)
|
| 1617 |
+
- terminate_nodes_by_type[node_type]
|
| 1618 |
+
<= min_count
|
| 1619 |
+
):
|
| 1620 |
+
logger.info(
|
| 1621 |
+
"Node {} (idle for {} secs) belongs to node_type {} and is "
|
| 1622 |
+
"required by min_worker_nodes, skipping idle termination.".format(
|
| 1623 |
+
node.ray_node_id, node.idle_duration_ms / s_to_ms, node_type
|
| 1624 |
+
)
|
| 1625 |
+
)
|
| 1626 |
+
continue
|
| 1627 |
+
|
| 1628 |
+
terminate_nodes_by_type[node.node_type] += 1
|
| 1629 |
+
# The node is idle for too long, terminate it.
|
| 1630 |
+
node.status = SchedulingNodeStatus.TO_TERMINATE
|
| 1631 |
+
node.termination_request = TerminationRequest(
|
| 1632 |
+
id=str(uuid.uuid4()),
|
| 1633 |
+
instance_id=node.im_instance_id,
|
| 1634 |
+
ray_node_id=node.ray_node_id,
|
| 1635 |
+
cause=TerminationRequest.Cause.IDLE,
|
| 1636 |
+
instance_type=node.node_type,
|
| 1637 |
+
idle_duration_ms=node.idle_duration_ms,
|
| 1638 |
+
details=f"idle for {node.idle_duration_ms/s_to_ms} secs > "
|
| 1639 |
+
f"timeout={idle_timeout_s} secs",
|
| 1640 |
+
)
|
| 1641 |
+
|
| 1642 |
+
ctx.update(nodes)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from typing import Dict, List, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
from ray.autoscaler.v2.instance_manager.common import InstanceUtil
|
| 7 |
+
from ray.core.generated.autoscaler_pb2 import NodeState, NodeStatus
|
| 8 |
+
from ray.core.generated.instance_manager_pb2 import Instance
|
| 9 |
+
|
| 10 |
+
# TODO(rickyx): once we have graceful shutdown, we could populate
|
| 11 |
+
# the failure detail with the actual termination message. As of now,
|
| 12 |
+
# we will use a more generic message to include cases such as:
|
| 13 |
+
# (idle termination, node death, crash, preemption, etc)
|
| 14 |
+
NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# e.g., cpu_4_ondemand.
|
| 18 |
+
NodeType = str
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class ResourceUsage:
    """Usage of a single named resource (e.g. "CPU"): total vs. used."""

    # Resource name.
    resource_name: str = ""
    # Total resource.
    total: float = 0.0
    # Resource used.
    used: float = 0.0
|
| 30 |
+
|
| 31 |
+
@dataclass
class NodeUsage:
    """Per-node resource usage plus how long the node has been idle."""

    # The node resource usage.
    usage: List[ResourceUsage]
    # How long the node has been idle.
    idle_time_ms: int
|
| 38 |
+
|
| 39 |
+
@dataclass
class NodeInfo:
    """Description of a node (pending, running, or failed) for status reports."""

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # Cloud instance id.
    instance_id: str
    # Ip address of the node when alive.
    ip_address: str
    # The status of the node. Optional for pending nodes.
    node_status: Optional[str] = None
    # ray node id in hex. None if still pending.
    node_id: Optional[str] = None
    # Resource usage breakdown if node is running.
    resource_usage: Optional[NodeUsage] = None
    # Failure detail if the node failed.
    failure_detail: Optional[str] = None
    # Descriptive details.
    details: Optional[str] = None
    # Activity on the node.
    node_activity: Optional[List[str]] = None

    def total_resources(self) -> Dict[str, float]:
        """Resource name -> total amount; empty when no usage is reported."""
        usage = self.resource_usage
        if usage is None:
            return {}
        return {entry.resource_name: entry.total for entry in usage.usage}

    def available_resources(self) -> Dict[str, float]:
        """Resource name -> remaining (total - used) amount."""
        usage = self.resource_usage
        if usage is None:
            return {}
        return {
            entry.resource_name: entry.total - entry.used for entry in usage.usage
        }

    def used_resources(self) -> Dict[str, float]:
        """Resource name -> amount currently in use."""
        usage = self.resource_usage
        if usage is None:
            return {}
        return {entry.resource_name: entry.used for entry in usage.usage}
|
| 77 |
+
|
| 78 |
+
@dataclass
class LaunchRequest:
    """A request to launch ``count`` cloud instances of one node type."""

    class Status(Enum):
        # Terminal: the launch failed.
        FAILED = "FAILED"
        # The launch is still in flight.
        PENDING = "PENDING"

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # count.
    count: int
    # State: (e.g. PENDING, FAILED)
    state: Status
    # When the launch request was made in unix timestamp in secs.
    request_ts_s: int
    # When the launch request failed unix timestamp in secs if failed.
    failed_ts_s: Optional[int] = None
    # Request details, e.g. error reason if the launch request failed.
    details: Optional[str] = None
|
| 99 |
+
|
| 100 |
+
@dataclass
class ResourceRequestByCount:
    """A resource bundle shape together with how many bundles share it."""

    # Bundles in the demand.
    bundle: Dict[str, float]
    # Number of bundles with the same shape.
    count: int

    def __str__(self) -> str:
        # Renders identically to f"[{self.count} {self.bundle}]".
        return "[{} {}]".format(self.count, self.bundle)
|
| 110 |
+
|
| 111 |
+
@dataclass
class ResourceDemand:
    """Base class for a pending resource demand, grouped by bundle shape."""

    # The bundles in the demand with shape and count info.
    bundles_by_count: List[ResourceRequestByCount]
|
| 116 |
+
|
| 117 |
+
@dataclass
class PlacementGroupResourceDemand(ResourceDemand):
    """Pending resource demand originating from a placement group."""

    # Details string (parsed into below information)
    details: str
    # Placement group's id.
    pg_id: Optional[str] = None
    # Strategy, e.g. STRICT_SPREAD
    strategy: Optional[str] = None
    # Placement group's state, e.g. PENDING
    state: Optional[str] = None

    def __post_init__(self):
        """Parse ``details`` ("<pg_id>:<strategy>|<state>") into the fields."""
        if not self.details:
            return

        # Details in the format of <pg_id>:<strategy>|<state>, parse
        # it into the above fields.
        pattern = r"^.*:.*\|.*$"
        match = re.match(pattern, self.details)
        if not match:
            return

        # BUGFIX: split on the *first* separator only. The regex above also
        # matches strings containing extra ":" or "|" characters, and an
        # unbounded split would then produce more than two parts and raise
        # ValueError on unpacking.
        pg_id, rest = self.details.split(":", 1)
        strategy, state = rest.split("|", 1)
        self.pg_id = pg_id
        self.strategy = strategy
        self.state = state
|
| 145 |
+
|
| 146 |
+
@dataclass
class RayTaskActorDemand(ResourceDemand):
    """Pending resource demand originating from Ray tasks or actors."""

    pass
|
| 150 |
+
|
| 151 |
+
@dataclass
class ClusterConstraintDemand(ResourceDemand):
    """Resource demand originating from a cluster resource constraint."""

    pass
|
| 155 |
+
|
| 156 |
+
@dataclass
class ResourceDemandSummary:
    """All pending resource demands, grouped by origin."""

    # Placement group demand.
    placement_group_demand: List[PlacementGroupResourceDemand] = field(
        default_factory=list
    )
    # Ray task actor demand.
    ray_task_actor_demand: List[RayTaskActorDemand] = field(default_factory=list)
    # Cluster constraint demand.
    cluster_constraint_demand: List[ClusterConstraintDemand] = field(
        default_factory=list
    )
|
| 169 |
+
|
| 170 |
+
@dataclass
class Stats:
    """Timing/version metadata about how a cluster status query was served."""

    # How long it took to get the GCS request.
    # This is required when initializing the Stats since it should be calculated before
    # the request was made.
    gcs_request_time_s: float
    # How long it took to get all live instances from node provider.
    none_terminated_node_request_time_s: Optional[float] = None
    # How long for autoscaler to process the scaling decision.
    autoscaler_iteration_time_s: Optional[float] = None
    # The last seen autoscaler state version from Ray.
    autoscaler_version: Optional[str] = None
    # The last seen cluster state resource version.
    cluster_resource_state_version: Optional[str] = None
    # Request made time unix timestamp: when the data was pulled from GCS.
    request_ts_s: Optional[int] = None
|
| 187 |
+
|
| 188 |
+
@dataclass
class ClusterStatus:
    """Snapshot of the autoscaler's view of the cluster for status reporting."""

    # Healthy nodes information (non-idle)
    active_nodes: List[NodeInfo] = field(default_factory=list)
    # Idle node information
    idle_nodes: List[NodeInfo] = field(default_factory=list)
    # Pending launches.
    pending_launches: List[LaunchRequest] = field(default_factory=list)
    # Failed launches.
    failed_launches: List[LaunchRequest] = field(default_factory=list)
    # Pending nodes.
    pending_nodes: List[NodeInfo] = field(default_factory=list)
    # Failures
    failed_nodes: List[NodeInfo] = field(default_factory=list)
    # Resource usage summary for entire cluster.
    cluster_resource_usage: List[ResourceUsage] = field(default_factory=list)
    # Demand summary.
    resource_demands: ResourceDemandSummary = field(
        default_factory=ResourceDemandSummary
    )
    # Query metrics.
    # BUGFIX: the previous default_factory was the bare ``Stats`` class, but
    # ``Stats.gcs_request_time_s`` has no default value, so constructing a
    # ClusterStatus without an explicit ``stats`` raised TypeError. Default to
    # a zeroed Stats instead.
    stats: Stats = field(default_factory=lambda: Stats(gcs_request_time_s=0.0))

    def total_resources(self) -> Dict[str, float]:
        """Resource name -> cluster-wide total amount."""
        return {r.resource_name: r.total for r in self.cluster_resource_usage}

    def available_resources(self) -> Dict[str, float]:
        """Resource name -> cluster-wide available (total - used) amount."""
        return {r.resource_name: r.total - r.used for r in self.cluster_resource_usage}

    # TODO(rickyx): we don't show infeasible requests as of now.
    # (They will just be pending forever as part of the demands)
    # We should show them properly in the future.
|
| 221 |
+
|
| 222 |
+
@dataclass
class AutoscalerInstance:
    """
    AutoscalerInstance represents an instance that's managed by the autoscaler.

    It combines two views of the same machine:
    1. the instance manager state: information of the underlying cloud instance.
    2. the ray node state, e.g. resources, ray node status.

    The two views are linked by the cloud instance id, which should be set
    when the ray node is started.
    """

    # Cloud instance id. None while no cloud instance has been assigned yet,
    # e.g. the instance is still in QUEUED or REQUESTED status.
    cloud_instance_id: Optional[str] = None

    # Ray node state/status. None when no ray node is running or has run on
    # the cloud instance: for example, ray is still being installed, or the
    # instance manager hasn't had a cloud instance assigned (QUEUED/REQUESTED).
    ray_node: Optional[NodeState] = None

    # Instance manager state. It may be None even when a ray node exists:
    # 1. The instance manager hasn't yet discovered the cloud instance running
    #    this ray process (discovery only happens periodically).
    # 2. The ray node's cloud instance was already stopped and removed from
    #    the instance manager state, but the ray state is lagging behind.
    # 3. The ray node is simply not managed by the instance manager.
    im_instance: Optional[Instance] = None

    # Valid combinations of (cloud_instance_id, ray_node, im_instance):
    # | cloud_instance_id | ray_node | im_instance |
    # |-------------------|----------|-------------|
    # | None              | None     | None        | Not possible.
    # | None              | None     | not None    | OK: no ray on it yet.
    # | None              | Not None | None        | OK: ray node not started
    # |                   |          |             | by the autoscaler.
    # | None              | Not None | not None    | Not possible: cannot link
    # |                   |          |             | im instance to ray node.
    # | not None          | None     | None        | Not possible: the id comes
    # |                   |          |             | from im state or ray node.
    # | not None          | None     | not None    | OK: not running ray yet.
    # | not None          | Not None | None        | OK: scenarios 1-3 above.
    # | not None          | Not None | not None    | OK: instance running ray.
    def validate(self) -> Tuple[bool, str]:
        """Validate the autoscaler instance state.

        Returns:
            A tuple of (valid, error_msg) where:
            - valid is whether the state is valid
            - error_msg is the error message for the validation results.
        """
        # Key: (cloud_instance_id is None, ray_node is None, im_instance is
        # None), mirroring the combination table documented above.
        missing = (
            self.cloud_instance_id is None,
            self.ray_node is None,
            self.im_instance is None,
        )
        outcomes = {
            (True, True, True): (False, "Not possible"),
            (True, True, False): (True, ""),
            (True, False, True): (
                True,
                "There's a ray node w/o cloud instance id, must be started not "
                "by autoscaler",
            ),
            (True, False, False): (
                False,
                "Not possible - no way to link im instance with ray node",
            ),
            (False, True, True): (
                False,
                "Not possible since cloud instance id is either part of "
                "im state or ray node",
            ),
            (False, True, False): (True, ""),
            (False, False, True): (True, ""),
            (False, False, False): (True, ""),
        }
        valid, error_msg = outcomes[missing]
        if not valid:
            return valid, error_msg

        if self.im_instance is not None and self.ray_node is None:
            # Tracking an im instance with no ray node: the im status must
            # agree with whether a cloud instance id has been assigned.
            allocated = InstanceUtil.is_cloud_instance_allocated(
                self.im_instance.status
            )
            if self.cloud_instance_id is None and allocated:
                return (
                    False,
                    "instance should be in a status where cloud instance "
                    "is not allocated.",
                )
            if self.cloud_instance_id is not None and not allocated:
                return (
                    False,
                    "instance should be in a status where cloud instance is "
                    "allocated.",
                )

        ray_node = self.ray_node
        if ray_node is not None and self.cloud_instance_id != ray_node.instance_id:
            return False, "cloud instance id doesn't match."

        if (
            self.im_instance is not None
            and self.cloud_instance_id is not None
            and self.cloud_instance_id != self.im_instance.cloud_instance_id
        ):
            return False, "cloud instance id doesn't match."

        return True, ""

    def is_ray_running(self) -> bool:
        """Whether the ray node is running."""
        if self.ray_node is None:
            return False
        return self.ray_node.status in (NodeStatus.RUNNING, NodeStatus.IDLE)

    def is_ray_stop(self) -> bool:
        """Whether the ray node is stopped."""
        if self.ray_node is None:
            return True
        return self.ray_node.status == NodeStatus.DEAD
|