koichi12 commited on
Commit
ed5a2c3
·
verified ·
1 Parent(s): 8479d0d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py +0 -0
  2. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py +116 -0
  7. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py +324 -0
  8. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py +461 -0
  9. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py +0 -0
  11. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py +536 -0
  17. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py +119 -0
  18. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py +111 -0
  19. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py +0 -0
  20. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py +80 -0
  23. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py +0 -0
  24. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml +15 -0
  26. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh +23 -0
  27. .venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml +144 -0
  28. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py +0 -0
  29. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml +152 -0
  31. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py +29 -0
  32. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py +343 -0
  35. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py +0 -0
  36. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py +201 -0
  37. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py +157 -0
  38. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py +472 -0
  40. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py +541 -0
  41. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py +270 -0
  42. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py +151 -0
  43. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py +522 -0
  44. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py +99 -0
  45. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py +1565 -0
  46. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py +180 -0
  47. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py +100 -0
  48. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py +302 -0
  49. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py +1642 -0
  50. .venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py +351 -0
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/config.cpython-311.pyc ADDED
Binary file (6.06 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (18.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/__pycache__/utils.cpython-311.pyc ADDED
Binary file (24.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/config.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import stat
4
+
5
+ from ray.autoscaler._private.aliyun.utils import AcsClient
6
+
7
+ # instance status
8
+ PENDING = "Pending"
9
+ RUNNING = "Running"
10
+ STARTING = "Starting"
11
+ STOPPING = "Stopping"
12
+ STOPPED = "Stopped"
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def bootstrap_aliyun(config):
    """Prepare Aliyun networking/SSH resources before launching a cluster.

    Ensures a VPC, a security group, a vSwitch, and an SSH key pair exist,
    recording their IDs in ``config["provider"]`` (and the key path in
    ``config["auth"]``).

    :param config: The cluster configuration dict; mutated in place.
    :return: The same (mutated) config dict.
    """
    # Order matters: the security group and vSwitch both need the VPC id
    # that _get_or_create_vpc stores into config["provider"].
    _get_or_create_vpc(config)
    _get_or_create_security_group(config)
    _get_or_create_vswitch(config)
    _get_or_import_key_pair(config)
    return config
29
+
30
+
31
def _client(config):
    """Build an ``AcsClient`` from the provider section of the cluster config.

    Uses a single retry because bootstrap operations are interactive and
    should fail fast.
    """
    provider = config["provider"]
    return AcsClient(
        access_key=provider.get("access_key"),
        access_key_secret=provider.get("access_key_secret"),
        region_id=provider["region"],
        max_retries=1,
    )
38
+
39
+
40
def _get_or_create_security_group(config):
    """Reuse an existing security group in the VPC, or create a new one.

    Stores the resulting ID in ``config["provider"]["security_group_id"]``.
    Inbound rules from ``config["provider"]["security_group_rule"]`` are
    applied only when a new group is created; an existing group is reused
    as-is.
    """
    cli = _client(config)
    vpc_id = config["provider"]["vpc_id"]

    security_groups = cli.describe_security_groups(vpc_id=vpc_id)
    if security_groups is not None and len(security_groups) > 0:
        config["provider"]["security_group_id"] = security_groups[0][
            "SecurityGroupId"
        ]
        return config

    security_group_id = cli.create_security_group(vpc_id=vpc_id)

    # Fix: the rules are a list of dicts, so the fallback default must be a
    # list. The original used {}, which only worked because an empty dict
    # iterates to nothing.
    for rule in config["provider"].get("security_group_rule", []):
        cli.authorize_security_group(
            security_group_id=security_group_id,
            port_range=rule["port_range"],
            source_cidr_ip=rule["source_cidr_ip"],
            ip_protocol=rule["ip_protocol"],
        )
    config["provider"]["security_group_id"] = security_group_id
58
+
59
+
60
def _get_or_create_vpc(config):
    """Ensure ``config["provider"]["vpc_id"]`` points at an existing VPC.

    Reuses the first VPC reported for the region; otherwise creates one.
    """
    cli = _client(config)
    existing = cli.describe_vpcs()
    if existing is not None and len(existing) > 0:
        config["provider"]["vpc_id"] = existing[0].get("VpcId")
        return

    new_vpc_id = cli.create_vpc()
    if new_vpc_id is not None:
        config["provider"]["vpc_id"] = new_vpc_id
70
+
71
+
72
def _get_or_create_vswitch(config):
    """Ensure ``config["provider"]["v_switch_id"]`` points at a vSwitch.

    Reuses the first vSwitch found in the configured VPC; otherwise
    creates one in the configured zone/CIDR block.
    """
    provider = config["provider"]
    cli = _client(config)

    existing = cli.describe_v_switches(vpc_id=provider["vpc_id"])
    if existing is not None and len(existing) > 0:
        provider["v_switch_id"] = existing[0].get("VSwitchId")
        return

    created_id = cli.create_v_switch(
        vpc_id=provider["vpc_id"],
        zone_id=provider["zone_id"],
        cidr_block=provider["cidr_block"],
    )
    if created_id is not None:
        provider["v_switch_id"] = created_id
87
+
88
+
89
def _get_or_import_key_pair(config):
    """Ensure an SSH key pair exists and ``config["auth"]["ssh_private_key"]`` is set.

    Three cases:
      * a key pair with the configured name already exists on Aliyun ->
        point ``ssh_private_key`` at the expected local copy;
      * no remote key pair and no local key configured -> create one on
        Aliyun and save the returned private key locally;
      * a local private key is configured -> import its public half
        (``<key>.pub``) to Aliyun.
    """
    cli = _client(config)
    key_name = config["provider"].get("key_name", "ray")
    key_path = os.path.expanduser("~/.ssh/{}".format(key_name))
    keypairs = cli.describe_key_pairs(key_pair_name=key_name)

    if keypairs is not None and len(keypairs) > 0:
        if "ssh_private_key" not in config["auth"]:
            logger.info(
                "{} keypair exists, use {} as local ssh key".format(key_name, key_path)
            )
            config["auth"]["ssh_private_key"] = key_path
    else:
        if "ssh_private_key" not in config["auth"]:
            # Create a new keypair on Aliyun and persist the private key.
            resp = cli.create_key_pair(key_pair_name=key_name)
            if resp is not None:
                # Fix: create the file with owner-only permissions from the
                # start. The original opened it with "w+" under the default
                # umask and only chmod'ed afterwards, briefly exposing the
                # private key to other local users.
                fd = os.open(
                    key_path,
                    os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
                    stat.S_IRUSR | stat.S_IWUSR,
                )
                with os.fdopen(fd, "w") as f:
                    f.write(resp.get("PrivateKeyBody"))
                # Drop to read-only, matching the original final permissions.
                os.chmod(key_path, stat.S_IRUSR)
                config["auth"]["ssh_private_key"] = key_path
        else:
            # Import the public half of the locally configured key.
            public_key_file = config["auth"]["ssh_private_key"] + ".pub"
            with open(public_key_file) as f:
                public_key = f.readline().strip("\n")
            cli.import_key_pair(key_pair_name=key_name, public_key_body=public_key)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/node_provider.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import random
3
+ import threading
4
+ import time
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from ray.autoscaler._private.aliyun.config import (
9
+ PENDING,
10
+ RUNNING,
11
+ STOPPED,
12
+ STOPPING,
13
+ bootstrap_aliyun,
14
+ )
15
+ from ray.autoscaler._private.aliyun.utils import AcsClient
16
+ from ray.autoscaler._private.cli_logger import cli_logger
17
+ from ray.autoscaler._private.constants import BOTO_MAX_RETRIES
18
+ from ray.autoscaler._private.log_timer import LogTimer
19
+ from ray.autoscaler.node_provider import NodeProvider
20
+ from ray.autoscaler.tags import (
21
+ TAG_RAY_CLUSTER_NAME,
22
+ TAG_RAY_LAUNCH_CONFIG,
23
+ TAG_RAY_NODE_KIND,
24
+ TAG_RAY_NODE_NAME,
25
+ TAG_RAY_NODE_STATUS,
26
+ TAG_RAY_USER_NODE_TYPE,
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ TAG_BATCH_DELAY = 1
32
+ STOPPING_NODE_DELAY = 1
33
+
34
+
35
class AliyunNodeProvider(NodeProvider):
    """Ray autoscaler ``NodeProvider`` backed by Aliyun ECS.

    Instances are created/queried through an :class:`AcsClient` wrapper
    around the Aliyun SDK.  Tag writes are batched: concurrent
    ``set_node_tags`` callers accumulate pending tags and a single
    "batching thread" flushes them all after ``TAG_BATCH_DELAY`` seconds.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # When True, "terminated" nodes are merely stopped so a later
        # create_node() can restart them instead of launching new ones.
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        self.acs = AcsClient(
            access_key=provider_config["access_key"],
            access_key_secret=provider_config["access_key_secret"],
            region_id=provider_config["region"],
            max_retries=BOTO_MAX_RETRIES,
        )

        # Try availability zones round-robin, starting from random offset
        self.subnet_idx = random.randint(0, 100)

        # Tags that we believe to actually be on the node.
        self.tag_cache = {}
        # Tags that we will soon upload.
        self.tag_cache_pending = defaultdict(dict)
        # Number of threads waiting for a batched tag update.
        self.batch_thread_count = 0
        self.batch_update_done = threading.Event()
        self.batch_update_done.set()
        self.ready_for_new_batch = threading.Event()
        self.ready_for_new_batch.set()
        self.tag_cache_lock = threading.Lock()
        self.count_lock = threading.Lock()

        # Cache of node objects from the last nodes() call. This avoids
        # excessive DescribeInstances requests.
        self.cached_nodes = {}

    def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]:
        """Return IDs of cluster instances in Running or Pending state.

        Side effect: refreshes ``self.cached_nodes`` for the matched nodes.
        """
        tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
        ]
        for k, v in tag_filters.items():
            tags.append(
                {
                    "Key": k,
                    "Value": v,
                }
            )

        instances = self.acs.describe_instances(tags=tags)
        non_terminated_instance = []
        for instance in instances:
            if instance.get("Status") == RUNNING or instance.get("Status") == PENDING:
                non_terminated_instance.append(instance.get("InstanceId"))
                self.cached_nodes[instance.get("InstanceId")] = instance
        return non_terminated_instance

    def is_running(self, node_id: str) -> bool:
        """Return True if the instance is currently in the Running state."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            instance = instances[0]
            return instance.get("Status") == "Running"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def is_terminated(self, node_id: str) -> bool:
        """Return True if the instance is Stopped (treated as terminated)."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            return instance.get("Status") == "Stopped"
        cli_logger.error("Invalid node id: %s", node_id)
        return False

    def node_tags(self, node_id: str) -> Dict[str, str]:
        """Return the node's tags as a flat {key: value} dict (may be empty)."""
        instances = self.acs.describe_instances(instance_ids=[node_id])
        if instances is not None:
            assert len(instances) == 1
            instance = instances[0]
            if instance.get("Tags") is not None:
                node_tags = dict()
                for tag in instance.get("Tags").get("Tag"):
                    node_tags[tag.get("TagKey")] = tag.get("TagValue")
                return node_tags
        return dict()

    def external_ip(self, node_id: str) -> str:
        """Return the node's first public IP, polling until one is assigned.

        NOTE(review): loops forever if the instance never gets a public IP.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances)
                instance = instances[0]
                if (
                    instance.get("PublicIpAddress") is not None
                    and instance.get("PublicIpAddress").get("IpAddress") is not None
                ):
                    if len(instance.get("PublicIpAddress").get("IpAddress")) > 0:
                        return instance.get("PublicIpAddress").get("IpAddress")[0]
            cli_logger.error("PublicIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def internal_ip(self, node_id: str) -> str:
        """Return the node's first VPC-private IP, polling until available.

        NOTE(review): loops forever if the instance never reports one.
        """
        while True:
            instances = self.acs.describe_instances(instance_ids=[node_id])
            if instances is not None:
                assert len(instances) == 1
                instance = instances[0]
                if (
                    instance.get("VpcAttributes") is not None
                    and instance.get("VpcAttributes").get("PrivateIpAddress")
                    is not None
                    and len(
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")
                    )
                    > 0
                ):
                    return (
                        instance.get("VpcAttributes")
                        .get("PrivateIpAddress")
                        .get("IpAddress")[0]
                    )
            cli_logger.error("InnerIpAddress attribute is not exist. %s" % instance)
            time.sleep(STOPPING_NODE_DELAY)

    def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None:
        """Queue tag updates for the node; flushed in a shared batch.

        The first caller of a batch becomes the "batching thread": it waits
        TAG_BATCH_DELAY so concurrent callers can pile on, then uploads all
        pending tags at once. Every caller blocks until the batch completes.
        """
        is_batching_thread = False
        with self.tag_cache_lock:
            if not self.tag_cache_pending:
                is_batching_thread = True
                # Wait for threads in the last batch to exit
                self.ready_for_new_batch.wait()
                self.ready_for_new_batch.clear()
                self.batch_update_done.clear()
            self.tag_cache_pending[node_id].update(tags)

        if is_batching_thread:
            time.sleep(TAG_BATCH_DELAY)
            with self.tag_cache_lock:
                self._update_node_tags()
                self.batch_update_done.set()

        with self.count_lock:
            self.batch_thread_count += 1
        self.batch_update_done.wait()

        with self.count_lock:
            self.batch_thread_count -= 1
            if self.batch_thread_count == 0:
                self.ready_for_new_batch.set()

    def _update_node_tags(self):
        """Flush all pending tags, grouping nodes by identical (k, v) pairs."""
        batch_updates = defaultdict(list)

        for node_id, tags in self.tag_cache_pending.items():
            for x in tags.items():
                batch_updates[x].append(node_id)
            self.tag_cache[node_id] = tags

        self.tag_cache_pending = defaultdict(dict)

        self._create_tags(batch_updates)

    def _create_tags(self, batch_updates):
        """Apply each grouped (key, value) tag to its list of node ids."""
        for (k, v), node_ids in batch_updates.items():
            m = "Set tag {}={} on {}".format(k, v, node_ids)
            with LogTimer("AliyunNodeProvider: {}".format(m)):
                # The ray node-name tag maps onto the cloud "Name" tag.
                if k == TAG_RAY_NODE_NAME:
                    k = "Name"

                self.acs.tag_resource(node_ids, [{"Key": k, "Value": v}])

    def create_node(
        self, node_config: Dict[str, Any], tags: Dict[str, str], count: int
    ) -> Optional[Dict[str, Any]]:
        """Create ``count`` nodes, reusing stopped instances when enabled.

        Returns a dict mapping instance id -> instance description for all
        (reused + newly launched) nodes.
        """
        filter_tags = [
            {
                "Key": TAG_RAY_CLUSTER_NAME,
                "Value": self.cluster_name,
            },
            {"Key": TAG_RAY_NODE_KIND, "Value": tags[TAG_RAY_NODE_KIND]},
            {"Key": TAG_RAY_USER_NODE_TYPE, "Value": tags[TAG_RAY_USER_NODE_TYPE]},
            {"Key": TAG_RAY_LAUNCH_CONFIG, "Value": tags[TAG_RAY_LAUNCH_CONFIG]},
            {"Key": TAG_RAY_NODE_NAME, "Value": tags[TAG_RAY_NODE_NAME]},
        ]

        reused_nodes_dict = {}
        if self.cache_stopped_nodes:
            reuse_nodes_candidate = self.acs.describe_instances(tags=filter_tags)
            if reuse_nodes_candidate:
                with cli_logger.group("Stopping instances to reuse"):
                    reuse_node_ids = []
                    for node in reuse_nodes_candidate:
                        node_id = node.get("InstanceId")
                        status = node.get("Status")
                        if status != STOPPING and status != STOPPED:
                            continue
                        if status == STOPPING:
                            # wait for node stopped
                            while (
                                self.acs.describe_instances(instance_ids=[node_id])[
                                    0
                                ].get("Status")
                                == STOPPING
                            ):
                                # Fix: use the module logger, not the root
                                # logging module, for consistent output.
                                logger.info("wait for %s stop" % node_id)
                                time.sleep(STOPPING_NODE_DELAY)
                        reuse_node_ids.append(node_id)
                        reused_nodes_dict[node.get("InstanceId")] = node
                        self.acs.start_instance(node_id)
                        self.tag_cache[node_id] = node.get("Tags")
                        self.set_node_tags(node_id, tags)
                        if len(reuse_node_ids) == count:
                            break
                count -= len(reuse_node_ids)

        created_nodes_dict = {}
        if count > 0:
            filter_tags.append(
                {"Key": TAG_RAY_NODE_STATUS, "Value": tags[TAG_RAY_NODE_STATUS]}
            )
            instance_id_sets = self.acs.run_instances(
                instance_type=node_config["InstanceType"],
                image_id=node_config["ImageId"],
                tags=filter_tags,
                amount=count,
                vswitch_id=self.provider_config["v_switch_id"],
                security_group_id=self.provider_config["security_group_id"],
                key_pair_name=self.provider_config["key_name"],
            )
            instances = self.acs.describe_instances(instance_ids=instance_id_sets)

            if instances is not None:
                for instance in instances:
                    created_nodes_dict[instance.get("InstanceId")] = instance

        all_created_nodes = reused_nodes_dict
        all_created_nodes.update(created_nodes_dict)
        return all_created_nodes

    def terminate_node(self, node_id: str) -> None:
        """Terminate (or stop, when caching is enabled) a single node."""
        logger.info("terminate node: %s" % node_id)
        if self.cache_stopped_nodes:
            # Fix: the original called .format(node_id) on the RETURN VALUE
            # of logger.info (None), raising AttributeError at runtime.
            logger.info(
                "Stopping instance {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_id)
            )
            self.acs.stop_instance(node_id)
        else:
            self.acs.delete_instance(node_id)

    def terminate_nodes(self, node_ids: List[str]) -> None:
        """Terminate (or stop, when caching is enabled) a batch of nodes."""
        if not node_ids:
            return
        if self.cache_stopped_nodes:
            logger.info(
                "Stopping instances {} (to terminate instead, "
                "set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration)".format(node_ids)
            )

            self.acs.stop_instances(node_ids)
        else:
            self.acs.delete_instances(node_ids)

    def _get_node(self, node_id):
        """Refresh and get info for this node, updating the cache."""
        self.non_terminated_nodes({})  # Side effect: updates cache

        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        # Node not in {pending, running} -- retry with a point query. This
        # usually means the node was recently preempted or terminated.
        matches = self.acs.describe_instances(instance_ids=[node_id])

        assert len(matches) == 1, "Invalid instance id {}".format(node_id)
        return matches[0]

    def _get_cached_node(self, node_id):
        """Return node info from cache if possible, otherwise fetches it."""
        if node_id in self.cached_nodes:
            return self.cached_nodes[node_id]

        return self._get_node(node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Delegate cluster-config bootstrapping to bootstrap_aliyun."""
        return bootstrap_aliyun(cluster_config)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aliyun/utils.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+
4
+ from aliyunsdkcore import client
5
+ from aliyunsdkcore.acs_exception.exceptions import ClientException, ServerException
6
+ from aliyunsdkecs.request.v20140526.AllocatePublicIpAddressRequest import (
7
+ AllocatePublicIpAddressRequest,
8
+ )
9
+ from aliyunsdkecs.request.v20140526.AuthorizeSecurityGroupRequest import (
10
+ AuthorizeSecurityGroupRequest,
11
+ )
12
+ from aliyunsdkecs.request.v20140526.CreateInstanceRequest import CreateInstanceRequest
13
+ from aliyunsdkecs.request.v20140526.CreateKeyPairRequest import CreateKeyPairRequest
14
+ from aliyunsdkecs.request.v20140526.CreateSecurityGroupRequest import (
15
+ CreateSecurityGroupRequest,
16
+ )
17
+ from aliyunsdkecs.request.v20140526.CreateVpcRequest import CreateVpcRequest
18
+ from aliyunsdkecs.request.v20140526.CreateVSwitchRequest import CreateVSwitchRequest
19
+ from aliyunsdkecs.request.v20140526.DeleteInstanceRequest import DeleteInstanceRequest
20
+ from aliyunsdkecs.request.v20140526.DeleteInstancesRequest import DeleteInstancesRequest
21
+ from aliyunsdkecs.request.v20140526.DeleteKeyPairsRequest import DeleteKeyPairsRequest
22
+ from aliyunsdkecs.request.v20140526.DescribeInstancesRequest import (
23
+ DescribeInstancesRequest,
24
+ )
25
+ from aliyunsdkecs.request.v20140526.DescribeKeyPairsRequest import (
26
+ DescribeKeyPairsRequest,
27
+ )
28
+ from aliyunsdkecs.request.v20140526.DescribeSecurityGroupsRequest import (
29
+ DescribeSecurityGroupsRequest,
30
+ )
31
+ from aliyunsdkecs.request.v20140526.DescribeVpcsRequest import DescribeVpcsRequest
32
+ from aliyunsdkecs.request.v20140526.DescribeVSwitchesRequest import (
33
+ DescribeVSwitchesRequest,
34
+ )
35
+ from aliyunsdkecs.request.v20140526.ImportKeyPairRequest import ImportKeyPairRequest
36
+ from aliyunsdkecs.request.v20140526.RunInstancesRequest import RunInstancesRequest
37
+ from aliyunsdkecs.request.v20140526.StartInstanceRequest import StartInstanceRequest
38
+ from aliyunsdkecs.request.v20140526.StopInstanceRequest import StopInstanceRequest
39
+ from aliyunsdkecs.request.v20140526.StopInstancesRequest import StopInstancesRequest
40
+ from aliyunsdkecs.request.v20140526.TagResourcesRequest import TagResourcesRequest
41
+
42
+
43
+ class AcsClient:
44
+ """
45
+ A wrapper around Aliyun SDK. We use this wrapper in aliyun node provider.
46
+
47
+ Parameters:
48
+ access_key: The AccessKey ID of your aliyun account.
49
+ access_key_secret: The AccessKey secret of your aliyun account.
50
+ region_id: A region is a geographic area where a data center resides.
51
+ Region_id is the ID of region (e.g., cn-hangzhou,
52
+ us-west-1, etc.)
53
+ max_retries: The maximum number of retries each connection.
54
+ """
55
+
56
+ def __init__(self, access_key, access_key_secret, region_id, max_retries):
57
+ self.cli = client.AcsClient(
58
+ ak=access_key,
59
+ secret=access_key_secret,
60
+ max_retry_time=max_retries,
61
+ region_id=region_id,
62
+ )
63
+
64
+ def describe_instances(self, tags=None, instance_ids=None):
65
+ """Query the details of one or more Elastic Compute Service (ECS) instances.
66
+
67
+ :param tags: The tags of the instance.
68
+ :param instance_ids: The IDs of ECS instances
69
+ :return: ECS instance list
70
+ """
71
+ request = DescribeInstancesRequest()
72
+ if tags is not None:
73
+ request.set_Tags(tags)
74
+ if instance_ids is not None:
75
+ request.set_InstanceIds(instance_ids)
76
+ response = self._send_request(request)
77
+ if response is not None:
78
+ instance_list = response.get("Instances").get("Instance")
79
+ return instance_list
80
+ return None
81
+
82
+ def create_instance(
83
+ self,
84
+ instance_type,
85
+ image_id,
86
+ tags,
87
+ key_pair_name,
88
+ optimized="optimized",
89
+ instance_charge_type="PostPaid",
90
+ spot_strategy="SpotWithPriceLimit",
91
+ internet_charge_type="PayByTraffic",
92
+ internet_max_bandwidth_out=5,
93
+ ):
94
+ """Create a subscription or pay-as-you-go ECS instance.
95
+
96
+ :param instance_type: The instance type of the ECS.
97
+ :param image_id: The ID of the image used to create the instance.
98
+ :param tags: The tags of the instance.
99
+ :param key_pair_name: The name of the key pair to be bound to
100
+ the instance.
101
+ :param optimized: Specifies whether the instance is I/O optimized
102
+ :param instance_charge_type: The billing method of the instance.
103
+ Default value: PostPaid.
104
+ :param spot_strategy: The preemption policy for the pay-as-you-go
105
+ instance.
106
+ :param internet_charge_type: The billing method for network usage.
107
+ Default value: PayByTraffic.
108
+ :param internet_max_bandwidth_out: The maximum inbound public
109
+ bandwidth. Unit: Mbit/s.
110
+ :return: The created instance ID.
111
+ """
112
+ request = CreateInstanceRequest()
113
+ request.set_InstanceType(instance_type)
114
+ request.set_ImageId(image_id)
115
+ request.set_IoOptimized(optimized)
116
+ request.set_InstanceChargeType(instance_charge_type)
117
+ request.set_SpotStrategy(spot_strategy)
118
+ request.set_InternetChargeType(internet_charge_type)
119
+ request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
120
+ request.set_KeyPairName(key_pair_name)
121
+ request.set_Tags(tags)
122
+
123
+ response = self._send_request(request)
124
+ if response is not None:
125
+ instance_id = response.get("InstanceId")
126
+ logging.info("instance %s created task submit successfully.", instance_id)
127
+ return instance_id
128
+ logging.error("instance created failed.")
129
+ return None
130
+
131
+ def run_instances(
132
+ self,
133
+ instance_type,
134
+ image_id,
135
+ tags,
136
+ security_group_id,
137
+ vswitch_id,
138
+ key_pair_name,
139
+ amount=1,
140
+ optimized="optimized",
141
+ instance_charge_type="PostPaid",
142
+ spot_strategy="SpotWithPriceLimit",
143
+ internet_charge_type="PayByTraffic",
144
+ internet_max_bandwidth_out=1,
145
+ ):
146
+ """Create one or more pay-as-you-go or subscription
147
+ Elastic Compute Service (ECS) instances
148
+
149
+ :param instance_type: The instance type of the ECS.
150
+ :param image_id: The ID of the image used to create the instance.
151
+ :param tags: The tags of the instance.
152
+ :param security_group_id: The ID of the security group to which to
153
+ assign the instance. Instances in the same
154
+ security group can communicate with
155
+ each other.
156
+ :param vswitch_id: The ID of the vSwitch to which to connect
157
+ the instance.
158
+ :param key_pair_name: The name of the key pair to be bound to
159
+ the instance.
160
+ :param amount: The number of instances that you want to create.
161
+ :param optimized: Specifies whether the instance is I/O optimized
162
+ :param instance_charge_type: The billing method of the instance.
163
+ Default value: PostPaid.
164
+ :param spot_strategy: The preemption policy for the pay-as-you-go
165
+ instance.
166
+ :param internet_charge_type: The billing method for network usage.
167
+ Default value: PayByTraffic.
168
+ :param internet_max_bandwidth_out: The maximum inbound public
169
+ bandwidth. Unit: Mbit/s.
170
+ :return: The created instance IDs.
171
+ """
172
+ request = RunInstancesRequest()
173
+ request.set_InstanceType(instance_type)
174
+ request.set_ImageId(image_id)
175
+ request.set_IoOptimized(optimized)
176
+ request.set_InstanceChargeType(instance_charge_type)
177
+ request.set_SpotStrategy(spot_strategy)
178
+ request.set_InternetChargeType(internet_charge_type)
179
+ request.set_InternetMaxBandwidthOut(internet_max_bandwidth_out)
180
+ request.set_Tags(tags)
181
+ request.set_Amount(amount)
182
+ request.set_SecurityGroupId(security_group_id)
183
+ request.set_VSwitchId(vswitch_id)
184
+ request.set_KeyPairName(key_pair_name)
185
+
186
+ response = self._send_request(request)
187
+ if response is not None:
188
+ instance_ids = response.get("InstanceIdSets").get("InstanceIdSet")
189
+ return instance_ids
190
+ logging.error("instance created failed.")
191
+ return None
192
+
193
+ def create_security_group(self, vpc_id):
194
+ """Create a security group
195
+
196
+ :param vpc_id: The ID of the VPC in which to create
197
+ the security group.
198
+ :return: The created security group ID.
199
+ """
200
+ request = CreateSecurityGroupRequest()
201
+ request.set_VpcId(vpc_id)
202
+ response = self._send_request(request)
203
+ if response is not None:
204
+ security_group_id = response.get("SecurityGroupId")
205
+ return security_group_id
206
+ return None
207
+
208
+ def describe_security_groups(self, vpc_id=None, tags=None):
209
+ """Query basic information of security groups.
210
+
211
+ :param vpc_id: The ID of the VPC to which the security group belongs.
212
+ :param tags: The tags of the security group.
213
+ :return: Security group list.
214
+ """
215
+ request = DescribeSecurityGroupsRequest()
216
+ if vpc_id is not None:
217
+ request.set_VpcId(vpc_id)
218
+ if tags is not None:
219
+ request.set_Tags(tags)
220
+ response = self._send_request(request)
221
+ if response is not None:
222
+ security_groups = response.get("SecurityGroups").get("SecurityGroup")
223
+ return security_groups
224
+ logging.error("describe security group failed.")
225
+ return None
226
+
227
+ def authorize_security_group(
228
+ self, ip_protocol, port_range, security_group_id, source_cidr_ip
229
+ ):
230
+ """Create an inbound security group rule.
231
+
232
+ :param ip_protocol: The transport layer protocol.
233
+ :param port_range: The range of destination ports relevant to
234
+ the transport layer protocol.
235
+ :param security_group_id: The ID of the destination security group.
236
+ :param source_cidr_ip: The range of source IPv4 addresses.
237
+ CIDR blocks and IPv4 addresses are supported.
238
+ """
239
+ request = AuthorizeSecurityGroupRequest()
240
+ request.set_IpProtocol(ip_protocol)
241
+ request.set_PortRange(port_range)
242
+ request.set_SecurityGroupId(security_group_id)
243
+ request.set_SourceCidrIp(source_cidr_ip)
244
+ self._send_request(request)
245
+
246
+ def create_v_switch(self, vpc_id, zone_id, cidr_block):
247
+ """Create vSwitches to divide the VPC into one or more subnets
248
+
249
+ :param vpc_id: The ID of the VPC to which the VSwitch belongs.
250
+ :param zone_id: The ID of the zone to which
251
+ the target VSwitch belongs.
252
+ :param cidr_block: The CIDR block of the VSwitch.
253
+ :return:
254
+ """
255
+ request = CreateVSwitchRequest()
256
+ request.set_ZoneId(zone_id)
257
+ request.set_VpcId(vpc_id)
258
+ request.set_CidrBlock(cidr_block)
259
+ response = self._send_request(request)
260
+ if response is not None:
261
+ return response.get("VSwitchId")
262
+ else:
263
+ logging.error("create_v_switch vpc_id %s failed.", vpc_id)
264
+ return None
265
+
266
+ def create_vpc(self):
267
+ """Creates a virtual private cloud (VPC).
268
+
269
+ :return: The created VPC ID.
270
+ """
271
+ request = CreateVpcRequest()
272
+ response = self._send_request(request)
273
+ if response is not None:
274
+ return response.get("VpcId")
275
+ return None
276
+
277
+ def describe_vpcs(self):
278
+ """Queries one or more VPCs in a region.
279
+
280
+ :return: VPC list.
281
+ """
282
+ request = DescribeVpcsRequest()
283
+ response = self._send_request(request)
284
+ if response is not None:
285
+ return response.get("Vpcs").get("Vpc")
286
+ return None
287
+
288
+ def tag_resource(self, resource_ids, tags, resource_type="instance"):
289
+ """Create and bind tags to specified ECS resources.
290
+
291
+ :param resource_ids: The IDs of N resources.
292
+ :param tags: The tags of the resource.
293
+ :param resource_type: The type of the resource.
294
+ """
295
+ request = TagResourcesRequest()
296
+ request.set_Tags(tags)
297
+ request.set_ResourceType(resource_type)
298
+ request.set_ResourceIds(resource_ids)
299
+ response = self._send_request(request)
300
+ if response is not None:
301
+ logging.info("instance %s create tag successfully.", resource_ids)
302
+ else:
303
+ logging.error("instance %s create tag failed.", resource_ids)
304
+
305
+ def start_instance(self, instance_id):
306
+ """Start an ECS instance.
307
+
308
+ :param instance_id: The Ecs instance ID.
309
+ """
310
+ request = StartInstanceRequest()
311
+ request.set_InstanceId(instance_id)
312
+ response = self._send_request(request)
313
+
314
+ if response is not None:
315
+ logging.info("instance %s start successfully.", instance_id)
316
+ else:
317
+ logging.error("instance %s start failed.", instance_id)
318
+
319
+ def stop_instance(self, instance_id, force_stop=False):
320
+ """Stop an ECS instance that is in the Running state.
321
+
322
+ :param instance_id: The Ecs instance ID.
323
+ :param force_stop: Specifies whether to forcibly stop the instance.
324
+ :return:
325
+ """
326
+ request = StopInstanceRequest()
327
+ request.set_InstanceId(instance_id)
328
+ request.set_ForceStop(force_stop)
329
+ logging.info("Stop %s command submit successfully.", instance_id)
330
+ self._send_request(request)
331
+
332
+ def stop_instances(self, instance_ids, stopped_mode="StopCharging"):
333
+ """Stop one or more ECS instances that are in the Running state.
334
+
335
+ :param instance_ids: The IDs of instances.
336
+ :param stopped_mode: Specifies whether billing for the instance
337
+ continues after the instance is stopped.
338
+ """
339
+ request = StopInstancesRequest()
340
+ request.set_InstanceIds(instance_ids)
341
+ request.set_StoppedMode(stopped_mode)
342
+ response = self._send_request(request)
343
+ if response is None:
344
+ logging.error("stop_instances failed")
345
+
346
+ def delete_instance(self, instance_id):
347
+ """Release a pay-as-you-go instance or
348
+ an expired subscription instance.
349
+
350
+ :param instance_id: The ID of the instance that you want to release.
351
+ """
352
+ request = DeleteInstanceRequest()
353
+ request.set_InstanceId(instance_id)
354
+ request.set_Force(True)
355
+ logging.info("Delete %s command submit successfully", instance_id)
356
+ self._send_request(request)
357
+
358
+ def delete_instances(self, instance_ids):
359
+ """Release one or more pay-as-you-go instances or
360
+ expired subscription instances.
361
+
362
+ :param instance_ids: The IDs of instances that you want to release.
363
+ """
364
+ request = DeleteInstancesRequest()
365
+ request.set_Force(True)
366
+ request.set_InstanceIds(instance_ids)
367
+ self._send_request(request)
368
+
369
+ def allocate_public_address(self, instance_id):
370
+ """Assign a public IP address to an ECS instance.
371
+
372
+ :param instance_id: The ID of the instance to which you want to
373
+ assign a public IP address.
374
+ :return: The assigned ip.
375
+ """
376
+ request = AllocatePublicIpAddressRequest()
377
+ request.set_InstanceId(instance_id)
378
+ response = self._send_request(request)
379
+ if response is not None:
380
+ return response.get("IpAddress")
381
+
382
+ def create_key_pair(self, key_pair_name):
383
+ """Create an SSH key pair.
384
+
385
+ :param key_pair_name: The name of the key pair.
386
+ :return: The created keypair data.
387
+ """
388
+ request = CreateKeyPairRequest()
389
+ request.set_KeyPairName(key_pair_name)
390
+ response = self._send_request(request)
391
+ if response is not None:
392
+ logging.info("Create Key Pair %s Successfully", response.get("KeyPairId"))
393
+ return response
394
+ else:
395
+ logging.error("Create Key Pair Failed")
396
+ return None
397
+
398
+ def import_key_pair(self, key_pair_name, public_key_body):
399
+ """Import the public key of an RSA-encrypted key pair
400
+ that is generated by a third-party tool.
401
+
402
+ :param key_pair_name: The name of the key pair.
403
+ :param public_key_body: The public key of the key pair.
404
+ """
405
+ request = ImportKeyPairRequest()
406
+ request.set_KeyPairName(key_pair_name)
407
+ request.set_PublicKeyBody(public_key_body)
408
+ self._send_request(request)
409
+
410
+ def delete_key_pairs(self, key_pair_names):
411
+ """Delete one or more SSH key pairs.
412
+
413
+ :param key_pair_names: The name of the key pair.
414
+ :return:
415
+ """
416
+ request = DeleteKeyPairsRequest()
417
+ request.set_KeyPairNames(key_pair_names)
418
+ self._send_request(request)
419
+
420
+ def describe_key_pairs(self, key_pair_name=None):
421
+ """Query one or more key pairs.
422
+
423
+ :param key_pair_name: The name of the key pair.
424
+ :return:
425
+ """
426
+ request = DescribeKeyPairsRequest()
427
+ if key_pair_name is not None:
428
+ request.set_KeyPairName(key_pair_name)
429
+ response = self._send_request(request)
430
+ if response is not None:
431
+ return response.get("KeyPairs").get("KeyPair")
432
+ else:
433
+ return None
434
+
435
+ def describe_v_switches(self, vpc_id=None):
436
+ """Queries one or more VSwitches.
437
+
438
+ :param vpc_id: The ID of the VPC to which the VSwitch belongs.
439
+ :return: VSwitch list.
440
+ """
441
+ request = DescribeVSwitchesRequest()
442
+ if vpc_id is not None:
443
+ request.set_VpcId(vpc_id)
444
+ response = self._send_request(request)
445
+ if response is not None:
446
+ return response.get("VSwitches").get("VSwitch")
447
+ else:
448
+ logging.error("Describe VSwitches Failed.")
449
+ return None
450
+
451
+ def _send_request(self, request):
452
+ """send open api request"""
453
+ request.set_accept_format("json")
454
+ try:
455
+ response_str = self.cli.do_action_with_exception(request)
456
+ response_detail = json.loads(response_str)
457
+ return response_detail
458
+ except (ClientException, ServerException) as e:
459
+ logging.error(request.get_action_name())
460
+ logging.error(e)
461
+ return None
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/aws/__pycache__/utils.cpython-311.pyc ADDED
Binary file (8.36 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/autoscaling_config.cpython-311.pyc ADDED
Binary file (16.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (24.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/run_autoscaler.cpython-311.pyc ADDED
Binary file (5.01 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/__pycache__/utils.cpython-311.pyc ADDED
Binary file (4.45 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/node_provider.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import logging
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+ from collections import defaultdict
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ import requests
10
+
11
+ from ray.autoscaler._private.constants import WORKER_LIVENESS_CHECK_KEY
12
+ from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType
13
+ from ray.autoscaler.batching_node_provider import (
14
+ BatchingNodeProvider,
15
+ NodeData,
16
+ ScaleRequest,
17
+ )
18
+ from ray.autoscaler.tags import (
19
+ NODE_KIND_HEAD,
20
+ NODE_KIND_WORKER,
21
+ STATUS_UP_TO_DATE,
22
+ STATUS_UPDATE_FAILED,
23
+ TAG_RAY_USER_NODE_TYPE,
24
+ )
25
+
26
# Key for KubeRay label that identifies a Ray pod as head or worker.
KUBERAY_LABEL_KEY_KIND = "ray.io/node-type"
# Key for KubeRay label that identifies the worker group (autoscaler node type) of a
# Ray pod.
KUBERAY_LABEL_KEY_TYPE = "ray.io/group"

# These should be synced with:
# https://github.com/ray-project/kuberay/blob/f2d94ffe213dd8f69481b09c474047cb899fa73b/ray-operator/apis/ray/v1/raycluster_types.go#L165-L171 # noqa
# Kind label value indicating the pod is the head.
KUBERAY_KIND_HEAD = "head"
# Kind label value indicating the pod is the worker.
KUBERAY_KIND_WORKER = "worker"

# KubeRay CRD version
KUBERAY_CRD_VER = os.getenv("KUBERAY_CRD_VER", "v1alpha1")

# Timeout, in seconds, applied to HTTP requests against the K8s API server.
KUBERAY_REQUEST_TIMEOUT_S = int(os.getenv("KUBERAY_REQUEST_TIMEOUT_S", 60))

# Name of the Ray head pod; set by the environment when running in-cluster.
RAY_HEAD_POD_NAME = os.getenv("RAY_HEAD_POD_NAME")

# https://kubernetes.io/docs/tasks/run-application/access-api-from-pod
# While running in a Pod, your container can create an HTTPS URL for the
# Kubernetes API server by fetching the KUBERNETES_SERVICE_HOST and
# KUBERNETES_SERVICE_PORT_HTTPS environment variables.
KUBERNETES_SERVICE_HOST = os.getenv(
    "KUBERNETES_SERVICE_HOST", "https://kubernetes.default"
)
KUBERNETES_SERVICE_PORT = os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "443")
KUBERNETES_HOST = f"{KUBERNETES_SERVICE_HOST}:{KUBERNETES_SERVICE_PORT}"
# Key for GKE label that identifies which multi-host replica a pod belongs to
REPLICA_INDEX_KEY = "replicaIndex"

# How often the in-pod service-account token and CA cert are re-read from disk.
TOKEN_REFRESH_PERIOD = datetime.timedelta(minutes=1)

# Design:

# Each modification the autoscaler wants to make is posted to the API server goal state
# (e.g. if the autoscaler wants to scale up, it increases the number of
# replicas of the worker group it wants to scale, if it wants to scale down
# it decreases the number of replicas and adds the exact pods that should be
# terminated to the scaleStrategy).

# KubeRayNodeProvider inherits from BatchingNodeProvider.
# Thus, the autoscaler's create and terminate requests are batched into a single
# Scale Request object which is submitted at the end of autoscaler update.
# KubeRay node provider converts the ScaleRequest into a RayCluster CR patch
# and applies the patch in the submit_scale_request method.

# To reduce potential for race conditions, KubeRayNodeProvider
# aborts the autoscaler update if the operator has not yet processed workersToDelete -
# see KubeRayNodeProvider.safe_to_scale().
# Once it is confirmed that workersToDelete have been cleaned up, KubeRayNodeProvider
# clears the workersToDelete list.


# Note: Log handlers set up in autoscaling monitor entrypoint.
logger = logging.getLogger(__name__)
83
+
84
+
85
def node_data_from_pod(pod: Dict[str, Any]) -> NodeData:
    """Convert a Ray pod dict (as returned by the K8s API) into the NodeData
    record consumed by BatchingNodeProvider.
    """
    node_kind, node_type = kind_and_type(pod)
    node_status = status_tag(pod)
    node_ip = pod_ip(pod)
    replica_idx = _replica_index_label(pod)
    return NodeData(
        kind=node_kind,
        type=node_type,
        replica_index=replica_idx,
        status=node_status,
        ip=node_ip,
    )
96
+
97
+
98
def kind_and_type(pod: Dict[str, Any]) -> Tuple[NodeKind, NodeType]:
    """Read the Ray node kind (head or worker) and the node type
    (worker group name) from a Ray pod's labels.
    """
    labels = pod["metadata"]["labels"]
    if labels[KUBERAY_LABEL_KEY_KIND] == KUBERAY_KIND_HEAD:
        kind = NODE_KIND_HEAD
    else:
        kind = NODE_KIND_WORKER
    return kind, labels[KUBERAY_LABEL_KEY_TYPE]
110
+
111
+
112
def _replica_index_label(pod: Dict[str, Any]) -> Optional[str]:
    """Return the replicaIndex label of a Pod in a multi-host TPU worker
    group, or None when the label is absent.

    The label is set by the GKE TPU Ray webhook and has the form
    {$WORKER_GROUP_NAME-$REPLICA_INDEX}, with $REPLICA_INDEX an integer
    in [0, Replicas-1].
    """
    return pod["metadata"]["labels"].get(REPLICA_INDEX_KEY)
120
+
121
+
122
def pod_ip(pod: Dict[str, Any]) -> NodeIP:
    """Return the pod's IP, or a placeholder string when K8s has not
    assigned one yet."""
    status = pod["status"]
    return status.get("podIP", "IP not yet assigned")
124
+
125
+
126
def status_tag(pod: Dict[str, Any]) -> NodeStatus:
    """Map a pod's first container state onto a Ray autoscaler node status.

    See the docstring of batching_node_provider.NodeData for the semantics
    of node status.

    Raises:
        ValueError: If the container state matches none of the known keys.
    """
    # No container statuses yet means the pod is still being scheduled.
    container_statuses = pod["status"].get("containerStatuses")
    if not container_statuses:
        return "pending"

    state = container_statuses[0]["state"]
    if "pending" in state:
        return "pending"
    if "running" in state:
        return STATUS_UP_TO_DATE
    if "waiting" in state:
        return "waiting"
    if "terminated" in state:
        return STATUS_UPDATE_FAILED
    raise ValueError("Unexpected container state.")
149
+
150
+
151
def worker_delete_patch(group_index: str, workers_to_delete: List[NodeID]):
    """Build a JSON patch that replaces a worker group's scaleStrategy with
    the given workersToDelete list."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/scaleStrategy",
        {"workersToDelete": workers_to_delete},
    )
155
+
156
+
157
def worker_replica_patch(group_index: str, target_replicas: int):
    """Build a JSON patch that sets a worker group's replica count."""
    return replace_patch(
        f"/spec/workerGroupSpecs/{group_index}/replicas",
        target_replicas,
    )
161
+
162
+
163
def replace_patch(path: str, value: Any) -> Dict[str, Any]:
    """Build a single JSON-patch (RFC 6902) 'replace' operation."""
    return dict(op="replace", path=path, value=value)
165
+
166
+
167
def load_k8s_secrets() -> Tuple[Dict[str, str], str]:
    """Load the credentials needed to talk to the K8s API server
    from the in-pod service account mount.

    Returns:
        headers: Request headers carrying the bearer token.
        verify: Path to the cluster CA certificate.
    """
    token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token"
    with open(token_path) as token_file:
        bearer_token = token_file.read()

    auth_headers = {
        "Authorization": "Bearer " + bearer_token,
    }
    ca_cert_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
    return auth_headers, ca_cert_path
184
+
185
+
186
def url_from_resource(
    namespace: str,
    path: str,
    kuberay_crd_version: str = KUBERAY_CRD_VER,
    kubernetes_host: str = KUBERNETES_HOST,
) -> str:
    """Convert a resource path into a REST URL for the Kubernetes API server.

    Args:
        namespace: The K8s namespace of the resource.
        path: The part of the resource path that starts with the resource
            type. Supported resource types are "pods" and "rayclusters".
        kuberay_crd_version: The API version of the KubeRay CRD
            (e.g. "v1alpha1", "v1").
        kubernetes_host: The host of the Kubernetes API server. Defaults to
            a value built from $KUBERNETES_SERVICE_HOST and
            $KUBERNETES_SERVICE_PORT ("https://kubernetes.default:443" when
            unset). Since $KUBERNETES_SERVICE_HOST may be a bare IP address,
            the https scheme is prepended here when missing.

    Raises:
        ValueError: If the host explicitly uses plain HTTP.
        NotImplementedError: If the path names an unsupported resource type.
    """
    if kubernetes_host.startswith("http://"):
        raise ValueError("Kubernetes host must be accessed over HTTPS.")
    if not kubernetes_host.startswith("https://"):
        kubernetes_host = "https://" + kubernetes_host

    # Pods are core-API objects; rayclusters live in the ray.io API group.
    if path.startswith("pods"):
        api_group = "/api/v1"
    elif path.startswith("rayclusters"):
        api_group = "/apis/ray.io/" + kuberay_crd_version
    else:
        raise NotImplementedError("Tried to access unknown entity at {}".format(path))

    return f"{kubernetes_host}{api_group}/namespaces/{namespace}/{path}"
221
+
222
+
223
+ def _worker_group_index(raycluster: Dict[str, Any], group_name: str) -> int:
224
+ """Extract worker group index from RayCluster."""
225
+ group_names = [
226
+ spec["groupName"] for spec in raycluster["spec"].get("workerGroupSpecs", [])
227
+ ]
228
+ return group_names.index(group_name)
229
+
230
+
231
+ def _worker_group_max_replicas(
232
+ raycluster: Dict[str, Any], group_index: int
233
+ ) -> Optional[int]:
234
+ """Extract the maxReplicas of a worker group.
235
+
236
+ If maxReplicas is unset, return None, to be interpreted as "no constraint".
237
+ At time of writing, it should be impossible for maxReplicas to be unset, but it's
238
+ better to handle this anyway.
239
+ """
240
+ return raycluster["spec"]["workerGroupSpecs"][group_index].get("maxReplicas")
241
+
242
+
243
+ def _worker_group_replicas(raycluster: Dict[str, Any], group_index: int):
244
+ # 1 is the default replicas value used by the KubeRay operator
245
+ return raycluster["spec"]["workerGroupSpecs"][group_index].get("replicas", 1)
246
+
247
+
248
class IKubernetesHttpApiClient(ABC):
    """Interface for a Kubernetes HTTP API client.

    Exists so that the Kubernetes API client can be mocked in tests.
    """

    @abstractmethod
    def get(self, path: str) -> Dict[str, Any]:
        """Perform a REST GET of a resource with proper headers."""

    @abstractmethod
    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Perform a REST PATCH of a resource with proper headers."""
264
+
265
+
266
class KubernetesHttpApiClient(IKubernetesHttpApiClient):
    """HTTP client for the Kubernetes API server.

    Reads the in-pod service-account token and CA cert via
    load_k8s_secrets(), refreshing them at least every
    TOKEN_REFRESH_PERIOD so that rotated tokens are picked up.
    """

    def __init__(self, namespace: str, kuberay_crd_version: str = KUBERAY_CRD_VER):
        """
        Args:
            namespace: The K8s namespace of the resources to operate on.
            kuberay_crd_version: The API version of the KubeRay CRD.
        """
        self._kuberay_crd_version = kuberay_crd_version
        self._namespace = namespace
        self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        # Loaded lazily on first use (see _get_refreshed_headers_and_verify).
        self._headers, self._verify = None, None

    def _get_refreshed_headers_and_verify(self):
        """Return (headers, verify), re-reading the token from disk when the
        cached copy is missing or older than TOKEN_REFRESH_PERIOD."""
        if (datetime.datetime.now() >= self._token_expires_at) or (
            self._headers is None or self._verify is None
        ):
            logger.info("Refreshing K8s API client token and certs.")
            self._headers, self._verify = load_k8s_secrets()
            self._token_expires_at = datetime.datetime.now() + TOKEN_REFRESH_PERIOD
        return self._headers, self._verify

    def get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers.

        Args:
            path: The part of the resource path that starts with the resource type.

        Returns:
            The JSON response of the GET request.

        Raises:
            HTTPError: If the GET request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )

        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.get(
            url,
            headers=headers,
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if not result.status_code == 200:
            result.raise_for_status()
        return result.json()

    def patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers.

        Args:
            path: The part of the resource path that starts with the resource type.
            payload: The JSON patch payload.

        Returns:
            The JSON response of the PATCH request.

        Raises:
            HTTPError: If the PATCH request fails.
        """
        url = url_from_resource(
            namespace=self._namespace,
            path=path,
            kuberay_crd_version=self._kuberay_crd_version,
        )
        headers, verify = self._get_refreshed_headers_and_verify()
        result = requests.patch(
            url,
            json.dumps(payload),
            # application/json-patch+json marks the body as an RFC 6902 patch.
            headers={**headers, "Content-type": "application/json-patch+json"},
            # Fix: previously this call had no timeout (unlike get()), so a
            # stalled API server could hang the autoscaler indefinitely.
            timeout=KUBERAY_REQUEST_TIMEOUT_S,
            verify=verify,
        )
        if not result.status_code == 200:
            result.raise_for_status()
        return result.json()
341
+
342
+
343
class KubeRayNodeProvider(BatchingNodeProvider):  # type: ignore
    """Node provider backed by the KubeRay operator.

    Scale requests batched by BatchingNodeProvider are translated into JSON
    patches against the RayCluster custom resource; the KubeRay operator then
    reconciles the cluster to the patched goal state. See the module-level
    design notes for the full protocol.
    """

    def __init__(
        self,
        provider_config: Dict[str, Any],
        cluster_name: str,
    ):
        logger.info("Creating KubeRayNodeProvider.")
        self.namespace = provider_config["namespace"]
        self.cluster_name = cluster_name

        self.k8s_api_client = KubernetesHttpApiClient(self.namespace)

        # Pod liveness is the operator's job here, so the generic worker
        # liveness check must be explicitly disabled in the provider config.
        assert (
            provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
        ), f"To use KubeRayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
        BatchingNodeProvider.__init__(self, provider_config, cluster_name)

    def get_node_data(self) -> Dict[NodeID, NodeData]:
        """Queries K8s for pods in the RayCluster. Converts that pod data into a
        map of pod name to Ray NodeData, as required by BatchingNodeProvider.
        """
        # Store the raycluster CR
        self._raycluster = self._get(f"rayclusters/{self.cluster_name}")

        # Get the pods resource version.
        # Specifying a resource version in list requests is important for scalability:
        # https://kubernetes.io/docs/reference/using-api/api-concepts/#semantics-for-get-and-list
        resource_version = self._get_pods_resource_version()
        if resource_version:
            logger.info(
                f"Listing pods for RayCluster {self.cluster_name}"
                f" in namespace {self.namespace}"
                f" at pods resource version >= {resource_version}."
            )

        # Filter pods by cluster_name.
        label_selector = requests.utils.quote(f"ray.io/cluster={self.cluster_name}")

        resource_path = f"pods?labelSelector={label_selector}"
        if resource_version:
            resource_path += (
                f"&resourceVersion={resource_version}"
                + "&resourceVersionMatch=NotOlderThan"
            )

        pod_list = self._get(resource_path)
        fetched_resource_version = pod_list["metadata"]["resourceVersion"]
        logger.info(
            f"Fetched pod data at resource version" f" {fetched_resource_version}."
        )

        # Extract node data from the pod list.
        node_data_dict = {}
        for pod in pod_list["items"]:
            # Kubernetes sets metadata.deletionTimestamp immediately after admitting a
            # request to delete an object. Full removal of the object may take some time
            # after the deletion timestamp is set. See link for details:
            # https://kubernetes.io/docs/reference/using-api/api-concepts/#resource-deletion
            if "deletionTimestamp" in pod["metadata"]:
                # Ignore pods marked for termination.
                continue
            pod_name = pod["metadata"]["name"]
            node_data_dict[pod_name] = node_data_from_pod(pod)
        return node_data_dict

    def submit_scale_request(self, scale_request: ScaleRequest):
        """Converts the scale request generated by BatchingNodeProvider into
        a patch that modifies the RayCluster CR's replicas and/or workersToDelete
        fields. Then submits the patch to the K8s API server.
        """
        # Transform the scale request into a patch payload.
        patch_payload = self._scale_request_to_patch_payload(
            scale_request, self._raycluster
        )

        # Submit the patch to K8s.
        logger.info(
            "Autoscaler is submitting the following patch to RayCluster "
            f"{self.cluster_name} in namespace {self.namespace}."
        )
        logger.info(patch_payload)
        self._submit_raycluster_patch(patch_payload)

    def safe_to_scale(self) -> bool:
        """Returns False iff non_terminated_nodes contains any pods in the RayCluster's
        workersToDelete lists.

        Explanation:
        If there are any workersToDelete which are non-terminated,
        we should wait for the operator to do its job and delete those
        pods. Therefore, we back off the autoscaler update.

        If, on the other hand, all of the workersToDelete have already been cleaned up,
        then we patch away the workersToDelete lists and return True.
        In the future, we may consider having the operator clean up workersToDelete
        on it own:
        https://github.com/ray-project/kuberay/issues/733

        Note (Dmitri):
        It is stylistically bad that this function has a side effect.
        """
        # Get the list of nodes.
        node_set = set(self.node_data_dict.keys())
        worker_groups = self._raycluster["spec"].get("workerGroupSpecs", [])

        # Accumulates the indices of worker groups with non-empty workersToDelete
        non_empty_worker_group_indices = []

        for group_index, worker_group in enumerate(worker_groups):
            workersToDelete = worker_group.get("scaleStrategy", {}).get(
                "workersToDelete", []
            )
            if workersToDelete:
                non_empty_worker_group_indices.append(group_index)
            for worker in workersToDelete:
                if worker in node_set:
                    # The operator hasn't removed this worker yet. Abort
                    # the autoscaler update.
                    logger.warning(f"Waiting for operator to remove worker {worker}.")
                    return False

        # All required workersToDelete have been removed.
        # Clean up the workersToDelete field.
        patch_payload = []
        for group_index in non_empty_worker_group_indices:
            patch = worker_delete_patch(group_index, workers_to_delete=[])
            patch_payload.append(patch)
        if patch_payload:
            logger.info("Cleaning up workers to delete.")
            logger.info(f"Submitting patch {patch_payload}.")
            self._submit_raycluster_patch(patch_payload)

        # It's safe to proceed with the autoscaler update.
        return True

    def _get_pods_resource_version(self) -> str:
        """
        Extract a recent pods resource version by reading the head pod's
        metadata.resourceVersion of the response.

        Returns None when RAY_HEAD_POD_NAME is not set in the environment.
        """
        if not RAY_HEAD_POD_NAME:
            return None
        pod_resp = self._get(f"pods/{RAY_HEAD_POD_NAME}")
        return pod_resp["metadata"]["resourceVersion"]

    def _scale_request_to_patch_payload(
        self, scale_request: ScaleRequest, raycluster: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """Converts autoscaler scale request into a RayCluster CR patch payload."""
        patch_payload = []
        # Collect patches for replica counts.
        for node_type, target_replicas in scale_request.desired_num_workers.items():
            group_index = _worker_group_index(raycluster, node_type)
            group_max_replicas = _worker_group_max_replicas(raycluster, group_index)
            # Cap the replica count to maxReplicas.
            if group_max_replicas is not None and group_max_replicas < target_replicas:
                logger.warning(
                    "Autoscaler attempted to create "
                    + "more than maxReplicas pods of type {}.".format(node_type)
                )
                target_replicas = group_max_replicas
            # Check if we need to change the target count.
            if target_replicas == _worker_group_replicas(raycluster, group_index):
                # No patch required.
                continue
            # Need to patch replica count. Format the patch and add it to the payload.
            patch = worker_replica_patch(group_index, target_replicas)
            patch_payload.append(patch)

        # Maps node_type to nodes to delete for that group.
        deletion_groups = defaultdict(list)
        for worker in scale_request.workers_to_delete:
            node_type = self.node_tags(worker)[TAG_RAY_USER_NODE_TYPE]
            deletion_groups[node_type].append(worker)

        for node_type, workers_to_delete in deletion_groups.items():
            group_index = _worker_group_index(raycluster, node_type)
            patch = worker_delete_patch(group_index, workers_to_delete)
            patch_payload.append(patch)

        return patch_payload

    def _submit_raycluster_patch(self, patch_payload: List[Dict[str, Any]]):
        """Submits a patch to modify a RayCluster CR."""
        path = "rayclusters/{}".format(self.cluster_name)
        self._patch(path, patch_payload)

    def _get(self, path: str) -> Dict[str, Any]:
        """Wrapper for REST GET of resource with proper headers."""
        return self.k8s_api_client.get(path)

    def _patch(self, path: str, payload: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrapper for REST PATCH of resource with proper headers."""
        return self.k8s_api_client.patch(path, payload)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/run_autoscaler.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import subprocess
4
+ import time
5
+
6
+ import ray
7
+ from ray._private import ray_constants
8
+ from ray._private.ray_logging import setup_component_logger
9
+ from ray._private.services import get_node_ip_address
10
+ from ray._private.utils import try_to_create_directory
11
+ from ray._raylet import GcsClient
12
+ from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
13
+ from ray.autoscaler._private.monitor import Monitor
14
+ from ray.autoscaler.v2.instance_manager.config import KubeRayConfigReader
15
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ BACKOFF_S = 5
20
+
21
+
22
def _get_log_dir() -> str:
    """Return the autoscaler log directory, <ray-temp-dir>/session_latest/logs."""
    temp_dir = ray._private.utils.get_ray_temp_dir()
    session = ray._private.ray_constants.SESSION_LATEST
    return os.path.join(temp_dir, session, "logs")
28
+
29
+
30
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"

    # Block until the GCS on the head node answers health checks.
    while True:
        health_check_cmd = [
            "ray",
            "health-check",
            "--address",
            ray_address,
            # Autoscaler Ray version might not exactly match GCS version, so
            # skip the version check when checking GCS status.
            "--skip-version-check",
        ]
        try:
            subprocess.check_call(health_check_cmd)
        except subprocess.CalledProcessError:
            logger.warning(
                f"The Ray head is not ready. Will check again in {BACKOFF_S} seconds."
            )
            time.sleep(BACKOFF_S)
            continue
        logger.info("The Ray head is ready. Starting the autoscaler.")
        break

    # The Ray head container sets up the log directory. Thus, we set up logging
    # only after the Ray head is ready.
    _setup_logging()

    # Reads the RayCluster CR from K8s and derives an autoscaling config from it.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    gcs_client = GcsClient(ray_address)
    if is_autoscaler_v2(fetch_from_server=True, gcs_client=gcs_client):
        from ray.autoscaler.v2.monitor import AutoscalerMonitor as MonitorV2

        monitor = MonitorV2(
            address=gcs_client.address,
            config_reader=KubeRayConfigReader(autoscaling_config_producer),
            log_dir=_get_log_dir(),
            monitor_ip=head_ip,
        )
    else:
        monitor = Monitor(
            address=gcs_client.address,
            # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
            # In this case, it's a callable.
            autoscaling_config=autoscaling_config_producer,
            monitor_ip=head_ip,
            # Let the autoscaler process exit after it hits 5 exceptions.
            # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
            # Kubernetes will then restart the autoscaler container.
            retry_on_failure=False,
        )
    monitor.run()
87
+
88
+
89
def _setup_logging() -> None:
    """Log to autoscaler log file
    (typically, /tmp/ray/session_latest/logs/monitor.*)

    Also log to pod stdout (logs viewable with `kubectl logs <head-pod> -c autoscaler`).
    """
    log_dir = _get_log_dir()
    # The directory should already exist, but try (safely) to create it just in case.
    try_to_create_directory(log_dir)

    # Write logs at info level to monitor.log.
    setup_component_logger(
        logging_level=ray_constants.LOGGER_LEVEL,
        logging_format=ray_constants.LOGGER_FORMAT,
        log_dir=log_dir,
        filename=ray_constants.MONITOR_LOG_FILE_NAME,  # monitor.log
        max_bytes=ray_constants.LOGGING_ROTATE_BYTES,
        backup_count=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
    )

    # For the autoscaler, the root logger _also_ needs to write to stderr, not just
    # ray_constants.MONITOR_LOG_FILE_NAME.
    # NOTE(review): logging._StderrHandler is a private stdlib class; unlike
    # StreamHandler it resolves sys.stderr at emit time rather than at creation.
    level = logging.getLevelName(ray_constants.LOGGER_LEVEL.upper())
    stderr_handler = logging._StderrHandler()
    stderr_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))
    stderr_handler.setLevel(level)
    logging.root.setLevel(level)
    logging.root.addHandler(stderr_handler)

    # The stdout handler was set up in the Ray CLI entry point.
    # See ray.scripts.scripts::cli().
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/kuberay/utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Source:
# https://github.com/kubernetes-client/python/blob/master/kubernetes/utils/quantity.py
from decimal import Decimal, InvalidOperation
from functools import reduce
from typing import Optional

# Maps a GKE TPU accelerator name (the value of the
# cloud.google.com/gke-tpu-accelerator nodeSelector) to its TPU generation.
# Used to get the generation for the TPU-{accelerator}-head resource.
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
gke_tpu_accelerator_to_generation = {
    "tpu-v4-podslice": "v4",
    "tpu-v5-lite-device": "v5e",
    "tpu-v5-lite-podslice": "v5e",
    "tpu-v5p-slice": "v5p",
    "tpu-v6e-slice": "v6e",
}
16
+
17
+
18
def parse_quantity(quantity):
    """
    Parse kubernetes canonical form quantity like 200Mi to a decimal number.
    Supported SI suffixes:
        base1024: Ki | Mi | Gi | Ti | Pi | Ei
        base1000: n | u | m | "" | k | M | G | T | P | E

    See
    https://github.com/kubernetes/apimachinery/blob/master/pkg/api/resource/quantity.go

    Input:
        quantity: string. kubernetes canonical form quantity

    Returns:
        Decimal

    Raises:
        ValueError on invalid or unknown input
    """
    # Numeric inputs need no suffix handling.
    if isinstance(quantity, (int, float, Decimal)):
        return Decimal(quantity)

    # Decimal exponent carried by each suffix letter (power of 1000 or 1024).
    exponents = {
        "n": -3,
        "u": -2,
        "m": -1,
        "K": 1,
        "k": 1,
        "M": 2,
        "G": 3,
        "T": 4,
        "P": 5,
        "E": 6,
    }

    text = str(quantity)
    numeric_part, si_suffix = text, None
    # Two-character binary suffix, e.g. "Mi"; the first char must be a known
    # exponent letter, otherwise the whole string is treated as the number.
    if text.endswith("i") and len(text) >= 2 and text[-2] in exponents:
        numeric_part, si_suffix = text[:-2], text[-2:]
    # One-character decimal suffix, e.g. "k".
    elif text and text[-1] in exponents:
        numeric_part, si_suffix = text[:-1], text[-1:]

    try:
        value = Decimal(numeric_part)
    except InvalidOperation:
        raise ValueError("Invalid number format: {}".format(numeric_part))

    if si_suffix is None:
        return value

    # handle SI inconsistency: lowercase "ki" is not a valid binary suffix.
    if si_suffix == "ki":
        raise ValueError("{} has unknown suffix".format(text))

    scale_base = 1024 if si_suffix.endswith("i") else 1000
    return value * (scale_base ** Decimal(exponents[si_suffix[0]]))
88
+
89
+
90
def tpu_node_selectors_to_type(topology: str, accelerator: str) -> Optional[str]:
    """Convert Kubernetes gke-tpu nodeSelectors to TPU accelerator_type
    for a kuberay TPU worker group.

    Args:
        topology: value of the cloud.google.com/gke-tpu-topology Kubernetes
            nodeSelector, describes the physical topology of the TPU podslice.
        accelerator: value of the cloud.google.com/gke-tpu-accelerator
            nodeSelector, the name of the TPU accelerator, e.g. tpu-v4-podslice

    Returns:
        A string, accelerator_type, e.g. "v4-8", or None if either selector
        is missing/empty.
    """
    if not (topology and accelerator):
        return None

    generation = gke_tpu_accelerator_to_generation[accelerator]
    # A topology such as "2x2x2" describes 2*2*2 = 8 chips in total.
    dimensions = [int(dim) for dim in topology.split("x")]
    num_chips = reduce(lambda acc, dim: acc * dim, dimensions)
    # v4 and v5p count 2 cores per chip here; other generations count 1.
    cores_per_chip = 2 if generation in ("v4", "v5p") else 1
    return f"{generation}-{num_chips * cores_per_chip}"
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (205 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (4.39 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/readonly/node_provider.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ from ray.autoscaler._private.util import format_readonly_node_type
4
+ from ray.autoscaler.node_provider import NodeProvider
5
+ from ray.autoscaler.tags import (
6
+ NODE_KIND_HEAD,
7
+ STATUS_UP_TO_DATE,
8
+ TAG_RAY_NODE_KIND,
9
+ TAG_RAY_NODE_NAME,
10
+ TAG_RAY_NODE_STATUS,
11
+ TAG_RAY_USER_NODE_TYPE,
12
+ )
13
+
14
+
15
class ReadOnlyNodeProvider(NodeProvider):
    """A node provider that merely reports the current cluster state.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users."""

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # Maps node_id -> {"node_type": ..., "ip": ...}.
        self.nodes = {}

    def is_readonly(self):
        return True

    def _set_nodes(self, nodes: List[Tuple[str, str]]):
        """Update the set of nodes in the cluster.

        Args:
            nodes: List of (node_id, node_manager_address) tuples.
        """
        # We make up a fake node type for each node (since each node
        # could have its own unique configuration).
        self.nodes = {
            node_id: {
                # Keep prefix in sync with node config gen in monitor.py
                "node_type": format_readonly_node_type(node_id),
                "ip": address,
            }
            for node_id, address in nodes
        }

    def non_terminated_nodes(self, tag_filters):
        return list(self.nodes)

    def is_running(self, node_id):
        return node_id in self.nodes

    def is_terminated(self, node_id):
        return node_id not in self.nodes

    def node_tags(self, node_id):
        # Every node is reported as an up-to-date head node of its own type.
        return {
            TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
            TAG_RAY_USER_NODE_TYPE: self.nodes[node_id]["node_type"],
            TAG_RAY_NODE_NAME: node_id,
            TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
        }

    def external_ip(self, node_id):
        # Node ids double as addresses in read-only mode.
        return node_id

    def internal_ip(self, node_id):
        return node_id

    def set_node_tags(self, node_id, tags):
        raise AssertionError("Readonly node provider cannot be updated")

    def create_node(self, node_config, tags, count):
        raise AssertionError("Readonly node provider cannot be updated")

    def terminate_node(self, node_id):
        raise AssertionError("Readonly node provider cannot be updated")

    @staticmethod
    def bootstrap_config(cluster_config):
        return cluster_config
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (191 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/prometheus.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prometheus config file
2
+
3
+ # my global config
4
+ global:
5
+ scrape_interval: 10s
6
+ evaluation_interval: 10s
7
+ scrape_timeout: 10s
8
+
9
+ # use ray file-based service discovery file as scrape target.
10
+ scrape_configs:
11
+ - job_name: 'ray'
12
+ file_sd_configs:
13
+ - files:
14
+ - '/tmp/ray/prom_metrics_service_discovery.json'
15
+ refresh_interval: 1m
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/cloudwatch/ray_prometheus_waiter.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Wait for Ray to write its Prometheus service-discovery file, then restart the
# CloudWatch agent so it picks up the Ray metrics scrape configuration.
# Usage: ray_prometheus_waiter.sh <cluster-name>

MAX_ATTEMPTS=120
DELAY_SECONDS=10
RAY_PROM_METRICS_FILE_PATH="/tmp/ray/prom_metrics_service_discovery.json"
# Quote $1 so a cluster name containing spaces/globs cannot word-split.
CLUSTER_NAME="$1"
while [ "$MAX_ATTEMPTS" -gt 0 ]; do
    if [ -f "$RAY_PROM_METRICS_FILE_PATH" ]; then
        echo "Ray Prometheus metrics service discovery file found at: $RAY_PROM_METRICS_FILE_PATH."
        echo "Restarting cloudwatch agent.This may take a few minutes..."
        sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -m ec2 -a stop
        echo "Cloudwatch agent stopped, starting cloudwatch agent..."
        # Reload the per-cluster agent config stored in SSM Parameter Store.
        sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c "ssm:AmazonCloudWatch-ray_agent_config_$CLUSTER_NAME"
        echo "Cloudwatch agent successfully restarted!"
        exit 0
    else
        echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Will check again in $DELAY_SECONDS seconds..."
        sleep "$DELAY_SECONDS"
        MAX_ATTEMPTS=$((MAX_ATTEMPTS-1))
    fi
done
echo "Ray Prometheus metrics service discovery file not found at: $RAY_PROM_METRICS_FILE_PATH. Ray system metrics will not be available in CloudWatch."
exit 1
.venv/lib/python3.11/site-packages/ray/autoscaler/aws/defaults.yaml ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unique identifier for the head node and workers of this cluster.
2
+ cluster_name: default
3
+
4
+ # The maximum number of workers nodes to launch in addition to the head
5
+ # node.
6
+ max_workers: 2
7
+
8
+ # The autoscaler will scale up the cluster faster with higher upscaling speed.
9
+ # E.g., if the task requires adding more nodes then autoscaler will gradually
10
+ # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
11
+ # This number should be > 0.
12
+ upscaling_speed: 1.0
13
+
14
+ # This executes all commands on all nodes in the docker container,
15
+ # and opens all the necessary ports to support the Ray cluster.
16
+ # Empty object means disabled.
17
+ docker: {}
18
+
19
+ # If a node is idle for this many minutes, it will be removed.
20
+ idle_timeout_minutes: 5
21
+
22
+ # Cloud-provider specific configuration.
23
+ provider:
24
+ type: aws
25
+ region: us-west-2
26
+ # Availability zone(s), comma-separated, that nodes may be launched in.
27
+ # Nodes will be launched in the first listed availability zone and will
28
+ # be tried in the subsequent availability zones if launching fails.
29
+ availability_zone: us-west-2a,us-west-2b
30
+ # Whether to allow node reuse. If set to False, nodes will be terminated
31
+ # instead of stopped.
32
+ cache_stopped_nodes: True # If not present, the default is True.
33
+
34
+ # How Ray will authenticate with newly launched nodes.
35
+ auth:
36
+ ssh_user: ubuntu
37
+ # By default Ray creates a new private keypair, but you can also use your own.
38
+ # If you do so, make sure to also set "KeyName" in the head and worker node
39
+ # configurations below.
40
+ # ssh_private_key: /path/to/your/key.pem
41
+
42
+ # Tell the autoscaler the allowed node types and the resources they provide.
43
+ # The key is the name of the node type, which is just for debugging purposes.
44
+ # The node config specifies the launch config and physical instance type.
45
+ available_node_types:
46
+ ray.head.default:
47
+ # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
48
+ # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
49
+ # You can also set custom resources.
50
+ # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
51
+ # resources: {"CPU": 1, "GPU": 1, "custom": 5}
52
+ resources: {}
53
+ # Provider-specific config for this node type, e.g. instance type. By default
54
+ # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
55
+ # For more documentation on available fields, see:
56
+ # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
57
+ node_config:
58
+ InstanceType: m5.large
59
+ # You can provision additional disk space with a conf as follows
60
+ BlockDeviceMappings:
61
+ - DeviceName: /dev/sda1
62
+ Ebs:
63
+ VolumeSize: 256
64
+ # Additional options in the boto docs.
65
+ ray.worker.default:
66
+ # The minimum number of nodes of this type to launch.
67
+ # This number should be >= 0.
68
+ min_workers: 0
69
+ # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
70
+ # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
71
+ # You can also set custom resources.
72
+ # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
73
+ # resources: {"CPU": 1, "GPU": 1, "custom": 5}
74
+ resources: {}
75
+ # Provider-specific config for this node type, e.g. instance type. By default
76
+ # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
77
+ # For more documentation on available fields, see:
78
+ # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
79
+ node_config:
80
+ InstanceType: m5.large
81
+ # Run workers on spot by default. Comment this out to use on-demand.
82
+ InstanceMarketOptions:
83
+ MarketType: spot
84
+ # Additional options can be found in the boto docs, e.g.
85
+ # SpotOptions:
86
+ # MaxPrice: MAX_HOURLY_PRICE
87
+ # Additional options in the boto docs.
88
+
89
+ # Specify the node type of the head node (as configured above).
90
+ head_node_type: ray.head.default
91
+
92
+ # Files or directories to copy to the head and worker nodes. The format is a
93
+ # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
94
+ file_mounts: {
95
+ # "/path1/on/remote/machine": "/path1/on/local/machine",
96
+ # "/path2/on/remote/machine": "/path2/on/local/machine",
97
+ }
98
+
99
+ # Files or directories to copy from the head node to the worker nodes. The format is a
100
+ # list of paths. The same path on the head node will be copied to the worker node.
101
+ # This behavior is a subset of the file_mounts behavior. In the vast majority of cases
102
+ # you should just use file_mounts. Only use this if you know what you're doing!
103
+ cluster_synced_files: []
104
+
105
+ # Whether changes to directories in file_mounts or cluster_synced_files in the head node
106
+ # should sync to the worker node continuously
107
+ file_mounts_sync_continuously: False
108
+
109
+ # Patterns for files to exclude when running rsync up or rsync down
110
+ rsync_exclude: []
111
+
112
+ # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
113
+ # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
114
+ # as a value, the behavior will match git's behavior for finding and using .gitignore files.
115
+ rsync_filter: []
116
+
117
+ # List of commands that will be run before `setup_commands`. If docker is
118
+ # enabled, these commands will run outside the container and before docker
119
+ # is setup.
120
+ initialization_commands: []
121
+
122
+ # List of shell commands to run to set up nodes.
123
+ setup_commands:
124
+ - >-
125
+ (stat $HOME/anaconda3/envs/tensorflow2_p38/ &> /dev/null &&
126
+ echo 'export PATH="$HOME/anaconda3/envs/tensorflow2_p38/bin:$PATH"' >> ~/.bashrc) || true
127
+ - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
128
+
129
+ # Custom commands that will be run on the head node after common setup.
130
+ head_setup_commands:
131
+ - pip install 'boto3>=1.4.8' # 1.4.8 adds InstanceMarketOptions
132
+
133
+ # Custom commands that will be run on worker nodes after common setup.
134
+ worker_setup_commands: []
135
+
136
+ # Command to start ray on the head node. You don't need to change this.
137
+ head_start_ray_commands:
138
+ - ray stop
139
+ - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
140
+
141
+ # Command to start ray on worker nodes. You don't need to change this.
142
+ worker_start_ray_commands:
143
+ - ray stop
144
+ - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/azure/defaults.yaml ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unique identifier for the head node and workers of this cluster.
2
+ cluster_name: default
3
+
4
+ # The maximum number of workers nodes to launch in addition to the head
5
+ # node.
6
+ max_workers: 2
7
+
8
+ # The autoscaler will scale up the cluster faster with higher upscaling speed.
9
+ # E.g., if the task requires adding more nodes then autoscaler will gradually
10
+ # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
11
+ # This number should be > 0.
12
+ upscaling_speed: 1.0
13
+
14
+ # This executes all commands on all nodes in the docker container,
15
+ # and opens all the necessary ports to support the Ray cluster.
16
+ # Empty object means disabled.
17
+ docker: {}
18
+
19
+ # If a node is idle for this many minutes, it will be removed.
20
+ idle_timeout_minutes: 5
21
+
22
+ # Cloud-provider specific configuration.
23
+ provider:
24
+ type: azure
25
+ # https://azure.microsoft.com/en-us/global-infrastructure/locations
26
+ location: westus2
27
+ resource_group: ray-cluster
28
+ # set subscription id otherwise the default from az cli will be used
29
+ # subscription_id: 00000000-0000-0000-0000-000000000000
30
+ # set unique subnet mask or a random mask will be used
31
+ # subnet_mask: 10.0.0.0/16
32
+ # set unique id for resources in this cluster
33
+ # if not set a default id will be generated based on the resource group and cluster name
34
+ # unique_id: RAY1
35
+
36
+ # How Ray will authenticate with newly launched nodes.
37
+ auth:
38
+ ssh_user: ubuntu
39
+ # you must specify paths to matching private and public key pair files
40
+ # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
41
+ ssh_private_key: ~/.ssh/id_rsa
42
+ # changes to this should match what is specified in file_mounts
43
+ ssh_public_key: ~/.ssh/id_rsa.pub
44
+
45
+ # More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
46
+ # See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
47
+ # Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
48
+ # on the head node, so changes to the template must be included in the wheel file used in setup_commands section below
49
+
50
+ # Tell the autoscaler the allowed node types and the resources they provide.
51
+ # The key is the name of the node type, which is just for debugging purposes.
52
+ # The node config specifies the launch config and physical instance type.
53
+ available_node_types:
54
+ ray.head.default:
55
+ resources: {"CPU": 2}
56
+ # Provider-specific config, e.g. instance type.
57
+ node_config:
58
+ azure_arm_parameters:
59
+ vmSize: Standard_D2s_v3
60
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
61
+ imagePublisher: microsoft-dsvm
62
+ imageOffer: ubuntu-1804
63
+ imageSku: 1804-gen2
64
+ imageVersion: latest
65
+
66
+ ray.worker.default:
67
+ # The minimum number of nodes of this type to launch.
68
+ # This number should be >= 0.
69
+ min_workers: 0
70
+ # The resources provided by this node type.
71
+ resources: {"CPU": 2}
72
+ # Provider-specific config, e.g. instance type.
73
+ node_config:
74
+ azure_arm_parameters:
75
+ vmSize: Standard_D2s_v3
76
+ # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
77
+ imagePublisher: microsoft-dsvm
78
+ imageOffer: ubuntu-1804
79
+ imageSku: 1804-gen2
80
+ imageVersion: latest
81
+ # comment lines below to not use Spot instances
82
+ priority: Spot
83
+ # set a maximum price for spot instances if desired
84
+ # billingProfile:
85
+ # maxPrice: -1
86
+
87
+ # Specify the node type of the head node (as configured above).
88
+ head_node_type: ray.head.default
89
+
90
+ # Files or directories to copy to the head and worker nodes. The format is a
91
+ # dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
92
+ file_mounts: {
93
+ # "/path1/on/remote/machine": "/path1/on/local/machine",
94
+ # "/path2/on/remote/machine": "/path2/on/local/machine",
95
+ "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"
96
+ }
97
+
98
+ # Files or directories to copy from the head node to the worker nodes. The format is a
99
+ # list of paths. The same path on the head node will be copied to the worker node.
100
+ # This behavior is a subset of the file_mounts behavior. In the vast majority of cases
101
+ # you should just use file_mounts. Only use this if you know what you're doing!
102
+ cluster_synced_files: []
103
+
104
+ # Whether changes to directories in file_mounts or cluster_synced_files in the head node
105
+ # should sync to the worker node continuously
106
+ file_mounts_sync_continuously: False
107
+
108
+ # Patterns for files to exclude when running rsync up or rsync down
109
+ rsync_exclude: []
110
+
111
+ # Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
112
+ # in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
113
+ # as a value, the behavior will match git's behavior for finding and using .gitignore files.
114
+ rsync_filter: []
115
+
116
+ # List of commands that will be run before `setup_commands`. If docker is
117
+ # enabled, these commands will run outside the container and before docker
118
+ # is setup.
119
+ initialization_commands:
120
+ # get rid of annoying Ubuntu message
121
+ - touch ~/.sudo_as_admin_successful
122
+
123
+ # List of shell commands to run to set up nodes.
124
+ setup_commands:
125
+ # Note: if you're developing Ray, you probably want to create an AMI that
126
+ # has your Ray repo pre-cloned. Then, you can replace the pip installs
127
+ # below with a git checkout <your_sha> (and possibly a recompile).
128
+ - (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
129
+ # - (conda activate py38_pytorch &> /dev/null && echo 'conda activate py38_pytorch' >> ~/.bashrc) || true
130
+ - (conda activate py38_tensorflow &> /dev/null && echo 'conda activate py38_tensorflow' >> ~/.bashrc) || true
131
+ - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"
132
+ # Consider uncommenting these if you also want to run apt-get commands during setup
133
+ # - sudo pkill -9 apt-get || true
134
+ # - sudo pkill -9 dpkg || true
135
+ # - sudo dpkg --configure -a
136
+
137
+ # Custom commands that will be run on the head node after common setup.
138
+ head_setup_commands:
139
+ - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4
140
+
141
+ # Custom commands that will be run on worker nodes after common setup.
142
+ worker_setup_commands: []
143
+
144
+ # Command to start ray on the head node. You don't need to change this.
145
+ head_start_ray_commands:
146
+ - ray stop
147
+ - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
148
+
149
+ # Command to start ray on worker nodes. You don't need to change this.
150
+ worker_start_ray_commands:
151
+ - ray stop
152
+ - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray.autoscaler.sdk.sdk import (
2
+ bootstrap_config,
3
+ configure_logging,
4
+ create_or_update_cluster,
5
+ fillout_defaults,
6
+ get_docker_host_mount_location,
7
+ get_head_node_ip,
8
+ get_worker_node_ips,
9
+ register_callback_handler,
10
+ request_resources,
11
+ rsync,
12
+ run_on_cluster,
13
+ teardown_cluster,
14
+ )
15
+
16
+ __all__ = [
17
+ "create_or_update_cluster",
18
+ "teardown_cluster",
19
+ "run_on_cluster",
20
+ "rsync",
21
+ "get_head_node_ip",
22
+ "get_worker_node_ips",
23
+ "request_resources",
24
+ "configure_logging",
25
+ "bootstrap_config",
26
+ "fillout_defaults",
27
+ "register_callback_handler",
28
+ "get_docker_host_mount_location",
29
+ ]
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (804 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/__pycache__/sdk.cpython-311.pyc ADDED
Binary file (15.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/sdk/sdk.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """IMPORTANT: this is an experimental interface and not currently stable."""
2
+
3
+ import json
4
+ import os
5
+ import tempfile
6
+ from contextlib import contextmanager
7
+ from typing import Any, Callable, Dict, Iterator, List, Optional, Union
8
+
9
+ from ray.autoscaler._private import commands
10
+ from ray.autoscaler._private.cli_logger import cli_logger
11
+ from ray.autoscaler._private.event_system import CreateClusterEvent # noqa: F401
12
+ from ray.autoscaler._private.event_system import global_event_system # noqa: F401
13
+ from ray.util.annotations import DeveloperAPI
14
+
15
+
16
+ @DeveloperAPI
17
+ def create_or_update_cluster(
18
+ cluster_config: Union[dict, str],
19
+ *,
20
+ no_restart: bool = False,
21
+ restart_only: bool = False,
22
+ no_config_cache: bool = False
23
+ ) -> Dict[str, Any]:
24
+ """Create or updates an autoscaling Ray cluster from a config json.
25
+
26
+ Args:
27
+ cluster_config (Union[str, dict]): Either the config dict of the
28
+ cluster, or a path pointing to a file containing the config.
29
+ no_restart: Whether to skip restarting Ray services during the
30
+ update. This avoids interrupting running jobs and can be used to
31
+ dynamically adjust autoscaler configuration.
32
+ restart_only: Whether to skip running setup commands and only
33
+ restart Ray. This cannot be used with 'no-restart'.
34
+ no_config_cache: Whether to disable the config cache and fully
35
+ resolve all environment settings from the Cloud provider again.
36
+ """
37
+ with _as_config_file(cluster_config) as config_file:
38
+ return commands.create_or_update_cluster(
39
+ config_file=config_file,
40
+ override_min_workers=None,
41
+ override_max_workers=None,
42
+ no_restart=no_restart,
43
+ restart_only=restart_only,
44
+ yes=True,
45
+ override_cluster_name=None,
46
+ no_config_cache=no_config_cache,
47
+ redirect_command_output=None,
48
+ use_login_shells=True,
49
+ )
50
+
51
+
52
+ @DeveloperAPI
53
+ def teardown_cluster(
54
+ cluster_config: Union[dict, str],
55
+ workers_only: bool = False,
56
+ keep_min_workers: bool = False,
57
+ ) -> None:
58
+ """Destroys all nodes of a Ray cluster described by a config json.
59
+
60
+ Args:
61
+ cluster_config (Union[str, dict]): Either the config dict of the
62
+ cluster, or a path pointing to a file containing the config.
63
+ workers_only: Whether to keep the head node running and only
64
+ teardown worker nodes.
65
+ keep_min_workers: Whether to keep min_workers (as specified
66
+ in the YAML) still running.
67
+ """
68
+ with _as_config_file(cluster_config) as config_file:
69
+ return commands.teardown_cluster(
70
+ config_file=config_file,
71
+ yes=True,
72
+ workers_only=workers_only,
73
+ override_cluster_name=None,
74
+ keep_min_workers=keep_min_workers,
75
+ )
76
+
77
+
78
+ @DeveloperAPI
79
+ def run_on_cluster(
80
+ cluster_config: Union[dict, str],
81
+ *,
82
+ cmd: Optional[str] = None,
83
+ run_env: str = "auto",
84
+ tmux: bool = False,
85
+ stop: bool = False,
86
+ no_config_cache: bool = False,
87
+ port_forward: Optional[commands.Port_forward] = None,
88
+ with_output: bool = False
89
+ ) -> Optional[str]:
90
+ """Runs a command on the specified cluster.
91
+
92
+ Args:
93
+ cluster_config (Union[str, dict]): Either the config dict of the
94
+ cluster, or a path pointing to a file containing the config.
95
+ cmd: the command to run, or None for a no-op command.
96
+ run_env: whether to run the command on the host or in a
97
+ container. Select between "auto", "host" and "docker".
98
+ tmux: whether to run in a tmux session
99
+ stop: whether to stop the cluster after command run
100
+ no_config_cache: Whether to disable the config cache and fully
101
+ resolve all environment settings from the Cloud provider again.
102
+ port_forward ( (int,int) or list[(int,int)]): port(s) to forward.
103
+ with_output: Whether to capture command output.
104
+
105
+ Returns:
106
+ The output of the command as a string.
107
+ """
108
+ with _as_config_file(cluster_config) as config_file:
109
+ return commands.exec_cluster(
110
+ config_file,
111
+ cmd=cmd,
112
+ run_env=run_env,
113
+ screen=False,
114
+ tmux=tmux,
115
+ stop=stop,
116
+ start=False,
117
+ override_cluster_name=None,
118
+ no_config_cache=no_config_cache,
119
+ port_forward=port_forward,
120
+ with_output=with_output,
121
+ )
122
+
123
+
124
+ @DeveloperAPI
125
+ def rsync(
126
+ cluster_config: Union[dict, str],
127
+ *,
128
+ source: Optional[str],
129
+ target: Optional[str],
130
+ down: bool,
131
+ ip_address: Optional[str] = None,
132
+ use_internal_ip: bool = False,
133
+ no_config_cache: bool = False,
134
+ should_bootstrap: bool = True
135
+ ):
136
+ """Rsyncs files to or from the cluster.
137
+
138
+ Args:
139
+ cluster_config (Union[str, dict]): Either the config dict of the
140
+ cluster, or a path pointing to a file containing the config.
141
+ source: rsync source argument.
142
+ target: rsync target argument.
143
+ down: whether we're syncing remote -> local.
144
+ ip_address: Address of node.
145
+ use_internal_ip: Whether the provided ip_address is
146
+ public or private.
147
+ no_config_cache: Whether to disable the config cache and fully
148
+ resolve all environment settings from the Cloud provider again.
149
+ should_bootstrap: whether to bootstrap cluster config before syncing
150
+
151
+ Raises:
152
+ RuntimeError if the cluster head node is not found.
153
+ """
154
+ with _as_config_file(cluster_config) as config_file:
155
+ return commands.rsync(
156
+ config_file=config_file,
157
+ source=source,
158
+ target=target,
159
+ override_cluster_name=None,
160
+ down=down,
161
+ ip_address=ip_address,
162
+ use_internal_ip=use_internal_ip,
163
+ no_config_cache=no_config_cache,
164
+ all_nodes=False,
165
+ should_bootstrap=should_bootstrap,
166
+ )
167
+
168
+
169
+ @DeveloperAPI
170
+ def get_head_node_ip(cluster_config: Union[dict, str]) -> str:
171
+ """Returns head node IP for given configuration file if exists.
172
+
173
+ Args:
174
+ cluster_config (Union[str, dict]): Either the config dict of the
175
+ cluster, or a path pointing to a file containing the config.
176
+
177
+ Returns:
178
+ The ip address of the cluster head node.
179
+
180
+ Raises:
181
+ RuntimeError if the cluster is not found.
182
+ """
183
+ with _as_config_file(cluster_config) as config_file:
184
+ return commands.get_head_node_ip(config_file)
185
+
186
+
187
+ @DeveloperAPI
188
+ def get_worker_node_ips(cluster_config: Union[dict, str]) -> List[str]:
189
+ """Returns worker node IPs for given configuration file.
190
+
191
+ Args:
192
+ cluster_config (Union[str, dict]): Either the config dict of the
193
+ cluster, or a path pointing to a file containing the config.
194
+
195
+ Returns:
196
+ List of worker node ip addresses.
197
+
198
+ Raises:
199
+ RuntimeError if the cluster is not found.
200
+ """
201
+ with _as_config_file(cluster_config) as config_file:
202
+ return commands.get_worker_node_ips(config_file)
203
+
204
+
205
+ @DeveloperAPI
206
+ def request_resources(
207
+ num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
208
+ ) -> None:
209
+ """Command the autoscaler to scale to accommodate the specified requests.
210
+
211
+ The cluster will immediately attempt to scale to accommodate the requested
212
+ resources, bypassing normal upscaling speed constraints. This takes into
213
+ account existing resource usage.
214
+
215
+ For example, suppose you call ``request_resources(num_cpus=100)`` and
216
+ there are 45 currently running tasks, each requiring 1 CPU. Then, enough
217
+ nodes will be added so up to 100 tasks can run concurrently. It does
218
+ **not** add enough nodes so that 145 tasks can run.
219
+
220
+ This call is only a hint to the autoscaler. The actual resulting cluster
221
+ size may be slightly larger or smaller than expected depending on the
222
+ internal bin packing algorithm and max worker count restrictions.
223
+
224
+ Args:
225
+ num_cpus: Scale the cluster to ensure this number of CPUs are
226
+ available. This request is persistent until another call to
227
+ request_resources() is made to override.
228
+ bundles (List[ResourceDict]): Scale the cluster to ensure this set of
229
+ resource shapes can fit. This request is persistent until another
230
+ call to request_resources() is made to override.
231
+
232
+ Examples:
233
+ >>> from ray.autoscaler.sdk import request_resources
234
+ >>> # Request 1000 CPUs.
235
+ >>> request_resources(num_cpus=1000) # doctest: +SKIP
236
+ >>> # Request 64 CPUs and also fit a 1-GPU/4-CPU task.
237
+ >>> request_resources( # doctest: +SKIP
238
+ ... num_cpus=64, bundles=[{"GPU": 1, "CPU": 4}])
239
+ >>> # Same as requesting num_cpus=3.
240
+ >>> request_resources( # doctest: +SKIP
241
+ ... bundles=[{"CPU": 1}, {"CPU": 1}, {"CPU": 1}])
242
+ """
243
+ if num_cpus is not None and not isinstance(num_cpus, int):
244
+ raise TypeError("num_cpus should be of type int.")
245
+ if bundles is not None:
246
+ if isinstance(bundles, List):
247
+ for bundle in bundles:
248
+ if isinstance(bundle, Dict):
249
+ for key in bundle.keys():
250
+ if not (isinstance(key, str) and isinstance(bundle[key], int)):
251
+ raise TypeError(
252
+ "each bundle key should be str and value as int."
253
+ )
254
+ else:
255
+ raise TypeError("each bundle should be a Dict.")
256
+ else:
257
+ raise TypeError("bundles should be of type List")
258
+
259
+ return commands.request_resources(num_cpus, bundles)
260
+
261
+
262
+ @DeveloperAPI
263
+ def configure_logging(
264
+ log_style: Optional[str] = None,
265
+ color_mode: Optional[str] = None,
266
+ verbosity: Optional[int] = None,
267
+ ):
268
+ """Configures logging for cluster command calls.
269
+
270
+ Args:
271
+ log_style: If 'pretty', outputs with formatting and color.
272
+ If 'record', outputs record-style without formatting.
273
+ 'auto' defaults to 'pretty', and disables pretty logging
274
+ if stdin is *not* a TTY. Defaults to "auto".
275
+ color_mode (str):
276
+ Can be "true", "false", or "auto".
277
+
278
+ Enables or disables `colorful`.
279
+
280
+ If `color_mode` is "auto", is set to `not stdout.isatty()`
281
+ vebosity (int):
282
+ Output verbosity (0, 1, 2, 3).
283
+
284
+ Low verbosity will disable `verbose` and `very_verbose` messages.
285
+
286
+ """
287
+ cli_logger.configure(
288
+ log_style=log_style, color_mode=color_mode, verbosity=verbosity
289
+ )
290
+
291
+
292
+ @contextmanager
293
+ @DeveloperAPI
294
+ def _as_config_file(cluster_config: Union[dict, str]) -> Iterator[str]:
295
+ if isinstance(cluster_config, dict):
296
+ tmp = tempfile.NamedTemporaryFile("w", prefix="autoscaler-sdk-tmp-")
297
+ tmp.write(json.dumps(cluster_config))
298
+ tmp.flush()
299
+ cluster_config = tmp.name
300
+ if not os.path.exists(cluster_config):
301
+ raise ValueError("Cluster config not found {}".format(cluster_config))
302
+ yield cluster_config
303
+
304
+
305
+ @DeveloperAPI
306
+ def bootstrap_config(
307
+ cluster_config: Dict[str, Any], no_config_cache: bool = False
308
+ ) -> Dict[str, Any]:
309
+ """Validate and add provider-specific fields to the config. For example,
310
+ IAM/authentication may be added here."""
311
+ return commands._bootstrap_config(cluster_config, no_config_cache)
312
+
313
+
314
+ @DeveloperAPI
315
+ def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
316
+ """Fillout default values for a cluster_config based on the provider."""
317
+ from ray.autoscaler._private.util import fillout_defaults
318
+
319
+ return fillout_defaults(config)
320
+
321
+
322
+ @DeveloperAPI
323
+ def register_callback_handler(
324
+ event_name: str,
325
+ callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
326
+ ) -> None:
327
+ """Registers a callback handler for autoscaler events.
328
+
329
+ Args:
330
+ event_name: Event that callback should be called on. See
331
+ CreateClusterEvent for details on the events available to be
332
+ registered against.
333
+ callback: Callable object that is invoked
334
+ when specified event occurs.
335
+ """
336
+ global_event_system.add_callback_handler(event_name, callback)
337
+
338
+
339
+ @DeveloperAPI
340
+ def get_docker_host_mount_location(cluster_name: str) -> str:
341
+ """Return host path that Docker mounts attach to."""
342
+ docker_mount_prefix = "/tmp/ray_tmp_mount/{cluster_name}"
343
+ return docker_mount_prefix.format(cluster_name=cluster_name)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/autoscaler.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from queue import Queue
3
+ from typing import List, Optional
4
+
5
+ from ray._raylet import GcsClient
6
+ from ray.autoscaler._private.providers import _get_node_provider
7
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
8
+ from ray.autoscaler.v2.instance_manager.cloud_providers.kuberay.cloud_provider import (
9
+ KubeRayProvider,
10
+ )
11
+ from ray.autoscaler.v2.instance_manager.cloud_providers.read_only.cloud_provider import ( # noqa
12
+ ReadOnlyProvider,
13
+ )
14
+ from ray.autoscaler.v2.instance_manager.config import (
15
+ AutoscalingConfig,
16
+ IConfigReader,
17
+ Provider,
18
+ )
19
+ from ray.autoscaler.v2.instance_manager.instance_manager import (
20
+ InstanceManager,
21
+ InstanceUpdatedSubscriber,
22
+ )
23
+ from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
24
+ from ray.autoscaler.v2.instance_manager.node_provider import (
25
+ ICloudInstanceProvider,
26
+ NodeProviderAdapter,
27
+ )
28
+ from ray.autoscaler.v2.instance_manager.reconciler import Reconciler
29
+ from ray.autoscaler.v2.instance_manager.storage import InMemoryStorage
30
+ from ray.autoscaler.v2.instance_manager.subscribers.cloud_instance_updater import (
31
+ CloudInstanceUpdater,
32
+ )
33
+ from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopper
34
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
35
+ from ray.autoscaler.v2.scheduler import ResourceDemandScheduler
36
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
37
+ from ray.core.generated.autoscaler_pb2 import AutoscalingState
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ class Autoscaler:
43
+ def __init__(
44
+ self,
45
+ session_name: str,
46
+ config_reader: IConfigReader,
47
+ gcs_client: GcsClient,
48
+ event_logger: Optional[AutoscalerEventLogger] = None,
49
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
50
+ ) -> None:
51
+ """
52
+ Args:
53
+ session_name: The name of the ray session.
54
+ config_reader: The config reader.
55
+ gcs_client: The GCS client.
56
+ event_logger: The event logger for emitting cluster events.
57
+ metrics_reporter: The metrics reporter for emitting cluster metrics.
58
+ """
59
+
60
+ self._config_reader = config_reader
61
+
62
+ config = config_reader.get_cached_autoscaling_config()
63
+ logger.info(f"Using Autoscaling Config: \n{config.dump()}")
64
+
65
+ self._gcs_client = gcs_client
66
+ self._cloud_instance_provider = None
67
+ self._instance_manager = None
68
+ self._ray_stop_errors_queue = Queue()
69
+ self._ray_install_errors_queue = Queue()
70
+ self._event_logger = event_logger
71
+ self._metrics_reporter = metrics_reporter
72
+
73
+ self._init_cloud_instance_provider(config, config_reader)
74
+ self._init_instance_manager(
75
+ session_name=session_name,
76
+ config=config,
77
+ cloud_provider=self._cloud_instance_provider,
78
+ gcs_client=self._gcs_client,
79
+ )
80
+ self._scheduler = ResourceDemandScheduler(self._event_logger)
81
+
82
+ def _init_cloud_instance_provider(
83
+ self, config: AutoscalingConfig, config_reader: IConfigReader
84
+ ):
85
+ """
86
+ Initialize the cloud provider, and its dependencies (the v1 node provider)
87
+
88
+ Args:
89
+ config: The autoscaling config.
90
+ config_reader: The config reader.
91
+
92
+ """
93
+ provider_config = config.get_provider_config()
94
+ if provider_config["type"] == "kuberay":
95
+ provider_config["head_node_type"] = config.get_head_node_type()
96
+ self._cloud_instance_provider = KubeRayProvider(
97
+ config.get_config("cluster_name"),
98
+ provider_config,
99
+ )
100
+ elif config.provider == Provider.READ_ONLY:
101
+ provider_config["gcs_address"] = self._gcs_client.address
102
+ self._cloud_instance_provider = ReadOnlyProvider(
103
+ provider_config=provider_config,
104
+ )
105
+ else:
106
+ node_provider_v1 = _get_node_provider(
107
+ provider_config,
108
+ config.get_config("cluster_name"),
109
+ )
110
+
111
+ self._cloud_instance_provider = NodeProviderAdapter(
112
+ v1_provider=node_provider_v1,
113
+ config_reader=config_reader,
114
+ )
115
+
116
+ def _init_instance_manager(
117
+ self,
118
+ session_name: str,
119
+ cloud_provider: ICloudInstanceProvider,
120
+ gcs_client: GcsClient,
121
+ config: AutoscalingConfig,
122
+ ):
123
+ """
124
+ Initialize the instance manager, and its dependencies.
125
+ """
126
+
127
+ instance_storage = InstanceStorage(
128
+ cluster_id=session_name,
129
+ storage=InMemoryStorage(),
130
+ )
131
+ subscribers: List[InstanceUpdatedSubscriber] = []
132
+ subscribers.append(CloudInstanceUpdater(cloud_provider=cloud_provider))
133
+ subscribers.append(
134
+ RayStopper(gcs_client=gcs_client, error_queue=self._ray_stop_errors_queue)
135
+ )
136
+ if not config.disable_node_updaters():
137
+ # Supporting ray installer is only needed for providers that doesn't
138
+ # install or manage ray (e.g. AWS, GCP). These providers will be
139
+ # supported in the future.
140
+ raise NotImplementedError(
141
+ "RayInstaller is not supported yet in current "
142
+ "release of the Autoscaler V2. Therefore, providers "
143
+ "that update nodes (with `disable_node_updaters` set to True) "
144
+ "are not supported yet. Only KubeRay is supported for now which sets "
145
+ "disable_node_updaters to True in provider's config."
146
+ )
147
+
148
+ self._instance_manager = InstanceManager(
149
+ instance_storage=instance_storage,
150
+ instance_status_update_subscribers=subscribers,
151
+ )
152
+
153
+ def update_autoscaling_state(
154
+ self,
155
+ ) -> Optional[AutoscalingState]:
156
+ """
157
+ Update the autoscaling state of the cluster by reconciling the current
158
+ state of the cluster resources, the cloud providers as well as instance
159
+ update subscribers with the desired state.
160
+
161
+ Returns:
162
+ AutoscalingState: The new autoscaling state of the cluster or None if
163
+ the state is not updated.
164
+
165
+ Raises:
166
+ No exception.
167
+ """
168
+
169
+ try:
170
+ ray_stop_errors = []
171
+ while not self._ray_stop_errors_queue.empty():
172
+ ray_stop_errors.append(self._ray_stop_errors_queue.get())
173
+
174
+ ray_install_errors = []
175
+ while not self._ray_install_errors_queue.empty():
176
+ ray_install_errors.append(self._ray_install_errors_queue.get())
177
+
178
+ # Get the current state of the ray cluster resources.
179
+ ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)
180
+
181
+ # Refresh the config from the source
182
+ self._config_reader.refresh_cached_autoscaling_config()
183
+ autoscaling_config = self._config_reader.get_cached_autoscaling_config()
184
+
185
+ return Reconciler.reconcile(
186
+ instance_manager=self._instance_manager,
187
+ scheduler=self._scheduler,
188
+ cloud_provider=self._cloud_instance_provider,
189
+ ray_cluster_resource_state=ray_cluster_resource_state,
190
+ non_terminated_cloud_instances=(
191
+ self._cloud_instance_provider.get_non_terminated()
192
+ ),
193
+ cloud_provider_errors=self._cloud_instance_provider.poll_errors(),
194
+ ray_install_errors=ray_install_errors,
195
+ ray_stop_errors=ray_stop_errors,
196
+ autoscaling_config=autoscaling_config,
197
+ metrics_reporter=self._metrics_reporter,
198
+ )
199
+ except Exception as e:
200
+ logger.exception(e)
201
+ return None
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/event_logger.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Optional
4
+
5
+ from ray._private.event.event_logger import EventLoggerAdapter
6
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
7
+ from ray.autoscaler.v2.schema import NodeType
8
+ from ray.autoscaler.v2.utils import ResourceRequestUtil
9
+ from ray.core.generated.autoscaler_pb2 import (
10
+ ClusterResourceConstraint,
11
+ GangResourceRequest,
12
+ ResourceRequest,
13
+ )
14
+ from ray.core.generated.instance_manager_pb2 import LaunchRequest, TerminationRequest
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class AutoscalerEventLogger:
20
+ """
21
+ Logs events related to the autoscaler.
22
+
23
+ # TODO:
24
+ - Add more logging for other events.
25
+ - Rate limit the events if too spammy.
26
+ """
27
+
28
+ def __init__(self, logger: EventLoggerAdapter):
29
+ self._logger = logger
30
+
31
+ def log_cluster_scheduling_update(
32
+ self,
33
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
34
+ cluster_shape: Dict[NodeType, int],
35
+ launch_requests: Optional[List[LaunchRequest]] = None,
36
+ terminate_requests: Optional[List[TerminationRequest]] = None,
37
+ infeasible_requests: Optional[List[ResourceRequest]] = None,
38
+ infeasible_gang_requests: Optional[List[GangResourceRequest]] = None,
39
+ infeasible_cluster_resource_constraints: Optional[
40
+ List[ClusterResourceConstraint]
41
+ ] = None,
42
+ ) -> None:
43
+ """
44
+ Log any update of the cluster scheduling state.
45
+ """
46
+
47
+ # Log any launch events.
48
+ if launch_requests:
49
+ launch_type_count = defaultdict(int)
50
+ for req in launch_requests:
51
+ launch_type_count[req.instance_type] += req.count
52
+
53
+ for idx, (instance_type, count) in enumerate(launch_type_count.items()):
54
+ log_str = f"Adding {count} node(s) of type {instance_type}."
55
+ self._logger.info(f"{log_str}")
56
+ logger.info(f"{log_str}")
57
+
58
+ # Log any terminate events.
59
+ if terminate_requests:
60
+ termination_by_causes_and_type = defaultdict(int)
61
+ for req in terminate_requests:
62
+ termination_by_causes_and_type[(req.cause, req.instance_type)] += 1
63
+
64
+ cause_reason_map = {
65
+ TerminationRequest.Cause.OUTDATED: "outdated",
66
+ TerminationRequest.Cause.MAX_NUM_NODES: "max number of worker nodes reached", # noqa
67
+ TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE: "max number of worker nodes per type reached", # noqa
68
+ TerminationRequest.Cause.IDLE: "idle",
69
+ }
70
+
71
+ for idx, ((cause, instance_type), count) in enumerate(
72
+ termination_by_causes_and_type.items()
73
+ ):
74
+ log_str = f"Removing {count} nodes of type {instance_type} ({cause_reason_map[cause]})." # noqa
75
+ self._logger.info(f"{log_str}")
76
+ logger.info(f"{log_str}")
77
+
78
+ # Cluster shape changes.
79
+ if launch_requests or terminate_requests:
80
+ total_resources = defaultdict(float)
81
+
82
+ for node_type, count in cluster_shape.items():
83
+ node_config = node_type_configs[node_type]
84
+ for resource_name, resource_quantity in node_config.resources.items():
85
+ total_resources[resource_name] += resource_quantity * count
86
+
87
+ num_cpus = total_resources.get("CPU", 0)
88
+ log_str = f"Resized to {int(num_cpus)} CPUs"
89
+
90
+ if "GPU" in total_resources:
91
+ log_str += f", {int(total_resources['GPU'])} GPUs"
92
+ if "TPU" in total_resources:
93
+ log_str += f", {int(total_resources['TPU'])} TPUs"
94
+
95
+ self._logger.info(f"{log_str}.")
96
+ self._logger.debug(f"Current cluster shape: {dict(cluster_shape)}.")
97
+
98
+ # Log any infeasible requests.
99
+ if infeasible_requests:
100
+ requests_by_count = ResourceRequestUtil.group_by_count(infeasible_requests)
101
+ log_str = "No available node types can fulfill resource requests "
102
+ for idx, req_count in enumerate(requests_by_count):
103
+ resource_map = ResourceRequestUtil.to_resource_map(req_count.request)
104
+ log_str += f"{resource_map}*{req_count.count}"
105
+ if idx < len(requests_by_count) - 1:
106
+ log_str += ", "
107
+
108
+ log_str += (
109
+ ". Add suitable node types to this cluster to resolve this issue."
110
+ )
111
+ self._logger.warning(log_str)
112
+
113
+ if infeasible_gang_requests:
114
+ # Log for each placement group requests.
115
+ for gang_request in infeasible_gang_requests:
116
+ log_str = (
117
+ "No available node types can fulfill "
118
+ "placement group requests (detail={details}): ".format(
119
+ details=gang_request.details
120
+ )
121
+ )
122
+ requests_by_count = ResourceRequestUtil.group_by_count(
123
+ gang_request.requests
124
+ )
125
+ for idx, req_count in enumerate(requests_by_count):
126
+ resource_map = ResourceRequestUtil.to_resource_map(
127
+ req_count.request
128
+ )
129
+ log_str += f"{resource_map}*{req_count.count}"
130
+ if idx < len(requests_by_count) - 1:
131
+ log_str += ", "
132
+
133
+ log_str += (
134
+ ". Add suitable node types to this cluster to resolve this issue."
135
+ )
136
+ self._logger.warning(log_str)
137
+
138
+ if infeasible_cluster_resource_constraints:
139
+ # We will only have max 1 cluster resource constraint for now since it's
140
+ # from `request_resources()` sdk, where the most recent call would override
141
+ # the previous one.
142
+ for infeasible_constraint in infeasible_cluster_resource_constraints:
143
+ log_str = "No available node types can fulfill cluster constraint: "
144
+ for i, requests_by_count in enumerate(
145
+ infeasible_constraint.resource_requests
146
+ ):
147
+ resource_map = ResourceRequestUtil.to_resource_map(
148
+ requests_by_count.request
149
+ )
150
+ log_str += f"{resource_map}*{requests_by_count.count}"
151
+ if i < len(infeasible_constraint.resource_requests) - 1:
152
+ log_str += ", "
153
+
154
+ log_str += (
155
+ ". Add suitable node types to this cluster to resolve this issue."
156
+ )
157
+ self._logger.warning(log_str)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/common.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import uuid
3
+ from typing import Dict, List, Optional, Set
4
+
5
+ from ray.core.generated.instance_manager_pb2 import Instance, InstanceUpdateEvent
6
+
7
+
8
+ class InstanceUtil:
9
+ """
10
+ A helper class to group updates and operations on an Instance object defined
11
+ in instance_manager.proto
12
+ """
13
+
14
+ # Memoized reachable from sets, where the key is the instance status, and
15
+ # the value is the set of instance status that is reachable from the key
16
+ # instance status.
17
+ _reachable_from: Optional[
18
+ Dict["Instance.InstanceStatus", Set["Instance.InstanceStatus"]]
19
+ ] = None
20
+
21
+ @staticmethod
22
+ def new_instance(
23
+ instance_id: str,
24
+ instance_type: str,
25
+ status: Instance.InstanceStatus,
26
+ details: str = "",
27
+ ) -> Instance:
28
+ """
29
+ Returns a new instance with the given status.
30
+
31
+ Args:
32
+ instance_id: The instance id.
33
+ instance_type: The instance type.
34
+ status: The status of the new instance.
35
+ details: The details of the status transition.
36
+ """
37
+ instance = Instance()
38
+ instance.version = 0 # it will be populated by the underlying storage.
39
+ instance.instance_id = instance_id
40
+ instance.instance_type = instance_type
41
+ instance.status = status
42
+ InstanceUtil._record_status_transition(instance, status, details)
43
+ return instance
44
+
45
+ @staticmethod
46
+ def random_instance_id() -> str:
47
+ """
48
+ Returns a random instance id.
49
+ """
50
+ return str(uuid.uuid4())
51
+
52
+ @staticmethod
53
+ def is_cloud_instance_allocated(instance_status: Instance.InstanceStatus) -> bool:
54
+ """
55
+ Returns True if the instance is in a status where there could exist
56
+ a cloud instance allocated by the cloud provider.
57
+ """
58
+ assert instance_status != Instance.UNKNOWN
59
+ return instance_status in {
60
+ Instance.ALLOCATED,
61
+ Instance.RAY_INSTALLING,
62
+ Instance.RAY_RUNNING,
63
+ Instance.RAY_STOPPING,
64
+ Instance.RAY_STOP_REQUESTED,
65
+ Instance.RAY_STOPPED,
66
+ Instance.TERMINATING,
67
+ Instance.RAY_INSTALL_FAILED,
68
+ Instance.TERMINATION_FAILED,
69
+ }
70
+
71
+ @staticmethod
72
+ def is_ray_running(instance_status: Instance.InstanceStatus) -> bool:
73
+ """
74
+ Returns True if the instance is in a status where the ray process is
75
+ running on the cloud instance.
76
+ i.e. RAY_RUNNING, RAY_STOP_REQUESTED, RAY_STOPPING
77
+ """
78
+ assert instance_status != Instance.UNKNOWN
79
+
80
+ if instance_status in InstanceUtil.get_reachable_statuses(
81
+ Instance.RAY_STOPPING
82
+ ):
83
+ return False
84
+
85
+ if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
86
+ return True
87
+
88
+ return False
89
+
90
+ @staticmethod
91
+ def is_ray_pending(instance_status: Instance.InstanceStatus) -> bool:
92
+ """
93
+ Returns True if the instance is in a status where the ray process is
94
+ pending to be started on the cloud instance.
95
+
96
+ """
97
+ assert instance_status != Instance.UNKNOWN
98
+ # Not gonna be in a RAY_RUNNING status.
99
+ if Instance.RAY_RUNNING not in InstanceUtil.get_reachable_statuses(
100
+ instance_status
101
+ ):
102
+ return False
103
+
104
+ # Already running ray.
105
+ if instance_status in InstanceUtil.get_reachable_statuses(Instance.RAY_RUNNING):
106
+ return False
107
+
108
+ return True
109
+
110
+ def is_ray_running_reachable(instance_status: Instance.InstanceStatus) -> bool:
111
+ """
112
+ Returns True if the instance is in a status where it may transition
113
+ to RAY_RUNNING status.
114
+ """
115
+ return Instance.RAY_RUNNING in InstanceUtil.get_reachable_statuses(
116
+ instance_status
117
+ )
118
+
119
+ @staticmethod
120
+ def set_status(
121
+ instance: Instance,
122
+ new_instance_status: Instance.InstanceStatus,
123
+ details: str = "",
124
+ ) -> bool:
125
+ """Transitions the instance to the new state.
126
+
127
+ Args:
128
+ instance: The instance to update.
129
+ new_instance_status: The new status to transition to.
130
+ details: The details of the transition.
131
+
132
+ Returns:
133
+ True if the status transition is successful, False otherwise.
134
+ """
135
+ if (
136
+ new_instance_status
137
+ not in InstanceUtil.get_valid_transitions()[instance.status]
138
+ ):
139
+ return False
140
+ instance.status = new_instance_status
141
+ InstanceUtil._record_status_transition(instance, new_instance_status, details)
142
+ return True
143
+
144
+ @staticmethod
145
+ def _record_status_transition(
146
+ instance: Instance, status: Instance.InstanceStatus, details: str
147
+ ):
148
+ """Records the status transition.
149
+
150
+ Args:
151
+ instance: The instance to update.
152
+ status: The new status to transition to.
153
+ """
154
+ now_ns = time.time_ns()
155
+ instance.status_history.append(
156
+ Instance.StatusHistory(
157
+ instance_status=status,
158
+ timestamp_ns=now_ns,
159
+ details=details,
160
+ )
161
+ )
162
+
163
+ @staticmethod
164
+ def has_timeout(instance: Instance, timeout_s: int) -> bool:
165
+ """
166
+ Returns True if the instance has been in the current status for more
167
+ than the timeout_seconds.
168
+
169
+ Args:
170
+ instance: The instance to check.
171
+ timeout_seconds: The timeout in seconds.
172
+
173
+ Returns:
174
+ True if the instance has been in the current status for more than
175
+ the timeout_s seconds.
176
+ """
177
+ cur_status = instance.status
178
+
179
+ status_times_ns = InstanceUtil.get_status_transition_times_ns(
180
+ instance, select_instance_status=cur_status
181
+ )
182
+ assert len(status_times_ns) >= 1, (
183
+ f"instance {instance.instance_id} has {len(status_times_ns)} "
184
+ f"{Instance.InstanceStatus.Name(cur_status)} status"
185
+ )
186
+ status_time_ns = sorted(status_times_ns)[-1]
187
+ if time.time_ns() - status_time_ns <= (timeout_s * 1e9):
188
+ return False
189
+
190
+ return True
191
+
192
    @staticmethod
    def get_valid_transitions() -> Dict[
        "Instance.InstanceStatus", Set["Instance.InstanceStatus"]
    ]:
        """Return the instance status transition graph.

        Maps each status to the set of statuses it may transition to
        directly. Statuses mapped to an empty set are terminal.
        """
        return {
            # This is the initial status of a new instance.
            Instance.QUEUED: {
                # Cloud provider requested to launch a node for the instance.
                # This happens when a launch request is made to the node provider.
                Instance.REQUESTED,
            },
            # When in this status, a launch request to the node provider is made.
            Instance.REQUESTED: {
                # Cloud provider allocated a cloud instance for the instance.
                # This happens when the cloud instance first appears in the list of
                # running cloud instances from the cloud instance provider.
                Instance.ALLOCATED,
                # Retry the allocation, become queueing again.
                Instance.QUEUED,
                # Cloud provider fails to allocate one. Either as a timeout or
                # the launch request fails immediately.
                Instance.ALLOCATION_FAILED,
            },
            # When in this status, the cloud instance is allocated and running. This
            # happens when the cloud instance is present in node provider's list of
            # running cloud instances.
            Instance.ALLOCATED: {
                # Ray needs to be installed and launched on the provisioned cloud
                # instance. This happens when the cloud instance is allocated, and
                # the autoscaler is responsible for installing and launching ray on
                # the cloud instance. For node providers that manage the ray
                # installation and launching, this state is skipped.
                Instance.RAY_INSTALLING,
                # Ray is already installed on the provisioned cloud
                # instance. It could be any valid ray status.
                Instance.RAY_RUNNING,
                Instance.RAY_STOPPING,
                Instance.RAY_STOPPED,
                # Instance is requested to be stopped, e.g. instance leaked: no
                # matching Instance with the same type is found in the
                # autoscaler's state.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray process is being installed and started on the cloud instance.
            # This status is skipped for node providers that manage the ray
            # installation and launching. (e.g. Ray-on-Spark)
            Instance.RAY_INSTALLING: {
                # Ray installed and launched successfully, reported by the ray
                # cluster. Similar to the Instance.ALLOCATED ->
                # Instance.RAY_RUNNING transition, where the ray process is
                # managed by the node provider.
                Instance.RAY_RUNNING,
                # Ray installation failed. This happens when the ray process
                # failed to be installed and started on the cloud instance.
                Instance.RAY_INSTALL_FAILED,
                # When the ray node is reported as stopped by the ray cluster.
                # This could happen when the ray process was stopped so quickly
                # after start that a ray running node wasn't discovered and the
                # RAY_RUNNING transition was skipped.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed during the installation process.
                Instance.TERMINATED,
            },
            # Ray process is installed and running on the cloud instance. When in
            # this status, a ray node must be present in the ray cluster.
            Instance.RAY_RUNNING: {
                # Ray is requested to be stopped.
                Instance.RAY_STOP_REQUESTED,
                # Ray is stopping (currently draining),
                # e.g. idle termination.
                Instance.RAY_STOPPING,
                # Ray is already stopped, as reported by the ray cluster.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Ray process should be stopped on the cloud instance. The RayStopper
            # subscriber will listen to this status and stop the ray process.
            Instance.RAY_STOP_REQUESTED: {
                # Ray is stopping on the cloud instance.
                Instance.RAY_STOPPING,
                # Ray stopped already.
                Instance.RAY_STOPPED,
                # Ray stop request failed (e.g. idle node no longer idle),
                # ray is still running.
                Instance.RAY_RUNNING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the ray process is requested to be stopped to
            # the ray cluster, but not yet present in the dead ray node list
            # reported by the ray cluster.
            Instance.RAY_STOPPING: {
                # Ray is stopped, and the ray node is present in the dead ray node
                # list reported by the ray cluster.
                Instance.RAY_STOPPED,
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the ray process is stopped, and the ray node is
            # present in the dead ray node list reported by the ray cluster.
            Instance.RAY_STOPPED: {
                # A cloud instance is being terminated (when the instance itself
                # is no longer needed, e.g. instance is outdated, autoscaler is
                # scaling down)
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # When in this status, the cloud instance is requested to be stopped to
            # the node provider.
            Instance.TERMINATING: {
                # When a cloud instance no longer appears in the list of running
                # cloud instances from the node provider.
                Instance.TERMINATED,
                # When the cloud instance failed to be terminated.
                Instance.TERMINATION_FAILED,
            },
            # When in this status, the cloud instance failed to be terminated by
            # the node provider. We will keep retrying.
            Instance.TERMINATION_FAILED: {
                # Retry the termination, become terminating again.
                Instance.TERMINATING,
            },
            # Whenever a cloud instance disappears from the list of running cloud
            # instances from the node provider, the instance is marked as stopped.
            # Since we guarantee 1:1 mapping of an Instance to a cloud instance,
            # this is a terminal state.
            Instance.TERMINATED: set(),  # Terminal state.
            # When in this status, the cloud instance failed to be allocated by
            # the node provider.
            Instance.ALLOCATION_FAILED: set(),  # Terminal state.
            Instance.RAY_INSTALL_FAILED: {
                # Autoscaler requests to shutdown the instance when ray install
                # failed.
                Instance.TERMINATING,
                # cloud instance somehow failed.
                Instance.TERMINATED,
            },
            # Initial state before the instance is created. Should never be used.
            Instance.UNKNOWN: set(),
        }
341
+
342
+ @staticmethod
343
+ def get_status_transitions(
344
+ instance: Instance,
345
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
346
+ ) -> List["Instance.StatusHistory"]:
347
+ """
348
+ Returns the status history of the instance.
349
+
350
+ Args:
351
+ instance: The instance.
352
+ select_instance_status: The go-to status to search for, i.e. select
353
+ only status history when the instance transitions into the status.
354
+ If None, returns all status updates.
355
+ """
356
+ history = []
357
+ for status_update in instance.status_history:
358
+ if (
359
+ select_instance_status
360
+ and status_update.instance_status != select_instance_status
361
+ ):
362
+ continue
363
+ history.append(status_update)
364
+ return history
365
+
366
+ @staticmethod
367
+ def get_last_status_transition(
368
+ instance: Instance,
369
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
370
+ ) -> Optional["Instance.StatusHistory"]:
371
+ """
372
+ Returns the last status transition of the instance.
373
+
374
+ Args:
375
+ instance: The instance.
376
+ instance_status: The status to search for. If None, returns the last
377
+ status update.
378
+ """
379
+ history = InstanceUtil.get_status_transitions(instance, select_instance_status)
380
+ history.sort(key=lambda x: x.timestamp_ns)
381
+ if history:
382
+ return history[-1]
383
+ return None
384
+
385
+ @staticmethod
386
+ def get_status_transition_times_ns(
387
+ instance: Instance,
388
+ select_instance_status: Optional["Instance.InstanceStatus"] = None,
389
+ ) -> List[int]:
390
+ """
391
+ Returns a list of timestamps of the instance status update.
392
+
393
+ Args:
394
+ instance: The instance.
395
+ instance_status: The status to search for. If None, returns all
396
+ status updates timestamps.
397
+
398
+ Returns:
399
+ The list of timestamps of the instance status updates.
400
+ """
401
+ return [
402
+ e.timestamp_ns
403
+ for e in InstanceUtil.get_status_transitions(
404
+ instance, select_instance_status
405
+ )
406
+ ]
407
+
408
    @classmethod
    def get_reachable_statuses(
        cls,
        instance_status: Instance.InstanceStatus,
    ) -> Set["Instance.InstanceStatus"]:
        """
        Returns the set of instance statuses reachable from the given status
        by following one or more valid transitions.

        The reachability map is computed once from get_valid_transitions()
        and memoized on the class. Note that a status is a member of its own
        reachable set only when some transition cycle leads back to it.

        Args:
            instance_status: The instance status to start from.

        Returns:
            The set of statuses reachable from `instance_status`.
        """
        # Lazily compute and memoize the class-level reachability map.
        if cls._reachable_from is None:
            cls._compute_reachable()
        return cls._reachable_from[instance_status]
426
+
427
+ @staticmethod
428
+ def get_log_str_for_update(instance: Instance, update: InstanceUpdateEvent) -> str:
429
+ """Returns a log string for the given instance update."""
430
+ if update.upsert:
431
+ return (
432
+ f"New instance "
433
+ f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
434
+ f"{instance.instance_id}, type={instance.instance_type}, "
435
+ f"cloud_instance_id={instance.cloud_instance_id}, "
436
+ f"ray_id={instance.node_id}): {update.details}"
437
+ )
438
+ return (
439
+ f"Update instance "
440
+ f"{Instance.InstanceStatus.Name(instance.status)}->"
441
+ f"{Instance.InstanceStatus.Name(update.new_instance_status)} (id="
442
+ f"{instance.instance_id}, type={instance.instance_type}, "
443
+ f"cloud_instance_id={instance.cloud_instance_id}, "
444
+ f"ray_id={instance.node_id}): {update.details}"
445
+ )
446
+
447
+ @classmethod
448
+ def _compute_reachable(cls):
449
+ """
450
+ Computes and memorize the from status sets for each status machine with
451
+ a DFS search.
452
+ """
453
+ valid_transitions = cls.get_valid_transitions()
454
+
455
+ def dfs(graph, start, visited):
456
+ """
457
+ Regular DFS algorithm to find all reachable nodes from a given node.
458
+ """
459
+ for next_node in graph[start]:
460
+ if next_node not in visited:
461
+ # We delay adding the visited set here so we could capture
462
+ # the self loop.
463
+ visited.add(next_node)
464
+ dfs(graph, next_node, visited)
465
+ return visited
466
+
467
+ # Initialize the graphs
468
+ cls._reachable_from = {}
469
+ for status in Instance.InstanceStatus.values():
470
+ # All nodes reachable from 'start'
471
+ visited = set()
472
+ cls._reachable_from[status] = dfs(valid_transitions, status, visited)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/config.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from enum import Enum
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import yaml
10
+
11
+ from ray._private.ray_constants import env_integer
12
+ from ray._private.utils import binary_to_hex
13
+ from ray._raylet import GcsClient
14
+ from ray.autoscaler._private.constants import (
15
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
16
+ DEFAULT_UPSCALING_SPEED,
17
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
18
+ DISABLE_NODE_UPDATERS_KEY,
19
+ )
20
+ from ray.autoscaler._private.kuberay.autoscaling_config import AutoscalingConfigProducer
21
+ from ray.autoscaler._private.monitor import BASE_READONLY_CONFIG
22
+ from ray.autoscaler._private.util import (
23
+ format_readonly_node_type,
24
+ hash_launch_conf,
25
+ hash_runtime_conf,
26
+ prepare_config,
27
+ validate_config,
28
+ )
29
+ from ray.autoscaler.v2.schema import NodeType
30
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
31
+ from ray.autoscaler.v2.utils import is_head_node
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class Provider(Enum):
    """Node providers recognized by the autoscaler v2 config layer.

    Parsed from the config's `provider.type` string (see
    AutoscalingConfig.provider); unrecognized strings map to UNKNOWN.
    """

    UNKNOWN = 0
    ALIYUN = 1
    AWS = 2
    AZURE = 3
    GCP = 4
    KUBERAY = 5
    LOCAL = 6
    READ_ONLY = 7
45
+
46
+
47
class IConfigReader(ABC):
    """Interface for reading the autoscaling config.

    Concrete readers pull the config from different sources — a YAML file,
    an in-memory dict, or a remote config service (e.g. KubeRay) — and cache
    the most recently read value.

    Example:
        reader = FileConfigReader("path/to/config.yaml")
        # Get the recently cached config.
        config = reader.get_cached_autoscaling_config()

        ...
        # Refresh the cached config.
        reader.refresh_cached_autoscaling_config()
        config = reader.get_cached_autoscaling_config()
    """

    @abstractmethod
    def get_cached_autoscaling_config(self) -> "AutoscalingConfig":
        """Return the most recently read autoscaling config."""
        pass

    @abstractmethod
    def refresh_cached_autoscaling_config(self):
        """Re-read the config from the underlying source."""
        pass
80
+
81
+
82
@dataclass(frozen=True)
class InstanceReconcileConfig:
    """Timeouts and retry limits used when reconciling instance states.

    Each value can be overridden via the environment variable named in the
    corresponding `env_integer` call; the second argument is the default.
    """

    # Max seconds a REQUESTED instance may wait to become ALLOCATED.
    request_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_REQUEST_STATUS_TIMEOUT_S", 10 * 60
    )
    # Max seconds an ALLOCATED instance may wait to become RAY_RUNNING.
    allocate_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_ALLOCATE_STATUS_TIMEOUT_S", 300
    )
    # Max seconds a RAY_INSTALLING instance may wait to become RAY_RUNNING.
    ray_install_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_RAY_INSTALL_STATUS_TIMEOUT_S", 30 * 60
    )
    # Max seconds a TERMINATING instance may wait to become TERMINATED.
    terminating_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_TERMINATING_STATUS_TIMEOUT_S", 300
    )
    # Max seconds a RAY_STOP_REQUESTED instance may wait to become
    # RAY_STOPPING or RAY_STOPPED.
    ray_stop_requested_status_timeout_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_RAY_STOP_REQUESTED_STATUS_TIMEOUT_S", 300
    )
    # Interval (seconds) at which to warn about an instance that has been
    # stuck in a transient status for too long.
    transient_status_warn_interval_s: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_TRANSIENT_STATUS_WARN_INTERVAL_S", 90
    )
    # Max number of retries when requesting allocation of an instance.
    max_num_retry_request_to_allocate: int = env_integer(
        "RAY_AUTOSCALER_RECONCILE_MAX_NUM_RETRY_REQUEST_TO_ALLOCATE", 3
    )
114
+
115
+
116
@dataclass
class NodeTypeConfig:
    """Per-node-type autoscaling settings.

    Maps to a subset of the `available_node_types` entry for the node type
    in the autoscaling config.
    """

    # Node type name.
    name: NodeType
    # The minimal number of worker nodes to be launched for this node type.
    min_worker_nodes: int
    # The maximal number of worker nodes that can be launched for this type.
    max_worker_nodes: int
    # Idle timeout seconds for worker nodes of this node type.
    idle_timeout_s: Optional[float] = None
    # The total resources on the node.
    resources: Dict[str, float] = field(default_factory=dict)
    # The labels on the node.
    labels: Dict[str, str] = field(default_factory=dict)
    # The node config's launch config hash. It's calculated from the auth
    # config and the node's config in the `AutoscalingConfig` for the node
    # type when launching the node. It's used to detect config changes.
    launch_config_hash: str = ""

    def __post_init__(self):
        # Validate with explicit raises rather than `assert`: asserts are
        # stripped under `python -O` and would silently accept inconsistent
        # worker counts.
        if self.min_worker_nodes < 0:
            raise ValueError(
                f"min_worker_nodes must be >= 0, got {self.min_worker_nodes}"
            )
        if self.min_worker_nodes > self.max_worker_nodes:
            raise ValueError(
                f"min_worker_nodes ({self.min_worker_nodes}) must be <= "
                f"max_worker_nodes ({self.max_worker_nodes})"
            )
144
+
145
+
146
class AutoscalingConfig:
    """
    Helper around the raw autoscaling config dict.

    Validates the config on construction and exposes typed accessors for
    node-type, docker, command and provider settings.

    # TODO(rickyx):
    1. Move the config validation logic here.
    2. Deprecate the ray-schema.json for validation because it's
    static thus not possible to validate the config with interdependency
    of each other.
    """

    def __init__(
        self,
        configs: Dict[str, Any],
        skip_content_hash: bool = False,
    ) -> None:
        """
        Args:
            configs: The raw configs dict.
            skip_content_hash: Whether to skip file mounts/ray command
                hash calculation.
        """
        self._sync_continuously = False
        # Pre-initialize so the hash properties return None instead of
        # raising AttributeError when hashing is skipped.
        self._runtime_hash: Optional[str] = None
        self._file_mounts_contents_hash: Optional[str] = None
        self.update_configs(configs, skip_content_hash)

    def update_configs(self, configs: Dict[str, Any], skip_content_hash: bool) -> None:
        """Validate and store `configs`, optionally recomputing content hashes."""
        self._configs = prepare_config(configs)
        validate_config(self._configs)
        if skip_content_hash:
            return
        self._calculate_hashes()
        self._sync_continuously = self._configs.get(
            "generate_file_mounts_contents_hash", True
        )

    def _calculate_hashes(self) -> None:
        """Compute the runtime hash and the file-mounts contents hash."""
        logger.info("Calculating hashes for file mounts and ray commands.")
        self._runtime_hash, self._file_mounts_contents_hash = hash_runtime_conf(
            self._configs.get("file_mounts", {}),
            self._configs.get("cluster_synced_files", []),
            [
                self._configs.get("worker_setup_commands", []),
                self._configs.get("worker_start_ray_commands", []),
            ],
            generate_file_mounts_contents_hash=self._configs.get(
                "generate_file_mounts_contents_hash", True
            ),
        )

    def get_cloud_node_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
        """Return a deep copy of the node type's provider `node_config`."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "node_config") or {}
        )

    def get_docker_config(self, ray_node_type: NodeType) -> Dict[str, Any]:
        """
        Return the docker config for the specified node type.
        If it's a head node, the image will be chosen in the following order:
        1. Node specific docker image.
        2. The 'docker' config's 'head_image' field.
        3. The 'docker' config's 'image' field.
        If it's a worker node, the image will be chosen in the following order:
        1. Node specific docker image.
        2. The 'docker' config's 'worker_image' field.
        3. The 'docker' config's 'image' field.
        """
        # TODO(rickyx): It's unfortunate we have multiple fields in ray-schema.json
        # that can specify docker images. We should consolidate them.
        docker_config = copy.deepcopy(self._configs.get("docker", {}))
        node_specific_docker_config = self._configs["available_node_types"][
            ray_node_type
        ].get("docker", {})
        # Override the global docker config with node specific docker config.
        docker_config.update(node_specific_docker_config)

        if self._configs.get("head_node_type") == ray_node_type:
            if "head_image" in docker_config:
                logger.info(
                    # .get(): "image" may be absent when only head_image is
                    # configured; indexing would raise KeyError here.
                    "Overwriting image={} by head_image({}) for head node docker.".format(  # noqa: E501
                        docker_config.get("image"), docker_config["head_image"]
                    )
                )
                docker_config["image"] = docker_config["head_image"]
        else:
            if "worker_image" in docker_config:
                logger.info(
                    "Overwriting image={} by worker_image({}) for worker node docker.".format(  # noqa: E501
                        docker_config.get("image"), docker_config["worker_image"]
                    )
                )
                docker_config["image"] = docker_config["worker_image"]

        # The role-specific fields have been folded into "image" above; drop
        # them from the returned config.
        docker_config.pop("head_image", None)
        docker_config.pop("worker_image", None)
        return docker_config

    def get_worker_start_ray_commands(self) -> List[str]:
        """Return the commands that start ray on worker nodes."""
        return self._configs.get("worker_start_ray_commands", [])

    def get_head_setup_commands(self) -> List[str]:
        """Return the setup commands for the head node."""
        return self._configs.get("head_setup_commands", [])

    def get_head_start_ray_commands(self) -> List[str]:
        """Return the commands that start ray on the head node."""
        return self._configs.get("head_start_ray_commands", [])

    def get_worker_setup_commands(self, ray_node_type: NodeType) -> List[str]:
        """
        Return the worker setup commands for the specified node type.

        If the node type specific worker setup commands are not specified,
        return the global worker setup commands.
        """
        worker_setup_command = self.get_node_type_specific_config(
            ray_node_type, "worker_setup_commands"
        )
        if worker_setup_command is None:
            # Return global worker setup commands if node type specific
            # worker setup commands are not specified.
            logger.info(
                "Using global worker setup commands for {}".format(ray_node_type)
            )
            return self._configs.get("worker_setup_commands", [])
        return worker_setup_command

    def get_initialization_commands(self, ray_node_type: NodeType) -> List[str]:
        """
        Return the initialization commands for the specified node type.

        If the node type specific initialization commands are not specified,
        return the global initialization commands.
        """
        initialization_command = self.get_node_type_specific_config(
            ray_node_type, "initialization_commands"
        )
        if initialization_command is None:
            logger.info(
                "Using global initialization commands for {}".format(ray_node_type)
            )
            return self._configs.get("initialization_commands", [])
        return initialization_command

    def get_node_type_specific_config(
        self, ray_node_type: NodeType, config_name: str
    ) -> Optional[Any]:
        """Return a field from the node type's entry, or None if absent."""
        node_specific_config = self._configs["available_node_types"].get(
            ray_node_type, {}
        )
        return node_specific_config.get(config_name, None)

    def get_node_resources(self, ray_node_type: NodeType) -> Dict[str, float]:
        """Return a deep copy of the node type's declared resources."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "resources") or {}
        )

    def get_node_labels(self, ray_node_type: NodeType) -> Dict[str, str]:
        """Return a deep copy of the node type's declared labels."""
        return copy.deepcopy(
            self.get_node_type_specific_config(ray_node_type, "labels") or {}
        )

    def get_config(self, config_name, default=None) -> Any:
        """Return a top-level config field, or `default` if absent."""
        return self._configs.get(config_name, default)

    def get_provider_instance_type(self, ray_node_type: NodeType) -> str:
        """Return the cloud-provider instance type for the node type.

        Providers without a cloud instance type yield "".

        Raises:
            ValueError: If the provider is not handled here (including
                READ_ONLY; presumably this is never called for it —
                TODO confirm).
        """
        provider = self.provider
        node_config = self.get_node_type_specific_config(ray_node_type, "node_config")
        if provider in [Provider.AWS, Provider.ALIYUN]:
            return node_config.get("InstanceType", "")
        elif provider == Provider.AZURE:
            return node_config.get("azure_arm_parameters", {}).get("vmSize", "")
        elif provider == Provider.GCP:
            return node_config.get("machineType", "")
        elif provider in [Provider.KUBERAY, Provider.LOCAL, Provider.UNKNOWN]:
            return ""
        else:
            raise ValueError(f"Unknown provider {provider}")

    def get_node_type_configs(self) -> Optional[Dict[NodeType, NodeTypeConfig]]:
        """
        Returns the node type configs from the `available_node_types` field.

        Returns:
            The node type configs keyed by node type, or None when the config
            has no `available_node_types`.
        """
        available_node_types = self._configs.get("available_node_types", {})
        if not available_node_types:
            return None
        node_type_configs = {}
        auth_config = self._configs.get("auth", {})
        head_node_type = self.get_head_node_type()
        assert head_node_type
        for node_type, node_config in available_node_types.items():
            launch_config_hash = hash_launch_conf(
                node_config.get("node_config", {}), auth_config
            )
            max_workers_nodes = node_config.get("max_workers", 0)
            if head_node_type == node_type:
                # The head node itself counts toward this type's cap.
                max_workers_nodes += 1

            node_type_configs[node_type] = NodeTypeConfig(
                name=node_type,
                min_worker_nodes=node_config.get("min_workers", 0),
                max_worker_nodes=max_workers_nodes,
                idle_timeout_s=node_config.get("idle_timeout_s", None),
                resources=node_config.get("resources", {}),
                labels=node_config.get("labels", {}),
                launch_config_hash=launch_config_hash,
            )
        return node_type_configs

    def get_head_node_type(self) -> NodeType:
        """
        Returns the head node type.

        If there is only one node type, return the only node type as the head
        node type. If there are multiple node types, return the head node
        type specified in the config.
        """
        available_node_types = self._configs.get("available_node_types", {})
        if len(available_node_types) == 1:
            return list(available_node_types.keys())[0]
        return self._configs.get("head_node_type")

    def get_max_num_worker_nodes(self) -> Optional[int]:
        """Return the cluster-wide worker cap, or None if unset."""
        return self.get_config("max_workers", None)

    def get_max_num_nodes(self) -> Optional[int]:
        """Return the cluster-wide node cap including the head, or None."""
        max_num_workers = self.get_max_num_worker_nodes()
        if max_num_workers is not None:
            return max_num_workers + 1  # For head node
        return None

    def get_raw_config_mutable(self) -> Dict[str, Any]:
        """Return the underlying config dict (mutations affect this object)."""
        return self._configs

    def get_upscaling_speed(self) -> float:
        """Return the configured upscaling speed, or the default."""
        return self.get_config("upscaling_speed", DEFAULT_UPSCALING_SPEED)

    def get_max_concurrent_launches(self) -> int:
        """Return the max number of concurrent node launches."""
        return AUTOSCALER_MAX_CONCURRENT_LAUNCHES

    def disable_node_updaters(self) -> bool:
        """Return whether node updaters are disabled (default True in v2)."""
        provider_config = self._configs.get("provider", {})
        return provider_config.get(DISABLE_NODE_UPDATERS_KEY, True)

    def get_idle_timeout_s(self) -> Optional[float]:
        """
        Returns the idle timeout in seconds if present in config, otherwise
        None.
        """
        # The config field is expressed in minutes; convert to seconds.
        idle_timeout_minutes = self.get_config("idle_timeout_minutes", None)
        return idle_timeout_minutes * 60 if idle_timeout_minutes is not None else None

    def disable_launch_config_check(self) -> bool:
        """Return whether launch-config drift checking is disabled."""
        provider_config = self.get_provider_config()
        return provider_config.get(DISABLE_LAUNCH_CONFIG_CHECK_KEY, True)

    def get_instance_reconcile_config(self) -> InstanceReconcileConfig:
        """Return the instance reconciler timeouts/retry limits."""
        # TODO(rickyx): we need a way to customize these configs,
        # either extending the current ray-schema.json, or just use another
        # schema validation path.
        return InstanceReconcileConfig()

    def get_provider_config(self) -> Dict[str, Any]:
        """Return the raw `provider` section of the config."""
        return self._configs.get("provider", {})

    def dump(self) -> str:
        """Serialize the config to a YAML string."""
        return yaml.safe_dump(self._configs)

    @property
    def provider(self) -> Provider:
        """The Provider enum parsed from `provider.type` (UNKNOWN if unrecognized)."""
        provider_str = self._configs.get("provider", {}).get("type", "")
        if provider_str == "local":
            return Provider.LOCAL
        elif provider_str == "aws":
            return Provider.AWS
        elif provider_str == "azure":
            return Provider.AZURE
        elif provider_str == "gcp":
            return Provider.GCP
        elif provider_str == "aliyun":
            return Provider.ALIYUN
        elif provider_str == "kuberay":
            return Provider.KUBERAY
        elif provider_str == "readonly":
            return Provider.READ_ONLY
        else:
            return Provider.UNKNOWN

    @property
    def runtime_hash(self) -> Optional[str]:
        """Hash of setup/start commands; None when hashing was skipped."""
        return self._runtime_hash

    @property
    def file_mounts_contents_hash(self) -> Optional[str]:
        """Hash of file-mount contents; None when hashing was skipped."""
        return self._file_mounts_contents_hash
443
+
444
+
445
class FileConfigReader(IConfigReader):
    """Reads the autoscaling config from a YAML file on disk."""

    def __init__(self, config_file: str, skip_content_hash: bool = True) -> None:
        """
        Args:
            config_file: Path to the YAML config file.
            skip_content_hash: Whether to skip file mounts/ray command hash
                calculation. Defaults to True.
        """
        self._config_file_path = Path(config_file).resolve()
        self._skip_content_hash = skip_content_hash
        self._cached_config = self._load()

    def _load(self) -> AutoscalingConfig:
        # Parse the file fresh from disk on every call.
        raw = yaml.safe_load(self._config_file_path.read_text())
        return AutoscalingConfig(raw, skip_content_hash=self._skip_content_hash)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Return the most recently loaded config without touching disk."""
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """Reload the config from the file."""
        self._cached_config = self._load()
474
+
475
+
476
class KubeRayConfigReader(IConfigReader):
    """Derives the autoscaling config from a K8s RayCluster custom resource."""

    def __init__(self, config_producer: AutoscalingConfigProducer):
        self._config_producer = config_producer
        self._cached_config = self._produce()

    def _produce(self) -> AutoscalingConfig:
        # The producer queries the K8s API server, so each call observes the
        # latest RayCluster CR.
        return AutoscalingConfig(self._config_producer())

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Return the most recently produced config."""
        return self._cached_config

    def refresh_cached_autoscaling_config(self):
        """Re-derive the config from the RayCluster CR via the K8s API server."""
        self._cached_config = self._produce()
500
+
501
+
502
class ReadOnlyProviderConfigReader(IConfigReader):
    """A class that reads cluster config for a read-only provider.

    This is used for laptop mode / manual cluster setup modes, in order to
    provide status reporting in the same way for users. Node types are
    derived from the nodes reported by GCS."""

    def __init__(self, gcs_address: str):
        # Deep-copy the template: refresh_cached_autoscaling_config() mutates
        # nested dicts (updates "available_node_types", pops keys), and the
        # shared module-level BASE_READONLY_CONFIG must stay pristine.
        self._configs = copy.deepcopy(BASE_READONLY_CONFIG)
        self._gcs_client = GcsClient(address=gcs_address)

    def refresh_cached_autoscaling_config(self) -> None:
        """Rebuild the node-type configs from the cluster state in GCS."""
        # Update the config with node types from GCS.
        ray_cluster_resource_state = get_cluster_resource_state(self._gcs_client)

        # Format each node type's config from the running nodes.
        available_node_types = {}

        head_node_type = None
        for node_state in ray_cluster_resource_state.node_states:
            # One synthetic node type per running node.
            node_type = format_readonly_node_type(binary_to_hex(node_state.node_id))
            if is_head_node(node_state):
                head_node_type = node_type

            available_node_types[node_type] = {
                "resources": dict(node_state.total_resources),
                "min_workers": 0,
                "max_workers": 0 if is_head_node(node_state) else 1,
                "node_config": {},
            }
        if available_node_types:
            self._configs["available_node_types"].update(available_node_types)
            self._configs["max_workers"] = len(available_node_types)
            assert head_node_type, "Head node type should be found."
            self._configs["head_node_type"] = head_node_type

        # Don't idle-terminate nodes in read-only mode.
        self._configs.pop("idle_timeout_minutes", None)

    def get_cached_autoscaling_config(self) -> AutoscalingConfig:
        """Build an AutoscalingConfig from the current (refreshed) dict."""
        return AutoscalingConfig(self._configs, skip_content_hash=True)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_manager.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Optional
4
+
5
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
6
+ from ray.autoscaler.v2.instance_manager.instance_storage import InstanceStorage
7
+ from ray.core.generated.instance_manager_pb2 import (
8
+ GetInstanceManagerStateReply,
9
+ GetInstanceManagerStateRequest,
10
+ Instance,
11
+ InstanceUpdateEvent,
12
+ NodeKind,
13
+ StatusCode,
14
+ UpdateInstanceManagerStateReply,
15
+ UpdateInstanceManagerStateRequest,
16
+ )
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class InstanceUpdatedSubscriber(ABC):
    """Subscribers to instance status changes.

    Implementations are notified by InstanceManager with the batch of
    update events after the updates have been successfully persisted.
    """

    @abstractmethod
    def notify(self, events: List[InstanceUpdateEvent]) -> None:
        """Handle a batch of successfully applied instance update events."""
        pass
27
+
28
+
29
class InstanceManager:
    """
    See `InstanceManagerService` in instance_manager.proto

    This handles updates to an instance, or inserts a new instance if
    it's an insert update. We should only be inserting new instances
    of the below statuses:
        1. ALLOCATED: For unmanaged instance not initialized by InstanceManager,
           e.g. head node
        2. QUEUED: For new instance being queued to launch.
        3. TERMINATING: For leaked cloud instance that needs to be terminated.

    For full status transitions, see:
    https://docs.google.com/document/d/1NzQjA8Mh-oMc-QxXOa529oneWCoA8sDiVoNkBqqDb4U/edit#heading=h.k9a1sp4qpqj4

    Not thread safe, should be used as a singleton.
    """

    def __init__(
        self,
        instance_storage: InstanceStorage,
        instance_status_update_subscribers: Optional[List[InstanceUpdatedSubscriber]],
    ):
        # Versioned store of Instance protos; versions drive the optimistic
        # concurrency checks below.
        self._instance_storage = instance_storage
        # Subscribers notified only after a batch of updates is persisted.
        self._status_update_subscribers = instance_status_update_subscribers or []

    def update_instance_manager_state(
        self, request: UpdateInstanceManagerStateRequest
    ) -> UpdateInstanceManagerStateReply:
        """
        Updates the instance manager state.

        If there's any failure, no updates would be made and the reply
        would contain the latest version of the instance manager state,
        and the error info.

        Args:
            request: The request to update the instance manager state.

        Returns:
            The reply to the request.
        """

        # Handle updates
        ids_to_updates = {update.instance_id: update for update in request.updates}
        to_update_instances, version = self._instance_storage.get_instances(
            instance_ids=ids_to_updates.keys()
        )

        # NOTE(review): a negative expected_version appears to mean "skip the
        # version check" — confirm against the proto definition.
        if request.expected_version >= 0 and request.expected_version != version:
            err_str = (
                f"Version mismatch: expected: {request.expected_version}, "
                f"actual: {version}"
            )
            logger.warning(err_str)
            return self._get_update_im_state_reply(
                StatusCode.VERSION_MISMATCH,
                version,
                err_str,
            )

        # Handle instances states update.
        to_upsert_instances = []
        for instance_id, update in ids_to_updates.items():
            if instance_id in to_update_instances:
                # Known instance: apply a status transition to it.
                instance = self._update_instance(
                    to_update_instances[instance_id], update
                )
            else:
                # Unknown instance id: the update describes a new instance.
                instance = self._create_instance(update)

            to_upsert_instances.append(instance)

        # Updates the instance storage.
        result = self._instance_storage.batch_upsert_instances(
            updates=to_upsert_instances,
            expected_storage_version=version,
        )

        if not result.success:
            if result.version != version:
                # Storage advanced concurrently between our read and write.
                err_str = (
                    f"Version mismatch: expected: {version}, actual: {result.version}"
                )
                logger.warning(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.VERSION_MISMATCH, result.version, err_str
                )
            else:
                err_str = "Failed to update instance storage."
                logger.error(err_str)
                return self._get_update_im_state_reply(
                    StatusCode.UNKNOWN_ERRORS, result.version, err_str
                )

        # Successful updates: only now do subscribers observe the events.
        for subscriber in self._status_update_subscribers:
            subscriber.notify(request.updates)

        return self._get_update_im_state_reply(StatusCode.OK, result.version)

    def get_instance_manager_state(
        self, request: GetInstanceManagerStateRequest
    ) -> GetInstanceManagerStateReply:
        """
        Gets the instance manager state.

        Args:
            request: The request to get the instance manager state.

        Returns:
            The reply to the request.
        """
        reply = GetInstanceManagerStateReply()
        instances, version = self._instance_storage.get_instances()
        reply.state.instances.extend(instances.values())
        reply.state.version = version
        reply.status.code = StatusCode.OK

        return reply

    #########################################
    # Private methods
    #########################################

    @staticmethod
    def _get_update_im_state_reply(
        status_code: StatusCode, version: int, error_message: str = ""
    ) -> UpdateInstanceManagerStateReply:
        """
        Returns a UpdateInstanceManagerStateReply with the given status code and
        version.

        Args:
            status_code: The status code.
            version: The version.
            error_message: The error message if any.

        Returns:
            The reply.
        """
        reply = UpdateInstanceManagerStateReply()
        reply.status.code = status_code
        reply.version = version
        if error_message:
            reply.status.message = error_message
        return reply

    @staticmethod
    def _apply_update(instance: Instance, update: InstanceUpdateEvent):
        """
        Apply status specific update to the instance.

        Args:
            instance: The instance to update.
            update: The update to apply.
        """
        if update.new_instance_status == Instance.ALLOCATED:
            assert (
                update.cloud_instance_id
            ), "ALLOCATED update must have cloud_instance_id"
            assert update.node_kind in [
                NodeKind.WORKER,
                NodeKind.HEAD,
            ], "ALLOCATED update must have node_kind as WORKER or HEAD"
            assert update.instance_type, "ALLOCATED update must have instance_type"
            # NOTE(review): this duplicates the cloud_instance_id assertion
            # above; harmless, but one of the two could be removed.
            assert (
                update.cloud_instance_id
            ), "ALLOCATED update must have cloud_instance_id"
            instance.cloud_instance_id = update.cloud_instance_id
            instance.node_kind = update.node_kind
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.RAY_RUNNING:
            assert update.ray_node_id, "RAY_RUNNING update must have ray_node_id"
            instance.node_id = update.ray_node_id
        elif update.new_instance_status == Instance.REQUESTED:
            assert (
                update.launch_request_id
            ), "REQUESTED update must have launch_request_id"
            assert update.instance_type, "REQUESTED update must have instance_type"
            instance.launch_request_id = update.launch_request_id
            instance.instance_type = update.instance_type
        elif update.new_instance_status == Instance.TERMINATING:
            # TERMINATING only validates; the cloud instance id is already
            # set on the instance (or by _create_instance for leaked nodes).
            assert (
                update.cloud_instance_id
            ), "TERMINATING update must have cloud instance id"

    @staticmethod
    def _create_instance(update: InstanceUpdateEvent) -> Instance:
        """
        Create a new instance from the given update.
        """

        assert update.upsert, "upsert must be true for creating new instance."

        assert update.new_instance_status in [
            # For unmanaged instance not initialized by InstanceManager,
            # e.g. head node
            Instance.ALLOCATED,
            # For new instance being queued to launch.
            Instance.QUEUED,
            # For leaked cloud instance that needs to be terminated.
            Instance.TERMINATING,
        ], (
            "Invalid status for new instance, must be one of "
            "[ALLOCATED, QUEUED, TERMINATING]"
        )

        # Create a new instance first for common fields.
        instance = InstanceUtil.new_instance(
            instance_id=update.instance_id,
            instance_type=update.instance_type,
            status=update.new_instance_status,
            details=update.details,
        )

        # Apply the status specific updates.
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        InstanceManager._apply_update(instance, update)
        return instance

    @staticmethod
    def _update_instance(instance: Instance, update: InstanceUpdateEvent) -> Instance:
        """
        Update the instance with the given update.

        Args:
            instance: The instance to update.
            update: The update to apply.

        Returns:
            The updated instance.
        """
        logger.info(InstanceUtil.get_log_str_for_update(instance, update))
        # set_status validates the transition against the allowed status
        # graph; an invalid transition is a programming error here.
        assert InstanceUtil.set_status(instance, update.new_instance_status), (
            "Invalid status transition from "
            f"{Instance.InstanceStatus.Name(instance.status)} to "
            f"{Instance.InstanceStatus.Name(update.new_instance_status)}"
        )
        InstanceManager._apply_update(instance, update)

        return instance
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/instance_storage.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ from typing import Dict, List, Optional, Set, Tuple
4
+
5
+ from ray.autoscaler.v2.instance_manager.storage import Storage, StoreStatus
6
+ from ray.core.generated.instance_manager_pb2 import Instance
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class InstanceStorage:
    """Instance storage stores the states of instances in the storage.

    Instances are serialized Instance protos keyed by instance_id in a
    per-cluster table; the underlying Storage supplies versioning used for
    optimistic concurrency control.
    """

    def __init__(
        self,
        cluster_id: str,
        storage: Storage,
    ) -> None:
        self._storage = storage
        self._cluster_id = cluster_id
        # One table per cluster so multiple clusters can share a backend.
        self._table_name = f"instance_table@{cluster_id}"

    def batch_upsert_instances(
        self,
        updates: List[Instance],
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert instances into the storage. If the instance already exists,
        it will be updated. Otherwise, it will be inserted. If the
        expected_storage_version is specified, the update will fail if the
        current storage version does not match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            updates: A list of instances to be upserted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        mutations = {}
        version = self._storage.get_version()
        # handle version mismatch
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        for instance in updates:
            # Deep-copy so the caller's proto is not mutated.
            instance = copy.deepcopy(instance)
            # the instance version is set to 0, it will be
            # populated by the storage entry's version on read
            instance.version = 0
            mutations[instance.instance_id] = instance.SerializeToString()

        result, version = self._storage.batch_update(
            self._table_name, mutations, {}, expected_storage_version
        )

        return StoreStatus(result, version)

    def upsert_instance(
        self,
        instance: Instance,
        expected_instance_version: Optional[int] = None,
        expected_storage_verison: Optional[int] = None,
    ) -> StoreStatus:
        """Upsert an instance in the storage.
        If the expected_instance_version is specified, the update will fail
        if the current instance version does not match the expected version.
        Similarly, if the expected_storage_version is
        specified, the update will fail if the current storage version does not
        match the expected version.

        Note the version of the upserted instances will be set to the current
        storage version.

        Args:
            instance: The instance to be updated.
            expected_instance_version: The expected instance version.
            expected_storage_verison: The expected storage version.
                (NOTE: parameter name carries a historical typo; kept for
                backward compatibility with existing callers.)

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        # Deep-copy so the caller's proto is not mutated.
        instance = copy.deepcopy(instance)
        # the instance version is set to 0, it will be
        # populated by the storage entry's version on read
        instance.version = 0
        result, version = self._storage.update(
            self._table_name,
            key=instance.instance_id,
            value=instance.SerializeToString(),
            expected_entry_version=expected_instance_version,
            expected_storage_version=expected_storage_verison,
            insert_only=False,
        )

        return StoreStatus(result, version)

    def get_instances(
        self,
        instance_ids: Optional[List[str]] = None,
        status_filter: Optional[Set[int]] = None,
    ) -> Tuple[Dict[str, Instance], int]:
        """Get instances from the storage.

        Args:
            instance_ids: A list of instance ids to be retrieved. If empty, all
                instances will be retrieved.
            status_filter: Only instances with the specified status will be returned.

        Returns:
            Tuple[Dict[str, Instance], int]: A tuple of (instances, version).
            The instances is a dictionary of (instance_id, instance) pairs.
        """
        instance_ids = instance_ids or []
        status_filter = status_filter or set()
        pairs, version = self._storage.get(self._table_name, instance_ids)
        instances = {}
        for instance_id, (instance_data, entry_version) in pairs.items():
            instance = Instance()
            instance.ParseFromString(instance_data)
            # Surface the storage entry version on the deserialized proto.
            instance.version = entry_version
            if status_filter and instance.status not in status_filter:
                continue
            instances[instance_id] = instance
        return instances, version

    def batch_delete_instances(
        self, instance_ids: List[str], expected_storage_version: Optional[int] = None
    ) -> StoreStatus:
        """Delete instances from the storage. If the expected_storage_version
        is specified, the update will fail if the current storage version does
        not match the expected version.

        Args:
            instance_ids: The ids of the instances to be deleted.
            expected_storage_version: The expected storage version.

        Returns:
            StoreStatus: A tuple of (success, storage_version).
        """
        version = self._storage.get_version()
        if expected_storage_version and expected_storage_version != version:
            return StoreStatus(False, version)

        result = self._storage.batch_update(
            self._table_name, {}, instance_ids, expected_storage_version
        )
        return result
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/node_provider.py ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ import time
4
+ from abc import ABC, abstractmethod
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from dataclasses import dataclass
7
+ from queue import Queue
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from ray.autoscaler._private.constants import (
11
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
12
+ AUTOSCALER_MAX_LAUNCH_BATCH,
13
+ )
14
+ from ray.autoscaler._private.util import hash_launch_conf
15
+ from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
16
+ from ray.autoscaler.tags import (
17
+ NODE_KIND_HEAD,
18
+ NODE_KIND_UNMANAGED,
19
+ NODE_KIND_WORKER,
20
+ STATUS_UNINITIALIZED,
21
+ TAG_RAY_LAUNCH_CONFIG,
22
+ TAG_RAY_LAUNCH_REQUEST,
23
+ TAG_RAY_NODE_KIND,
24
+ TAG_RAY_NODE_NAME,
25
+ TAG_RAY_NODE_STATUS,
26
+ TAG_RAY_USER_NODE_TYPE,
27
+ )
28
+ from ray.autoscaler.v2.instance_manager.config import IConfigReader
29
+ from ray.autoscaler.v2.schema import NodeType
30
+ from ray.core.generated.instance_manager_pb2 import NodeKind
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
# Type Alias. This is a **unique identifier** for a cloud instance in the cluster.
# The provider should guarantee that this id is unique across the cluster,
# such that:
#   - When a cloud instance is created and running, no other cloud instance in the
#     cluster has the same id.
#   - When a cloud instance is terminated, no other cloud instance in the cluster will
#     be assigned the same id later.
CloudInstanceId = str


@dataclass
class CloudInstance:
    """
    A class that represents a cloud instance in the cluster, with necessary metadata
    of the cloud instance.
    """

    # The cloud instance id.
    cloud_instance_id: CloudInstanceId
    # The node type of the cloud instance.
    node_type: NodeType
    # The node kind, i.e head or worker.
    node_kind: NodeKind
    # If the cloud instance is already running.
    is_running: bool
    # Update request id from which the cloud instance is launched.
    # This could be None if the cloud instance couldn't be associated with requests
    # by the cloud provider: e.g. cloud provider doesn't support per-instance
    # extra metadata.
    # This is fine for now since the reconciler should be able to know how
    # to handle cloud instances w/o request ids.
    # TODO: make this a required field.
    request_id: Optional[str] = None
67
+
68
+
69
class CloudInstanceProviderError(Exception):
    """Base class for errors raised by a cloud instance provider.

    Records when the underlying failure happened so that consumers polling
    errors can order them in time.
    """

    # When the error occurred, in nanoseconds.
    timestamp_ns: int

    def __init__(self, msg, timestamp_ns) -> None:
        self.timestamp_ns = timestamp_ns
        super().__init__(msg)
81
+
82
+
83
class LaunchNodeError(CloudInstanceProviderError):
    """Raised/reported when launching a batch of nodes of one type fails."""

    # The node type that failed to launch.
    node_type: NodeType
    # Number of nodes that failed to launch.
    count: int
    # A unique id that identifies from which update request the error originates.
    request_id: str

    def __init__(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        msg = (
            f"Failed to launch {count} nodes of type {node_type} with "
            f"request id {request_id}: {details}"
        )
        super().__init__(msg, timestamp_ns=timestamp_ns)
        self.node_type = node_type
        self.count = count
        self.request_id = request_id
        if cause:
            # Chain the originating exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        # __cause__ is None when no cause was provided; repr then ends
        # with "None".
        return (
            f"LaunchNodeError(node_type={self.node_type}, count={self.count}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
116
+
117
+
118
class TerminateNodeError(CloudInstanceProviderError):
    """Raised/reported when terminating a cloud instance fails."""

    # The cloud instance id of the node that failed to terminate.
    cloud_instance_id: CloudInstanceId
    # A unique id that identifies from which update request the error originates.
    request_id: str

    def __init__(
        self,
        cloud_instance_id: CloudInstanceId,
        request_id: str,
        timestamp_ns: int,
        details: str = "",
        cause: Optional[Exception] = None,
    ) -> None:
        msg = (
            f"Failed to terminate node {cloud_instance_id} with "
            f"request id {request_id}: {details}"
        )
        super().__init__(msg, timestamp_ns=timestamp_ns)
        self.cloud_instance_id = cloud_instance_id
        self.request_id = request_id
        if cause:
            # Chain the originating exception for debuggability.
            self.__cause__ = cause

    def __repr__(self) -> str:
        # __cause__ is None when no cause was provided; repr then ends
        # with "None".
        return (
            f"TerminateNodeError(cloud_instance_id={self.cloud_instance_id}, "
            f"request_id={self.request_id}): {self.__cause__}"
        )
147
+
148
+
149
class ICloudInstanceProvider(ABC):
    """
    The interface for a cloud instance provider.

    This interface is a minimal interface that should be implemented by the
    various cloud instance providers (e.g. AWS, etc).

    The cloud instance provider is responsible for managing the cloud instances in the
    cluster. It provides the following main functionalities:
        - Launch new cloud instances.
        - Terminate existing running instances.
        - Get the non-terminated cloud instances in the cluster.
        - Poll the errors that happened for the updates to the cloud instance provider.

    Below properties of the cloud instance provider are assumed with this interface:

    1. Eventually consistent
    The cloud instance provider is expected to be eventually consistent with the
    cluster state. For example, when a cloud instance is requested to be terminated
    or launched, the provider may not immediately reflect the change in its state.
    However, the provider is expected to eventually reflect the change in its state.

    2. Asynchronous
    The provider could also be asynchronous, where the termination/launch
    request may not immediately return the result of the request.

    3. Unique cloud instance ids
    Cloud instance ids are expected to be unique across the cluster.

    4. Idempotent updates
    For the update APIs (e.g. ensure_min_nodes, terminate), the provider may use the
    request ids to provide idempotency.

    Usage:
        ```
        provider: ICloudInstanceProvider = ...

        # Update the cluster with a desired shape.
        provider.launch(
            shape={
                "worker_nodes": 10,
                "ray_head": 1,
            },
            request_id="1",
        )

        # Get the non-terminated nodes of the cloud instance provider.
        running = provider.get_non_terminated()

        # Poll the errors
        errors = provider.poll_errors()

        # Terminate nodes.
        provider.terminate(
            ids=["cloud_instance_id_1", "cloud_instance_id_2"],
            request_id="2",
        )

        # Process the state of the provider.
        ...
        ```
    """

    @abstractmethod
    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Get the non-terminated cloud instances in the cluster.

        Returns:
            A dictionary of the non-terminated cloud instances in the cluster.
            The key is the cloud instance id, and the value is the cloud instance.
        """
        pass

    @abstractmethod
    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """
        Terminate the cloud instances asynchronously.

        This method is expected to be idempotent, i.e. if the same request id is used
        to terminate the same cloud instances, this should be a no-op if
        the cloud instances are already terminated or being terminated.

        Args:
            ids: the cloud instance ids to terminate.
            request_id: a unique id that identifies the request.
        """
        pass

    @abstractmethod
    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """Launch the cloud instances asynchronously.

        Args:
            shape: A map from node type to number of nodes to launch.
            request_id: a unique id that identifies the update request.
        """
        pass

    @abstractmethod
    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """
        Poll the errors that happened since the last poll.

        This method would also clear the errors that happened since the last poll.

        Returns:
            The errors that happened since the last poll.
        """
        pass
262
+
263
+
264
@dataclass(frozen=True)
class CloudInstanceLaunchRequest:
    """
    The arguments to launch a node.

    Frozen so requests can be safely shared across threads and used in sets.
    """

    # The node type to launch.
    node_type: NodeType
    # Number of nodes to launch.
    count: int
    # A unique id that identifies the request.
    request_id: str
276
+
277
+
278
@dataclass(frozen=True)
class CloudInstanceTerminateRequest:
    """
    The arguments to terminate a node.

    Frozen so requests can be safely shared across threads and used in sets.
    """

    # The cloud instance id of the node to terminate.
    cloud_instance_id: CloudInstanceId
    # A unique id that identifies the request.
    request_id: str
288
+
289
+
290
class NodeProviderAdapter(ICloudInstanceProvider):
    """
    Wraps a NodeProviderV1 to a ICloudInstanceProvider.

    TODO(rickyx):
    The current adapter right now consists of two sets of APIs:
    - v1: the old APIs that are used by the autoscaler, where
      we forward the calls to the NodeProviderV1.
    - v2: the new APIs that are used by the autoscaler v2, this is
      defined in the ICloudInstanceProvider interface.

    We should eventually remove the v1 APIs and only use the v2 APIs.
    It's currently left as a TODO since changing the v1 APIs would
    requires a lot of changes in the cluster launcher codebase.
    """

    def __init__(
        self,
        v1_provider: NodeProviderV1,
        config_reader: IConfigReader,
        max_launch_batch_per_type: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
    ) -> None:
        """
        Args:
            v1_provider: The v1 node provider to wrap.
            config_reader: The config reader to read the autoscaling config.
            max_launch_batch_per_type: The maximum number of nodes to launch per
                node type in a single batch.
            max_concurrent_launches: The maximum number of concurrent launches.
        """

        super().__init__()
        self._v1_provider = v1_provider
        self._config_reader = config_reader
        # Executor to async launching and terminating nodes.
        # Single worker so launch/terminate submissions are serialized in
        # the order they were received.
        self._main_executor = ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="ray::NodeProviderAdapter"
        )

        # v1 legacy rate limiting on the node provider launch calls.
        self._max_launch_batch_per_type = max_launch_batch_per_type
        # Cap the worker count so at most max_concurrent_launches nodes can
        # be launching at once across all batches.
        max_batches = math.ceil(
            max_concurrent_launches / float(max_launch_batch_per_type)
        )
        self._node_launcher_executors = ThreadPoolExecutor(
            max_workers=max_batches,
            thread_name_prefix="ray::NodeLauncherPool",
        )

        # Queue to retrieve new errors occur in the multi-thread executors
        # temporarily. Queue is thread-safe, so worker threads can publish
        # errors while poll_errors() drains them.
        self._errors_queue = Queue()

    def get_non_terminated(self) -> Dict[CloudInstanceId, CloudInstance]:
        """Return non-terminated, Ray-managed cloud instances keyed by id.

        Unmanaged nodes (no Ray node-kind tag) are filtered out.
        """
        nodes = {}

        cloud_instance_ids = self._v1_non_terminated_nodes({})
        # Filter out nodes that are not running.
        # This is efficient since the provider is expected to cache the
        # running status of the nodes.
        for cloud_instance_id in cloud_instance_ids:
            node_tags = self._v1_node_tags(cloud_instance_id)
            node_kind_tag = node_tags.get(TAG_RAY_NODE_KIND, NODE_KIND_UNMANAGED)
            if node_kind_tag == NODE_KIND_UNMANAGED:
                # Filter out unmanaged nodes.
                continue
            elif node_kind_tag == NODE_KIND_WORKER:
                node_kind = NodeKind.WORKER
            elif node_kind_tag == NODE_KIND_HEAD:
                node_kind = NodeKind.HEAD
            else:
                raise ValueError(f"Invalid node kind: {node_kind_tag}")

            nodes[cloud_instance_id] = CloudInstance(
                cloud_instance_id=cloud_instance_id,
                node_type=node_tags.get(TAG_RAY_USER_NODE_TYPE, ""),
                is_running=self._v1_is_running(cloud_instance_id),
                request_id=node_tags.get(TAG_RAY_LAUNCH_REQUEST, ""),
                node_kind=node_kind,
            )

        return nodes

    def poll_errors(self) -> List[CloudInstanceProviderError]:
        """Drain and return errors accumulated since the last poll."""
        errors = []
        while not self._errors_queue.empty():
            errors.append(self._errors_queue.get_nowait())
        return errors

    def launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        # Asynchronous: the actual work happens on the main executor.
        self._main_executor.submit(self._do_launch, shape, request_id)

    def terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        # Asynchronous: the actual work happens on the main executor.
        self._main_executor.submit(self._do_terminate, ids, request_id)

    ###########################################
    # Private APIs
    ###########################################

    def _do_launch(
        self,
        shape: Dict[NodeType, int],
        request_id: str,
    ) -> None:
        """
        Launch the cloud instances by calling into the v1 base node provider.

        Args:
            shape: The requested to launch node type and number of nodes.
            request_id: The request id that identifies the request.
        """
        for node_type, count in shape.items():
            # Keep submitting the launch requests to the launch pool in batches.
            while count > 0:
                to_launch = min(count, self._max_launch_batch_per_type)
                self._node_launcher_executors.submit(
                    self._launch_nodes_by_type,
                    node_type,
                    to_launch,
                    request_id,
                )
                count -= to_launch

    def _do_terminate(self, ids: List[CloudInstanceId], request_id: str) -> None:
        """
        Terminate the cloud instances by calling into the v1 base node provider.

        If errors happen during the termination, the errors will be put into the
        errors queue.

        Args:
            ids: The cloud instance ids to terminate.
            request_id: The request id that identifies the request.
        """

        try:
            self._v1_terminate_nodes(ids)
        except Exception as e:
            # A batch failure is reported per instance id so consumers can
            # reconcile each instance independently.
            for id in ids:
                error = TerminateNodeError(id, request_id, int(time.time_ns()))
                error.__cause__ = e
                self._errors_queue.put(error)

    def _launch_nodes_by_type(
        self,
        node_type: NodeType,
        count: int,
        request_id: str,
    ) -> None:
        """
        Launch nodes of the given node type.

        Args:
            node_type: The node type to launch.
            count: Number of nodes to launch.
            request_id: A unique id that identifies the request.

        Raises:
            ValueError: If the node type is invalid.
            LaunchNodeError: If the launch failed and raised by the underlying provider.
        """
        # Check node type is valid.
        try:
            config = self._config_reader.get_cached_autoscaling_config()
            launch_config = config.get_cloud_node_config(node_type)
            resources = config.get_node_resources(node_type)
            labels = config.get_node_labels(node_type)

            # This is to be compatible with the v1 node launcher.
            # See more in https://github.com/ray-project/ray/blob/6f5a189bc463e52c51a70f8aea41fb2950b443e8/python/ray/autoscaler/_private/node_launcher.py#L78-L85 # noqa
            # TODO: this should be synced with what's stored in the IM, it should
            # probably be made as a metadata field in the cloud instance. This is
            # another incompatibility with KubeRay.
            launch_hash = hash_launch_conf(launch_config, config.get_config("auth", {}))
            node_tags = {
                TAG_RAY_NODE_NAME: "ray-{}-worker".format(
                    config.get_config("cluster_name", "")
                ),
                TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
                TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
                TAG_RAY_LAUNCH_CONFIG: launch_hash,
                TAG_RAY_LAUNCH_REQUEST: request_id,
                TAG_RAY_USER_NODE_TYPE: node_type,
            }

            logger.info("Launching {} nodes of type {}.".format(count, node_type))
            self._v1_provider.create_node_with_resources_and_labels(
                launch_config, node_tags, count, resources, labels
            )
            logger.info("Launched {} nodes of type {}.".format(count, node_type))
        except Exception as e:
            # Any failure (bad node type or provider error) is surfaced via
            # the errors queue rather than propagated to the executor.
            error = LaunchNodeError(node_type, count, request_id, int(time.time_ns()))
            error.__cause__ = e
            self._errors_queue.put(error)

    ###########################################
    # V1 Legacy APIs
    ###########################################
    """
    Below are the necessary legacy APIs from the V1 node provider.
    These are needed as of now to provide the needed features
    for V2 node provider.
    The goal is to eventually remove these APIs and only use the
    V2 APIs by modifying the individual node provider to inherit
    from ICloudInstanceProvider.
    """

    def _v1_terminate_nodes(
        self, ids: List[CloudInstanceId]
    ) -> Optional[Dict[str, Any]]:
        return self._v1_provider.terminate_nodes(ids)

    def _v1_non_terminated_nodes(
        self, tag_filters: Dict[str, str]
    ) -> List[CloudInstanceId]:
        return self._v1_provider.non_terminated_nodes(tag_filters)

    def _v1_is_running(self, node_id: CloudInstanceId) -> bool:
        return self._v1_provider.is_running(node_id)

    def _v1_post_process(self) -> None:
        self._v1_provider.post_process()

    def _v1_node_tags(self, node_id: CloudInstanceId) -> Dict[str, str]:
        return self._v1_provider.node_tags(node_id)

    def _v1_safe_to_scale(self) -> bool:
        return self._v1_provider.safe_to_scale()
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/ray_installer.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ import subprocess
4
+
5
+ from ray.autoscaler._private.updater import NodeUpdater
6
+ from ray.autoscaler._private.util import with_envs, with_head_node_ip
7
+ from ray.autoscaler.node_provider import NodeProvider as NodeProviderV1
8
+ from ray.autoscaler.v2.instance_manager.config import AutoscalingConfig
9
+ from ray.core.generated.instance_manager_pb2 import Instance
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @dataclasses.dataclass(frozen=True)
15
+ class RayInstallError:
16
+ # Instance manager's instance id.
17
+ im_instance_id: str
18
+ # Error details.
19
+ details: str
20
+
21
+
22
+ class RayInstaller(object):
23
+ """
24
+ RayInstaller is responsible for installing ray on the target instance.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ provider: NodeProviderV1,
30
+ config: AutoscalingConfig,
31
+ process_runner=subprocess,
32
+ ) -> None:
33
+ self._provider = provider
34
+ self._config = config
35
+ self._process_runner = process_runner
36
+
37
+ def install_ray(self, instance: Instance, head_node_ip: str) -> bool:
38
+ """
39
+ Install ray on the target instance synchronously.
40
+ TODO:(rickyx): This runs in another thread, and errors are silently
41
+ ignored. We should propagate the error to the main thread.
42
+ """
43
+ setup_commands = self._config.get_worker_setup_commands(instance.instance_type)
44
+ ray_start_commands = self._config.get_worker_start_ray_commands()
45
+ docker_config = self._config.get_docker_config(instance.instance_type)
46
+
47
+ logger.info(
48
+ f"Creating new (spawn_updater) updater thread for node"
49
+ f" {instance.cloud_instance_id}."
50
+ )
51
+ provider_instance_type_name = self._config.get_provider_instance_type(
52
+ instance.instance_type
53
+ )
54
+ updater = NodeUpdater(
55
+ node_id=instance.instance_id,
56
+ provider_config=self._config.get_config("provider"),
57
+ provider=self._provider,
58
+ auth_config=self._config.get_config("auth"),
59
+ cluster_name=self._config.get_config("cluster_name"),
60
+ file_mounts=self._config.get_config("file_mounts"),
61
+ initialization_commands=with_head_node_ip(
62
+ self._config.get_initialization_commands(instance.instance_type),
63
+ head_node_ip,
64
+ ),
65
+ setup_commands=with_head_node_ip(setup_commands, head_node_ip),
66
+ # This will prepend envs to the begin of the ray start commands, e.g.
67
+ # `RAY_HEAD_IP=<head_node_ip> \
68
+ # RAY_CLOUD_INSTANCE_ID=<instance_id> \
69
+ # ray start --head ...`
70
+ # See src/ray/common/constants.h for ENV name definitions.
71
+ ray_start_commands=with_envs(
72
+ ray_start_commands,
73
+ {
74
+ "RAY_HEAD_IP": head_node_ip,
75
+ "RAY_CLOUD_INSTANCE_ID": instance.instance_id,
76
+ "RAY_NODE_TYPE_NAME": instance.instance_type,
77
+ "RAY_CLOUD_INSTANCE_TYPE_NAME": provider_instance_type_name,
78
+ },
79
+ ),
80
+ runtime_hash=self._config.runtime_hash,
81
+ file_mounts_contents_hash=self._config.file_mounts_contents_hash,
82
+ is_head_node=False,
83
+ cluster_synced_files=self._config.get_config("cluster_synced_files"),
84
+ rsync_options={
85
+ "rsync_exclude": self._config.get_config("rsync_exclude"),
86
+ "rsync_filter": self._config.get_config("rsync_filter"),
87
+ },
88
+ use_internal_ip=True,
89
+ docker_config=docker_config,
90
+ node_resources=self._config.get_node_resources(instance.instance_type),
91
+ node_labels=self._config.get_node_labels(instance.instance_type),
92
+ process_runner=self._process_runner,
93
+ )
94
+ try:
95
+ updater.run()
96
+ except Exception:
97
+ # Errors has already been handled.
98
+ return False
99
+ return True
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py ADDED
@@ -0,0 +1,1565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ import time
4
+ import uuid
5
+ from collections import defaultdict
6
+ from typing import Dict, List, Optional, Set, Tuple
7
+
8
+ from ray._private.utils import binary_to_hex
9
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
10
+ from ray.autoscaler.v2.instance_manager.config import (
11
+ AutoscalingConfig,
12
+ InstanceReconcileConfig,
13
+ Provider,
14
+ )
15
+ from ray.autoscaler.v2.instance_manager.instance_manager import InstanceManager
16
+ from ray.autoscaler.v2.instance_manager.node_provider import (
17
+ CloudInstance,
18
+ CloudInstanceId,
19
+ CloudInstanceProviderError,
20
+ ICloudInstanceProvider,
21
+ LaunchNodeError,
22
+ TerminateNodeError,
23
+ )
24
+ from ray.autoscaler.v2.instance_manager.ray_installer import RayInstallError
25
+ from ray.autoscaler.v2.instance_manager.subscribers.ray_stopper import RayStopError
26
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
27
+ from ray.autoscaler.v2.scheduler import IResourceScheduler, SchedulingRequest
28
+ from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
29
+ from ray.autoscaler.v2.sdk import is_head_node
30
+ from ray.core.generated.autoscaler_pb2 import (
31
+ AutoscalingState,
32
+ ClusterResourceState,
33
+ FailedInstanceRequest,
34
+ NodeState,
35
+ NodeStatus,
36
+ PendingInstance,
37
+ PendingInstanceRequest,
38
+ )
39
+ from ray.core.generated.instance_manager_pb2 import GetInstanceManagerStateRequest
40
+ from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
41
+ from ray.core.generated.instance_manager_pb2 import (
42
+ InstanceUpdateEvent as IMInstanceUpdateEvent,
43
+ )
44
+ from ray.core.generated.instance_manager_pb2 import (
45
+ NodeKind,
46
+ StatusCode,
47
+ UpdateInstanceManagerStateRequest,
48
+ )
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ class Reconciler:
54
+ """
55
+ A singleton class that reconciles the instance states of the instance manager
56
+ for autoscaler.
57
+
58
+ """
59
+
60
+ @staticmethod
61
+ def reconcile(
62
+ instance_manager: InstanceManager,
63
+ scheduler: IResourceScheduler,
64
+ cloud_provider: ICloudInstanceProvider,
65
+ ray_cluster_resource_state: ClusterResourceState,
66
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
67
+ autoscaling_config: AutoscalingConfig,
68
+ cloud_provider_errors: Optional[List[CloudInstanceProviderError]] = None,
69
+ ray_install_errors: Optional[List[RayInstallError]] = None,
70
+ ray_stop_errors: Optional[List[RayStopError]] = None,
71
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
72
+ _logger: Optional[logging.Logger] = None,
73
+ ) -> AutoscalingState:
74
+ """
75
+ The reconcile method computes InstanceUpdateEvents for the instance manager
76
+ by:
77
+
78
+ 1. Reconciling the instance manager's instances with external states like
79
+ the cloud provider's, the ray cluster's states, the ray installer's results.
80
+ It performs "passive" status transitions for the instances (where the status
81
+ transition should only be reflecting the external states of the cloud provider
82
+ and the ray cluster, and should not be actively changing them)
83
+
84
+ 2. Stepping the instances to the active states by computing instance status
85
+ transitions that are needed and updating the instance manager's state.
86
+ These transitions should be "active" where the transitions have side effects
87
+ (through InstanceStatusSubscriber) to the cloud provider and the ray cluster.
88
+
89
+ Args:
90
+ instance_manager: The instance manager to reconcile.
91
+ ray_cluster_resource_state: The ray cluster's resource state.
92
+ non_terminated_cloud_instances: The non-terminated cloud instances from
93
+ the cloud provider.
94
+ cloud_provider_errors: The errors from the cloud provider.
95
+ ray_install_errors: The errors from RayInstaller.
96
+ ray_stop_errors: The errors from RayStopper.
97
+ metrics_reporter: The metric reporter to report the autoscaler metrics.
98
+ _logger: The logger (for testing).
99
+
100
+ """
101
+ cloud_provider_errors = cloud_provider_errors or []
102
+ ray_install_errors = ray_install_errors or []
103
+ ray_stop_errors = ray_stop_errors or []
104
+
105
+ autoscaling_state = AutoscalingState()
106
+ autoscaling_state.last_seen_cluster_resource_state_version = (
107
+ ray_cluster_resource_state.cluster_resource_state_version
108
+ )
109
+ Reconciler._sync_from(
110
+ instance_manager=instance_manager,
111
+ ray_nodes=ray_cluster_resource_state.node_states,
112
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
113
+ cloud_provider_errors=cloud_provider_errors,
114
+ ray_install_errors=ray_install_errors,
115
+ ray_stop_errors=ray_stop_errors,
116
+ autoscaling_config=autoscaling_config,
117
+ )
118
+
119
+ Reconciler._step_next(
120
+ autoscaling_state=autoscaling_state,
121
+ instance_manager=instance_manager,
122
+ scheduler=scheduler,
123
+ cloud_provider=cloud_provider,
124
+ ray_cluster_resource_state=ray_cluster_resource_state,
125
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
126
+ autoscaling_config=autoscaling_config,
127
+ _logger=_logger,
128
+ )
129
+
130
+ Reconciler._report_metrics(
131
+ instance_manager=instance_manager,
132
+ autoscaling_config=autoscaling_config,
133
+ metrics_reporter=metrics_reporter,
134
+ )
135
+
136
+ return autoscaling_state
137
+
138
+ @staticmethod
139
+ def _sync_from(
140
+ instance_manager: InstanceManager,
141
+ ray_nodes: List[NodeState],
142
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
143
+ cloud_provider_errors: List[CloudInstanceProviderError],
144
+ ray_install_errors: List[RayInstallError],
145
+ ray_stop_errors: List[RayStopError],
146
+ autoscaling_config: AutoscalingConfig,
147
+ ):
148
+ """
149
+ Reconcile the instance states of the instance manager from external states like
150
+ the cloud provider's, the ray cluster's states, the ray installer's results,
151
+ etc.
152
+
153
+ For each instance, we try to figure out if we need to transition the instance
154
+ status to a new status, and if so, what the new status should be.
155
+
156
+ These transitions should be purely "passive", meaning they should only be
157
+ reflecting the external states of the cloud provider and the ray cluster,
158
+ and should not be actively changing the states of the cloud provider or the ray
159
+ cluster.
160
+
161
+ More specifically, we will reconcile status transitions for:
162
+ 1. QUEUED/REQUESTED -> ALLOCATED:
163
+ When a instance with launch request id (indicating a previous launch
164
+ request was made) could be assigned to an unassigned cloud instance
165
+ of the same instance type.
166
+ 2. REQUESTED -> ALLOCATION_FAILED:
167
+ When there's an error from the cloud provider for launch failure so
168
+ that the instance becomes ALLOCATION_FAILED.
169
+ 3. * -> RAY_RUNNING:
170
+ When a ray node on a cloud instance joins the ray cluster, we will
171
+ transition the instance to RAY_RUNNING.
172
+ 4. * -> TERMINATED:
173
+ When the cloud instance is already terminated, we will transition the
174
+ instance to TERMINATED.
175
+ 5. TERMINATING -> TERMINATION_FAILED:
176
+ When there's an error from the cloud provider for termination failure.
177
+ 6. * -> RAY_STOPPED:
178
+ When ray was stopped on the cloud instance, we will transition the
179
+ instance to RAY_STOPPED.
180
+ 7. * -> RAY_INSTALL_FAILED:
181
+ When there's an error from RayInstaller.
182
+ 8. RAY_STOP_REQUESTED -> RAY_RUNNING:
183
+ When requested to stop ray, but failed to stop/drain the ray node
184
+ (e.g. idle termination drain rejected by the node).
185
+
186
+ Args:
187
+ instance_manager: The instance manager to reconcile.
188
+ ray_nodes: The ray cluster's states of ray nodes.
189
+ non_terminated_cloud_instances: The non-terminated cloud instances from
190
+ the cloud provider.
191
+ cloud_provider_errors: The errors from the cloud provider.
192
+ ray_install_errors: The errors from RayInstaller.
193
+ ray_stop_errors: The errors from RayStopper.
194
+
195
+ """
196
+
197
+ # Handle 1 & 2 for cloud instance allocation.
198
+ Reconciler._handle_cloud_instance_allocation(
199
+ instance_manager,
200
+ non_terminated_cloud_instances,
201
+ cloud_provider_errors,
202
+ )
203
+ Reconciler._handle_cloud_instance_terminated(
204
+ instance_manager, non_terminated_cloud_instances
205
+ )
206
+
207
+ Reconciler._handle_cloud_instance_termination_errors(
208
+ instance_manager, cloud_provider_errors
209
+ )
210
+
211
+ Reconciler._handle_extra_cloud_instances(
212
+ instance_manager, non_terminated_cloud_instances, ray_nodes
213
+ )
214
+
215
+ Reconciler._handle_ray_status_transition(
216
+ instance_manager, ray_nodes, autoscaling_config
217
+ )
218
+
219
+ Reconciler._handle_ray_install_failed(instance_manager, ray_install_errors)
220
+
221
+ Reconciler._handle_ray_stop_failed(instance_manager, ray_stop_errors, ray_nodes)
222
+
223
+ @staticmethod
224
+ def _step_next(
225
+ autoscaling_state: AutoscalingState,
226
+ instance_manager: InstanceManager,
227
+ scheduler: IResourceScheduler,
228
+ cloud_provider: ICloudInstanceProvider,
229
+ ray_cluster_resource_state: ClusterResourceState,
230
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
231
+ autoscaling_config: AutoscalingConfig,
232
+ _logger: Optional[logging.Logger] = None,
233
+ ):
234
+ """
235
+ Step the reconciler to the next state by computing instance status transitions
236
+ that are needed and updating the instance manager's state.
237
+
238
+ Specifically, we will:
239
+ 1. Shut down leak cloud instances
240
+ Leaked cloud instances that are not managed by the instance manager.
241
+ 2. Terminating instances with ray stopped or ray install failure.
242
+ 3. Scale down the cluster:
243
+ (* -> RAY_STOP_REQUESTED/TERMINATING)
244
+ b. Extra cloud due to max nodes config.
245
+ c. Cloud instances with outdated configs.
246
+ 4. Scale up the cluster:
247
+ (new QUEUED)
248
+ Create new instances based on the IResourceScheduler's decision for
249
+ scaling up.
250
+ 5. Request cloud provider to launch new instances.
251
+ (QUEUED -> REQUESTED)
252
+ 6. Install ray
253
+ (ALLOCATED -> RAY_INSTALLING)
254
+ When ray could be installed and launched.
255
+ 7. Handle any stuck instances with timeouts.
256
+
257
+ Args:
258
+ instance_manager: The instance manager to reconcile.
259
+ scheduler: The resource scheduler to make scaling decisions.
260
+ ray_cluster_resource_state: The ray cluster's resource state.
261
+ non_terminated_cloud_instances: The non-terminated cloud instances from
262
+ the cloud provider.
263
+ autoscaling_config: The autoscaling config.
264
+ _logger: The logger (for testing).
265
+
266
+ """
267
+
268
+ Reconciler._handle_stuck_instances(
269
+ instance_manager=instance_manager,
270
+ reconcile_config=autoscaling_config.get_instance_reconcile_config(),
271
+ _logger=_logger or logger,
272
+ )
273
+
274
+ Reconciler._scale_cluster(
275
+ autoscaling_state=autoscaling_state,
276
+ instance_manager=instance_manager,
277
+ ray_state=ray_cluster_resource_state,
278
+ scheduler=scheduler,
279
+ autoscaling_config=autoscaling_config,
280
+ )
281
+
282
+ Reconciler._handle_instances_launch(
283
+ instance_manager=instance_manager, autoscaling_config=autoscaling_config
284
+ )
285
+
286
+ Reconciler._terminate_instances(instance_manager=instance_manager)
287
+ if not autoscaling_config.disable_node_updaters():
288
+ Reconciler._install_ray(
289
+ instance_manager=instance_manager,
290
+ non_terminated_cloud_instances=non_terminated_cloud_instances,
291
+ )
292
+
293
+ Reconciler._fill_autoscaling_state(
294
+ instance_manager=instance_manager, autoscaling_state=autoscaling_state
295
+ )
296
+
297
+ #######################################################
298
+ # Utility methods for reconciling instance states.
299
+ #######################################################
300
+
301
+ @staticmethod
302
+ def _handle_cloud_instance_allocation(
303
+ instance_manager: InstanceManager,
304
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
305
+ cloud_provider_errors: List[CloudInstanceProviderError],
306
+ ):
307
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
308
+ updates = {}
309
+
310
+ # Compute intermediate states.
311
+
312
+ instances_with_launch_requests: List[IMInstance] = []
313
+ for instance in im_instances:
314
+ if instance.status != IMInstance.REQUESTED:
315
+ continue
316
+
317
+ assert (
318
+ instance.launch_request_id
319
+ ), "Instance in REQUESTED status should have launch_request_id set."
320
+ instances_with_launch_requests.append(instance)
321
+
322
+ assigned_cloud_instance_ids: Set[CloudInstanceId] = {
323
+ instance.cloud_instance_id for instance in im_instances
324
+ }
325
+ launch_errors: Dict[str, LaunchNodeError] = {
326
+ error.request_id: error
327
+ for error in cloud_provider_errors
328
+ if isinstance(error, LaunchNodeError)
329
+ }
330
+ unassigned_cloud_instances_by_type: Dict[
331
+ str, List[CloudInstance]
332
+ ] = defaultdict(list)
333
+
334
+ for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items():
335
+ if cloud_instance_id not in assigned_cloud_instance_ids:
336
+ unassigned_cloud_instances_by_type[cloud_instance.node_type].append(
337
+ cloud_instance
338
+ )
339
+
340
+ # Sort the request instance by the increasing request time.
341
+ instances_with_launch_requests.sort(
342
+ key=lambda instance: InstanceUtil.get_status_transition_times_ns(
343
+ instance, IMInstance.REQUESTED
344
+ )
345
+ )
346
+
347
+ # For each instance, try to allocate or fail the allocation.
348
+ for instance in instances_with_launch_requests:
349
+ # Try allocate or fail with errors.
350
+ update_event = Reconciler._try_resolve_pending_allocation(
351
+ instance, unassigned_cloud_instances_by_type, launch_errors
352
+ )
353
+ if not update_event:
354
+ continue
355
+
356
+ updates[instance.instance_id] = update_event
357
+
358
+ # Update the instance manager for the events.
359
+ Reconciler._update_instance_manager(instance_manager, version, updates)
360
+
361
+ @staticmethod
362
+ def _try_resolve_pending_allocation(
363
+ im_instance: IMInstance,
364
+ unassigned_cloud_instances_by_type: Dict[str, List[CloudInstance]],
365
+ launch_errors: Dict[str, LaunchNodeError],
366
+ ) -> Optional[IMInstanceUpdateEvent]:
367
+ """
368
+ Allocate, or fail the cloud instance allocation for the instance.
369
+
370
+ Args:
371
+ im_instance: The instance to allocate or fail.
372
+ unassigned_cloud_instances_by_type: The unassigned cloud instances by type.
373
+ launch_errors: The launch errors from the cloud provider.
374
+
375
+ Returns:
376
+ Instance update to ALLOCATED: if there's a matching unassigned cloud
377
+ instance with the same type.
378
+ Instance update to ALLOCATION_FAILED: if the instance allocation failed
379
+ with errors.
380
+ None: if there's no update.
381
+
382
+ """
383
+ unassigned_cloud_instance = None
384
+
385
+ # Try to allocate an unassigned cloud instance.
386
+ # TODO(rickyx): We could also look at the launch request id
387
+ # on the cloud node and the im instance later once all node providers
388
+ # support request id. For now, we only look at the instance type.
389
+ if len(unassigned_cloud_instances_by_type.get(im_instance.instance_type, [])):
390
+ unassigned_cloud_instance = unassigned_cloud_instances_by_type[
391
+ im_instance.instance_type
392
+ ].pop()
393
+
394
+ if unassigned_cloud_instance:
395
+ return IMInstanceUpdateEvent(
396
+ instance_id=im_instance.instance_id,
397
+ new_instance_status=IMInstance.ALLOCATED,
398
+ cloud_instance_id=unassigned_cloud_instance.cloud_instance_id,
399
+ node_kind=unassigned_cloud_instance.node_kind,
400
+ instance_type=unassigned_cloud_instance.node_type,
401
+ details=(
402
+ "allocated unassigned cloud instance "
403
+ f"{unassigned_cloud_instance.cloud_instance_id}"
404
+ ),
405
+ )
406
+
407
+ # If there's a launch error, transition to ALLOCATION_FAILED.
408
+ launch_error = launch_errors.get(im_instance.launch_request_id)
409
+ if launch_error and launch_error.node_type == im_instance.instance_type:
410
+ return IMInstanceUpdateEvent(
411
+ instance_id=im_instance.instance_id,
412
+ new_instance_status=IMInstance.ALLOCATION_FAILED,
413
+ details=f"launch failed with {str(launch_error)}",
414
+ )
415
+ # No update.
416
+ return None
417
+
418
+ @staticmethod
419
+ def _handle_ray_stop_failed(
420
+ instance_manager: InstanceManager,
421
+ ray_stop_errors: List[RayStopError],
422
+ ray_nodes: List[NodeState],
423
+ ):
424
+ """
425
+ The instance requested to stop ray, but failed to stop/drain the ray node.
426
+ E.g. connection errors, idle termination drain rejected by the node.
427
+
428
+ We will transition the instance back to RAY_RUNNING.
429
+
430
+ Args:
431
+ instance_manager: The instance manager to reconcile.
432
+ ray_stop_errors: The errors from RayStopper.
433
+
434
+ """
435
+ instances, version = Reconciler._get_im_instances(instance_manager)
436
+ updates = {}
437
+
438
+ ray_stop_errors_by_instance_id = {
439
+ error.im_instance_id: error for error in ray_stop_errors
440
+ }
441
+
442
+ ray_nodes_by_ray_node_id = {binary_to_hex(n.node_id): n for n in ray_nodes}
443
+
444
+ ray_stop_requested_instances = {
445
+ instance.instance_id: instance
446
+ for instance in instances
447
+ if instance.status == IMInstance.RAY_STOP_REQUESTED
448
+ }
449
+
450
+ for instance_id, instance in ray_stop_requested_instances.items():
451
+ stop_error = ray_stop_errors_by_instance_id.get(instance_id)
452
+ if not stop_error:
453
+ continue
454
+
455
+ assert instance.node_id
456
+ ray_node = ray_nodes_by_ray_node_id.get(instance.node_id)
457
+ assert ray_node is not None and ray_node.status in [
458
+ NodeStatus.RUNNING,
459
+ NodeStatus.IDLE,
460
+ ], (
461
+ "There should be a running ray node for instance with ray stop "
462
+ "requested failed."
463
+ )
464
+
465
+ updates[instance_id] = IMInstanceUpdateEvent(
466
+ instance_id=instance_id,
467
+ new_instance_status=IMInstance.RAY_RUNNING,
468
+ details="failed to stop/drain ray",
469
+ ray_node_id=instance.node_id,
470
+ )
471
+
472
+ Reconciler._update_instance_manager(instance_manager, version, updates)
473
+
474
+ @staticmethod
475
+ def _handle_ray_install_failed(
476
+ instance_manager: InstanceManager, ray_install_errors: List[RayInstallError]
477
+ ):
478
+
479
+ instances, version = Reconciler._get_im_instances(instance_manager)
480
+ updates = {}
481
+
482
+ # Get all instances with RAY_INSTALLING status.
483
+ instances_with_ray_installing = {
484
+ instance.instance_id: instance
485
+ for instance in instances
486
+ if instance.status == IMInstance.RAY_INSTALLING
487
+ }
488
+
489
+ install_errors = {error.im_instance_id: error for error in ray_install_errors}
490
+
491
+ # For each instance with RAY_INSTALLING status, check if there's any
492
+ # install error.
493
+ for instance_id, instance in instances_with_ray_installing.items():
494
+ install_error = install_errors.get(instance_id)
495
+ if install_error:
496
+ updates[instance_id] = IMInstanceUpdateEvent(
497
+ instance_id=instance_id,
498
+ new_instance_status=IMInstance.RAY_INSTALL_FAILED,
499
+ details=(
500
+ f"failed to install ray with errors: {install_error.details}"
501
+ ),
502
+ )
503
+
504
+ # Update the instance manager for the events.
505
+ Reconciler._update_instance_manager(instance_manager, version, updates)
506
+
507
+ @staticmethod
508
+ def _handle_cloud_instance_terminated(
509
+ instance_manager: InstanceManager,
510
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
511
+ ):
512
+ """
513
+ For any IM (instance manager) instance with a cloud node id, if the mapped
514
+ cloud instance is no longer running, transition the instance to TERMINATED.
515
+
516
+ Args:
517
+ instance_manager: The instance manager to reconcile.
518
+ non_terminated_cloud_instances: The non-terminated cloud instances from
519
+ the cloud provider.
520
+ """
521
+ updates = {}
522
+ instances, version = Reconciler._get_im_instances(instance_manager)
523
+
524
+ non_terminated_instances_with_cloud_instance_assigned = {
525
+ instance.cloud_instance_id: instance
526
+ for instance in instances
527
+ if instance.cloud_instance_id and instance.status != IMInstance.TERMINATED
528
+ }
529
+
530
+ for (
531
+ cloud_instance_id,
532
+ instance,
533
+ ) in non_terminated_instances_with_cloud_instance_assigned.items():
534
+ if cloud_instance_id in non_terminated_cloud_instances.keys():
535
+ # The cloud instance is still running.
536
+ continue
537
+
538
+ # The cloud instance is terminated.
539
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
540
+ instance_id=instance.instance_id,
541
+ new_instance_status=IMInstance.TERMINATED,
542
+ details=f"cloud instance {cloud_instance_id} no longer found",
543
+ )
544
+
545
+ Reconciler._update_instance_manager(instance_manager, version, updates)
546
+
547
+ @staticmethod
548
+ def _handle_cloud_instance_termination_errors(
549
+ instance_manager: InstanceManager,
550
+ cloud_provider_errors: List[CloudInstanceProviderError],
551
+ ):
552
+ """
553
+ If any TERMINATING instances have termination errors, transition the instance to
554
+ TERMINATION_FAILED.
555
+
556
+ We will retry the termination for the TERMINATION_FAILED instances in the next
557
+ reconciler step.
558
+
559
+ Args:
560
+ instance_manager: The instance manager to reconcile.
561
+ cloud_provider_errors: The errors from the cloud provider.
562
+
563
+ """
564
+ instances, version = Reconciler._get_im_instances(instance_manager)
565
+ updates = {}
566
+
567
+ termination_errors = {
568
+ error.cloud_instance_id: error
569
+ for error in cloud_provider_errors
570
+ if isinstance(error, TerminateNodeError)
571
+ }
572
+
573
+ terminating_instances_by_cloud_instance_id = {
574
+ instance.cloud_instance_id: instance
575
+ for instance in instances
576
+ if instance.status == IMInstance.TERMINATING
577
+ }
578
+
579
+ for cloud_instance_id, failure in termination_errors.items():
580
+ instance = terminating_instances_by_cloud_instance_id.get(cloud_instance_id)
581
+ if not instance:
582
+ # The instance is no longer in TERMINATING status.
583
+ continue
584
+
585
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
586
+ instance_id=instance.instance_id,
587
+ new_instance_status=IMInstance.TERMINATION_FAILED,
588
+ details=f"termination failed: {str(failure)}",
589
+ )
590
+
591
+ Reconciler._update_instance_manager(instance_manager, version, updates)
592
+
593
+ @staticmethod
594
+ def _get_im_instances(
595
+ instance_manager: InstanceManager,
596
+ ) -> Tuple[List[IMInstance], int]:
597
+ reply = instance_manager.get_instance_manager_state(
598
+ request=GetInstanceManagerStateRequest()
599
+ )
600
+ assert reply.status.code == StatusCode.OK
601
+ im_state = reply.state
602
+ return im_state.instances, im_state.version
603
+
604
+ @staticmethod
605
+ def _update_instance_manager(
606
+ instance_manager: InstanceManager,
607
+ version: int,
608
+ updates: Dict[str, IMInstanceUpdateEvent],
609
+ ) -> None:
610
+ if not updates:
611
+ return
612
+
613
+ updates = list(updates.values()) or []
614
+
615
+ reply = instance_manager.update_instance_manager_state(
616
+ request=UpdateInstanceManagerStateRequest(
617
+ expected_version=version,
618
+ updates=updates,
619
+ )
620
+ )
621
+ # TODO: While it's possible that a version mismatch
622
+ # happens, or some other failures could happen. But given
623
+ # the current implementation:
624
+ # 1. There's only 1 writer (the reconciler) for updating the instance
625
+ # manager states, so there shouldn't be version mismatch.
626
+ # 2. Any failures in one reconciler step should be caught at a higher
627
+ # level and be retried in the next reconciler step. If the IM
628
+ # fails to be updated, we don't have sufficient info to handle it
629
+ # here.
630
+ assert (
631
+ reply.status.code == StatusCode.OK
632
+ ), f"Failed to update instance manager: {reply}"
633
+
634
    @staticmethod
    def _handle_ray_status_transition(
        instance_manager: InstanceManager,
        ray_nodes: List[NodeState],
        autoscaling_config: AutoscalingConfig,
    ):
        """
        Handle the ray status transition for the instance manager.

        If a new ray node running on the instance, transition it to RAY_RUNNING.
        If a ray node stopped, transition it to RAY_STOPPED.
        If a ray node is draining, transition it to RAY_STOPPING.

        Args:
            instance_manager: The instance manager to reconcile.
            ray_nodes: The ray cluster's states of ray nodes.
            autoscaling_config: The autoscaling config; used here to
                special-case the READ_ONLY provider, whose ray nodes carry
                no cloud instance id.
        """
        instances, version = Reconciler._get_im_instances(instance_manager)
        updates = {}

        # Index IM instances by their cloud instance id (instances without one
        # haven't been allocated yet and can't match a ray node).
        im_instances_by_cloud_instance_id = {
            i.cloud_instance_id: i for i in instances if i.cloud_instance_id
        }
        ray_nodes_by_cloud_instance_id = {}
        for n in ray_nodes:
            if n.instance_id:
                ray_nodes_by_cloud_instance_id[n.instance_id] = n
            else:
                if autoscaling_config.provider == Provider.READ_ONLY:
                    # We will use the node id as the cloud instance id for read-only
                    # provider.
                    ray_nodes_by_cloud_instance_id[binary_to_hex(n.node_id)] = n
                else:
                    # This should only happen to a ray node that's not managed by us.
                    logger.warning(
                        f"Ray node {binary_to_hex(n.node_id)} has no instance id. "
                        "This only happens to a ray node not managed by autoscaler. "
                        "If not, please file a bug at "
                        "https://github.com/ray-project/ray"
                    )

        for cloud_instance_id, ray_node in ray_nodes_by_cloud_instance_id.items():
            # By this point in the reconcile loop, every cloud instance (and
            # every ray node with a cloud instance id) should already have a
            # matching IM instance.
            assert cloud_instance_id in im_instances_by_cloud_instance_id, (
                f"Ray node {binary_to_hex(ray_node.node_id)} has no matching "
                f"instance with cloud instance id={cloud_instance_id}. We should "
                "not see a ray node with cloud instance id not found in IM since "
                "we have reconciled all cloud instances, and ray nodes by now."
            )

            im_instance = im_instances_by_cloud_instance_id[cloud_instance_id]
            reconciled_im_status = Reconciler._reconciled_im_status_from_ray_status(
                ray_node.status, im_instance.status
            )

            # Only emit an update when the reconciled status actually differs;
            # otherwise the instance already agrees with ray's view.
            if reconciled_im_status != im_instance.status:
                updates[im_instance.instance_id] = IMInstanceUpdateEvent(
                    instance_id=im_instance.instance_id,
                    new_instance_status=reconciled_im_status,
                    details=(
                        f"ray node {binary_to_hex(ray_node.node_id)} is "
                        f"{NodeStatus.Name(ray_node.status)}"
                    ),
                    ray_node_id=binary_to_hex(ray_node.node_id),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
700
+
701
+ @staticmethod
702
+ def _reconciled_im_status_from_ray_status(
703
+ ray_status: NodeStatus, cur_im_status: IMInstance.InstanceStatus
704
+ ) -> "IMInstance.InstanceStatus":
705
+ """
706
+ Reconcile the instance status from the ray node status.
707
+ Args:
708
+ ray_status: the current ray node status.
709
+ cur_im_status: the current IM instance status.
710
+ Returns:
711
+ The reconciled IM instance status
712
+
713
+ Raises:
714
+ ValueError: If the ray status is unknown.
715
+ """
716
+ reconciled_im_status = None
717
+ if ray_status in [NodeStatus.RUNNING, NodeStatus.IDLE]:
718
+ reconciled_im_status = IMInstance.RAY_RUNNING
719
+ elif ray_status == NodeStatus.DEAD:
720
+ reconciled_im_status = IMInstance.RAY_STOPPED
721
+ elif ray_status == NodeStatus.DRAINING:
722
+ reconciled_im_status = IMInstance.RAY_STOPPING
723
+ else:
724
+ raise ValueError(f"Unknown ray status: {ray_status}")
725
+
726
+ if (
727
+ cur_im_status == reconciled_im_status
728
+ or cur_im_status
729
+ in InstanceUtil.get_reachable_statuses(reconciled_im_status)
730
+ ):
731
+ # No need to reconcile if the instance is already in the reconciled status
732
+ # or has already transitioned beyond it.
733
+ return cur_im_status
734
+
735
+ return reconciled_im_status
736
+
737
+ @staticmethod
738
+ def _handle_instances_launch(
739
+ instance_manager: InstanceManager, autoscaling_config: AutoscalingConfig
740
+ ):
741
+
742
+ instances, version = Reconciler._get_im_instances(instance_manager)
743
+
744
+ queued_instances = []
745
+ requested_instances = []
746
+ allocated_instances = []
747
+
748
+ for instance in instances:
749
+ if instance.status == IMInstance.QUEUED:
750
+ queued_instances.append(instance)
751
+ elif instance.status == IMInstance.REQUESTED:
752
+ requested_instances.append(instance)
753
+ elif instance.cloud_instance_id:
754
+ allocated_instances.append(instance)
755
+
756
+ if not queued_instances:
757
+ # No QUEUED instances
758
+ return
759
+
760
+ to_launch = Reconciler._compute_to_launch(
761
+ queued_instances,
762
+ requested_instances,
763
+ allocated_instances,
764
+ autoscaling_config.get_upscaling_speed(),
765
+ autoscaling_config.get_max_concurrent_launches(),
766
+ )
767
+
768
+ # Transition the instances to REQUESTED for instance launcher to
769
+ # launch them.
770
+ updates = {}
771
+ new_launch_request_id = str(uuid.uuid4())
772
+ for instance_type, instances in to_launch.items():
773
+ for instance in instances:
774
+ # Reuse launch request id for any QUEUED instances that have been
775
+ # requested before due to retry.
776
+ launch_request_id = (
777
+ new_launch_request_id
778
+ if len(instance.launch_request_id) == 0
779
+ else instance.launch_request_id
780
+ )
781
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
782
+ instance_id=instance.instance_id,
783
+ new_instance_status=IMInstance.REQUESTED,
784
+ launch_request_id=launch_request_id,
785
+ instance_type=instance_type,
786
+ details=(
787
+ f"requested to launch {instance_type} with request id "
788
+ f"{launch_request_id}"
789
+ ),
790
+ )
791
+
792
+ Reconciler._update_instance_manager(instance_manager, version, updates)
793
+
794
+ @staticmethod
795
+ def _compute_to_launch(
796
+ queued_instances: List[IMInstance],
797
+ requested_instances: List[IMInstance],
798
+ allocated_instances: List[IMInstance],
799
+ upscaling_speed: float,
800
+ max_concurrent_launches: int,
801
+ ) -> Dict[NodeType, List[IMInstance]]:
802
+ def _group_by_type(instances):
803
+ instances_by_type = defaultdict(list)
804
+ for instance in instances:
805
+ instances_by_type[instance.instance_type].append(instance)
806
+ return instances_by_type
807
+
808
+ # Sort the instances by the time they were queued.
809
+ def _sort_by_earliest_queued(instance: IMInstance) -> List[int]:
810
+ queue_times = InstanceUtil.get_status_transition_times_ns(
811
+ instance, IMInstance.QUEUED
812
+ )
813
+ return sorted(queue_times)
814
+
815
+ queued_instances_by_type = _group_by_type(queued_instances)
816
+ requested_instances_by_type = _group_by_type(requested_instances)
817
+ allocated_instances_by_type = _group_by_type(allocated_instances)
818
+
819
+ total_num_requested_to_launch = len(requested_instances)
820
+ all_to_launch: Dict[NodeType : List[IMInstance]] = defaultdict(list)
821
+
822
+ for (
823
+ instance_type,
824
+ queued_instances_for_type,
825
+ ) in queued_instances_by_type.items():
826
+ requested_instances_for_type = requested_instances_by_type.get(
827
+ instance_type, []
828
+ )
829
+ allocated_instances_for_type = allocated_instances_by_type.get(
830
+ instance_type, []
831
+ )
832
+
833
+ num_desired_to_upscale = max(
834
+ 1,
835
+ math.ceil(
836
+ upscaling_speed
837
+ * (
838
+ len(requested_instances_for_type)
839
+ + len(allocated_instances_for_type)
840
+ )
841
+ ),
842
+ )
843
+
844
+ # Enforce global limit, at most we can launch `max_concurrent_launches`
845
+ num_to_launch = min(
846
+ max_concurrent_launches - total_num_requested_to_launch,
847
+ num_desired_to_upscale,
848
+ )
849
+
850
+ # Cap both ends 0 <= num_to_launch <= num_queued
851
+ num_to_launch = max(0, num_to_launch)
852
+ num_to_launch = min(len(queued_instances_for_type), num_to_launch)
853
+
854
+ to_launch = sorted(queued_instances_for_type, key=_sort_by_earliest_queued)[
855
+ :num_to_launch
856
+ ]
857
+
858
+ all_to_launch[instance_type].extend(to_launch)
859
+ total_num_requested_to_launch += num_to_launch
860
+
861
+ return all_to_launch
862
+
863
    @staticmethod
    def _handle_stuck_instances(
        instance_manager: InstanceManager,
        reconcile_config: InstanceReconcileConfig,
        _logger: logging.Logger,
    ):
        """
        Handle stuck instances with timeouts.

        Instances could be stuck in the following status and needs to be updated:
        - REQUESTED: cloud provider is slow/fails to launch instances.
        - ALLOCATED: ray fails to be started on the instance.
        - RAY_INSTALLING: ray fails to be installed on the instance.
        - TERMINATING: cloud provider is slow/fails to terminate instances.

        Instances could be in the following status which could be unbounded or
        transient, and we don't have a timeout mechanism to handle them. We would
        warn if they are stuck for too long:
        - RAY_STOPPING: ray taking time to drain.
        - QUEUED: cloud provider is slow to launch instances, resulting in long
          queue.

        Reconciler should handle below statuses, if not, could be slow
        reconcilation loop or a bug:
        - RAY_INSTALL_FAILED
        - RAY_STOPPED
        - TERMINATION_FAILED

        Args:
            instance_manager: The instance manager to reconcile.
            reconcile_config: The instance reconcile config.
            _logger: The logger to log the warning messages. It's used for testing.
        """
        instances, version = Reconciler._get_im_instances(instance_manager)

        # Bucket instances so each status is handled in one pass below.
        instances_by_status = defaultdict(list)
        for instance in instances:
            instances_by_status[instance.status].append(instance)

        im_updates = {}

        # Fail or retry the cloud instance allocation if it's stuck
        # in the REQUESTED state.
        for instance in instances_by_status[IMInstance.REQUESTED]:
            update = Reconciler._handle_stuck_requested_instance(
                instance,
                reconcile_config.request_status_timeout_s,
                reconcile_config.max_num_retry_request_to_allocate,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Leaked ALLOCATED instances should be terminated.
        # This usually happens when ray fails to be started on the instance, so
        # it's unable to be RAY_RUNNING after a long time.
        for instance in instances_by_status[IMInstance.ALLOCATED]:
            assert (
                instance.cloud_instance_id
            ), "cloud instance id should be set on ALLOCATED instance"
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.allocate_status_timeout_s,
                new_status=IMInstance.TERMINATING,
                cloud_instance_id=instance.cloud_instance_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # Fail the installation if it's stuck in RAY_INSTALLING for too long.
        # If RAY_INSTALLING is stuck for too long, it's likely that the instance
        # is not able to install ray, so we should also fail the installation.
        for instance in instances_by_status[IMInstance.RAY_INSTALLING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_install_status_timeout_s,
                new_status=IMInstance.RAY_INSTALL_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to terminate the instance, but it doesn't terminate (disappear
        # from the cloud provider) after a long time, we fail the termination.
        # This will trigger another attempt to terminate the instance.
        for instance in instances_by_status[IMInstance.TERMINATING]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.terminating_status_timeout_s,
                new_status=IMInstance.TERMINATION_FAILED,
            )
            if update:
                im_updates[instance.instance_id] = update

        # If we tried to stop ray on the instance, but it doesn't stop after a long
        # time, we will transition it back to RAY_RUNNING as the stop/drain somehow
        # failed. If it had succeed, we should have transitioned it to RAY_STOPPING
        # or RAY_STOPPED.
        for instance in instances_by_status[IMInstance.RAY_STOP_REQUESTED]:
            update = Reconciler._handle_stuck_instance(
                instance,
                reconcile_config.ray_stop_requested_status_timeout_s,
                new_status=IMInstance.RAY_RUNNING,
                ray_node_id=instance.node_id,
            )
            if update:
                im_updates[instance.instance_id] = update

        # These statues could be unbounded or transient, and we don't have a timeout
        # mechanism to handle them. We only warn if they are stuck for too long.
        for status in [
            # Ray taking time to drain. We could also have a timeout when Drain protocol
            # supports timeout.
            IMInstance.RAY_STOPPING,
            # These should just be transient, we will terminate instances with this
            # status in the next reconciler step.
            IMInstance.RAY_INSTALL_FAILED,
            IMInstance.RAY_STOPPED,
            IMInstance.TERMINATION_FAILED,
            # Instances could be in the QUEUED status for a long time if the cloud
            # provider is slow to launch instances.
            IMInstance.QUEUED,
        ]:
            Reconciler._warn_stuck_instances(
                instances_by_status[status],
                status=status,
                warn_interval_s=reconcile_config.transient_status_warn_interval_s,
                logger=_logger,
            )

        Reconciler._update_instance_manager(instance_manager, version, im_updates)
994
+
995
+ @staticmethod
996
+ def _warn_stuck_instances(
997
+ instances: List[IMInstance],
998
+ status: IMInstance.InstanceStatus,
999
+ warn_interval_s: int,
1000
+ logger: logging.Logger,
1001
+ ):
1002
+ """Warn if any instance is stuck in a transient/unbounded status for too
1003
+ long.
1004
+ """
1005
+ for instance in instances:
1006
+ status_times_ns = InstanceUtil.get_status_transition_times_ns(
1007
+ instance, select_instance_status=status
1008
+ )
1009
+ assert len(status_times_ns) >= 1
1010
+ status_time_ns = sorted(status_times_ns)[-1]
1011
+
1012
+ if time.time_ns() - status_time_ns > warn_interval_s * 1e9:
1013
+ logger.warning(
1014
+ "Instance {}({}) is stuck in {} for {} seconds.".format(
1015
+ instance.instance_id,
1016
+ IMInstance.InstanceStatus.Name(instance.status),
1017
+ IMInstance.InstanceStatus.Name(status),
1018
+ (time.time_ns() - status_time_ns) // 1e9,
1019
+ )
1020
+ )
1021
+
1022
+ @staticmethod
1023
+ def _is_head_node_running(instance_manager: InstanceManager) -> bool:
1024
+ """
1025
+ Check if the head node is running and ready.
1026
+
1027
+ If we scale up the cluster before head node is running,
1028
+ it would cause issues when launching the worker nodes.
1029
+
1030
+ There are corner cases when the GCS is up (so the ray cluster resource
1031
+ state is retrievable from the GCS), but the head node's raylet is not
1032
+ running so the head node is missing from the reported nodes. This happens
1033
+ when the head node is still starting up, or the raylet is not running
1034
+ due to some issues, and this would yield false.
1035
+
1036
+ Args:
1037
+ instance_manager: The instance manager to reconcile.
1038
+
1039
+ Returns:
1040
+ True if the head node is running and ready, False otherwise.
1041
+ """
1042
+
1043
+ im_instances, _ = Reconciler._get_im_instances(instance_manager)
1044
+
1045
+ for instance in im_instances:
1046
+ if instance.node_kind == NodeKind.HEAD:
1047
+ if instance.status == IMInstance.RAY_RUNNING:
1048
+ return True
1049
+ return False
1050
+
1051
    @staticmethod
    def _scale_cluster(
        autoscaling_state: AutoscalingState,
        instance_manager: InstanceManager,
        ray_state: ClusterResourceState,
        scheduler: IResourceScheduler,
        autoscaling_config: AutoscalingConfig,
    ) -> None:
        """
        Scale the cluster based on the resource state and the resource scheduler's
        decision:

        - It launches new instances if needed.
        - It terminates extra ray nodes if they should be shut down (preemption
          or idle termination)

        Args:
            autoscaling_state: The autoscaling state to reconcile (mutated in
                place with infeasible requests/constraints).
            instance_manager: The instance manager to reconcile.
            ray_state: The ray cluster's resource state.
            scheduler: The resource scheduler to make scaling decisions.
            autoscaling_config: The autoscaling config.
        """

        # Get the current instance states.
        im_instances, version = Reconciler._get_im_instances(instance_manager)

        autoscaler_instances = []
        # Index ray nodes by their hex node id so each IM instance can be
        # paired with its ray-side view (if any).
        ray_nodes_by_id = {
            binary_to_hex(node.node_id): node for node in ray_state.node_states
        }

        for im_instance in im_instances:
            ray_node = ray_nodes_by_id.get(im_instance.node_id)
            autoscaler_instances.append(
                AutoscalerInstance(
                    ray_node=ray_node,
                    im_instance=im_instance,
                    cloud_instance_id=(
                        im_instance.cloud_instance_id
                        if im_instance.cloud_instance_id
                        else None
                    ),
                )
            )

        # TODO(rickyx): We should probably name it as "Planner" or "Scaler"
        # or "ClusterScaler"
        sched_request = SchedulingRequest(
            node_type_configs=autoscaling_config.get_node_type_configs(),
            max_num_nodes=autoscaling_config.get_max_num_nodes(),
            resource_requests=ray_state.pending_resource_requests,
            gang_resource_requests=ray_state.pending_gang_resource_requests,
            cluster_resource_constraints=ray_state.cluster_resource_constraints,
            current_instances=autoscaler_instances,
            idle_timeout_s=autoscaling_config.get_idle_timeout_s(),
            disable_launch_config_check=(
                autoscaling_config.disable_launch_config_check()
            ),
        )

        # Ask scheduler for updates to the cluster shape.
        reply = scheduler.schedule(sched_request)

        # Populate the autoscaling state. This is done even when we return
        # early below, so infeasibility is always reported.
        autoscaling_state.infeasible_resource_requests.extend(
            reply.infeasible_resource_requests
        )
        autoscaling_state.infeasible_gang_resource_requests.extend(
            reply.infeasible_gang_resource_requests
        )
        autoscaling_state.infeasible_cluster_resource_constraints.extend(
            reply.infeasible_cluster_resource_constraints
        )

        if not Reconciler._is_head_node_running(instance_manager):
            # We shouldn't be scaling the cluster until the head node is ready.
            # This could happen when the head node (i.e. the raylet) is still
            # pending registration even though GCS is up.
            # We will wait until the head node is running and ready to avoid
            # scaling the cluster from min worker nodes constraint.
            return

        if autoscaling_config.provider == Provider.READ_ONLY:
            # We shouldn't be scaling the cluster if the provider is read-only.
            return

        # Scale the clusters if needed.
        to_launch = reply.to_launch
        to_terminate = reply.to_terminate
        updates = {}
        # Add terminating instances.
        for terminate_request in to_terminate:
            instance_id = terminate_request.instance_id
            updates[terminate_request.instance_id] = IMInstanceUpdateEvent(
                instance_id=instance_id,
                new_instance_status=IMInstance.RAY_STOP_REQUESTED,
                termination_request=terminate_request,
                details=f"draining ray: {terminate_request.details}",
            )

        # Add new instances.
        for launch_request in to_launch:
            for _ in range(launch_request.count):
                instance_id = InstanceUtil.random_instance_id()
                updates[instance_id] = IMInstanceUpdateEvent(
                    instance_id=instance_id,
                    new_instance_status=IMInstance.QUEUED,
                    instance_type=launch_request.instance_type,
                    upsert=True,
                    details=(
                        f"queuing new instance of {launch_request.instance_type} "
                        "from scheduler"
                    ),
                )

        Reconciler._update_instance_manager(instance_manager, version, updates)
1169
+
1170
+ @staticmethod
1171
+ def _terminate_instances(instance_manager: InstanceManager):
1172
+ """
1173
+ Terminate instances with the below statuses:
1174
+ - RAY_STOPPED: ray was stopped on the cloud instance.
1175
+ - RAY_INSTALL_FAILED: ray installation failed on the cloud instance,
1176
+ we will not retry.
1177
+ - TERMINATION_FAILED: cloud provider failed to terminate the instance
1178
+ or timeout for termination happened, we will retry again.
1179
+
1180
+ Args:
1181
+ instance_manager: The instance manager to reconcile.
1182
+ """
1183
+
1184
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
1185
+ updates = {}
1186
+ for instance in im_instances:
1187
+ if instance.status not in [
1188
+ IMInstance.RAY_STOPPED,
1189
+ IMInstance.RAY_INSTALL_FAILED,
1190
+ IMInstance.TERMINATION_FAILED,
1191
+ ]:
1192
+ continue
1193
+
1194
+ # Terminate the instance.
1195
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
1196
+ instance_id=instance.instance_id,
1197
+ new_instance_status=IMInstance.TERMINATING,
1198
+ cloud_instance_id=instance.cloud_instance_id,
1199
+ details="terminating instance from "
1200
+ f"{IMInstance.InstanceStatus.Name(instance.status)}",
1201
+ )
1202
+
1203
+ Reconciler._update_instance_manager(instance_manager, version, updates)
1204
+
1205
+ @staticmethod
1206
+ def _install_ray(
1207
+ instance_manager: InstanceManager,
1208
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
1209
+ ) -> None:
1210
+ """
1211
+ Install ray on the allocated instances when it's ready (cloud instance
1212
+ should be running)
1213
+
1214
+ This is needed if ray installation needs to be performed by
1215
+ the instance manager.
1216
+
1217
+ Args:
1218
+ instance_manager: The instance manager to reconcile.
1219
+ """
1220
+ im_instances, version = Reconciler._get_im_instances(instance_manager)
1221
+ updates = {}
1222
+ for instance in im_instances:
1223
+ if instance.status != IMInstance.ALLOCATED:
1224
+ continue
1225
+
1226
+ if instance.node_kind == NodeKind.HEAD:
1227
+ # Skip head node.
1228
+ continue
1229
+
1230
+ cloud_instance = non_terminated_cloud_instances.get(
1231
+ instance.cloud_instance_id
1232
+ )
1233
+
1234
+ assert cloud_instance, (
1235
+ f"Cloud instance {instance.cloud_instance_id} is not found "
1236
+ "in non_terminated_cloud_instances."
1237
+ )
1238
+
1239
+ if not cloud_instance.is_running:
1240
+ # It might still be pending (e.g. setting up ssh)
1241
+ continue
1242
+
1243
+ # Install ray on the running cloud instance
1244
+ updates[instance.instance_id] = IMInstanceUpdateEvent(
1245
+ instance_id=instance.instance_id,
1246
+ new_instance_status=IMInstance.RAY_INSTALLING,
1247
+ details="installing ray",
1248
+ )
1249
+
1250
+ Reconciler._update_instance_manager(instance_manager, version, updates)
1251
+
1252
+ @staticmethod
1253
+ def _fill_autoscaling_state(
1254
+ instance_manager: InstanceManager,
1255
+ autoscaling_state: AutoscalingState,
1256
+ ) -> None:
1257
+
1258
+ # Use the IM instance version for the autoscaler_state_version
1259
+ instances, version = Reconciler._get_im_instances(instance_manager)
1260
+ autoscaling_state.autoscaler_state_version = version
1261
+
1262
+ # Group instances by status
1263
+ instances_by_status = defaultdict(list)
1264
+ for instance in instances:
1265
+ instances_by_status[instance.status].append(instance)
1266
+
1267
+ # Pending instance requests
1268
+ instances_by_launch_request = defaultdict(list)
1269
+ queued_instances = []
1270
+ for instance in (
1271
+ instances_by_status[IMInstance.REQUESTED]
1272
+ + instances_by_status[IMInstance.QUEUED]
1273
+ ):
1274
+ if instance.launch_request_id:
1275
+ instances_by_launch_request[instance.launch_request_id].append(instance)
1276
+ else:
1277
+ queued_instances.append(instance)
1278
+
1279
+ for _, instances in instances_by_launch_request.items():
1280
+ num_instances_by_type = defaultdict(int)
1281
+ for instance in instances:
1282
+ num_instances_by_type[instance.instance_type] += 1
1283
+
1284
+ # All instances with same request id should have the same
1285
+ # request time.
1286
+ request_update = InstanceUtil.get_last_status_transition(
1287
+ instances[0], IMInstance.REQUESTED
1288
+ )
1289
+ request_time_ns = request_update.timestamp_ns if request_update else 0
1290
+
1291
+ for instance_type, count in num_instances_by_type.items():
1292
+ autoscaling_state.pending_instance_requests.append(
1293
+ PendingInstanceRequest(
1294
+ ray_node_type_name=instance_type,
1295
+ count=int(count),
1296
+ request_ts=int(request_time_ns // 1e9),
1297
+ )
1298
+ )
1299
+
1300
+ # Pending instances
1301
+ for instance in (
1302
+ instances_by_status[IMInstance.ALLOCATED]
1303
+ + instances_by_status[IMInstance.RAY_INSTALLING]
1304
+ ):
1305
+
1306
+ status_history = sorted(
1307
+ instance.status_history, key=lambda x: x.timestamp_ns, reverse=True
1308
+ )
1309
+ autoscaling_state.pending_instances.append(
1310
+ PendingInstance(
1311
+ instance_id=instance.instance_id,
1312
+ ray_node_type_name=instance.instance_type,
1313
+ details=status_history[0].details,
1314
+ )
1315
+ )
1316
+
1317
+ # Failed instance requests
1318
+ for instance in instances_by_status[IMInstance.ALLOCATION_FAILED]:
1319
+ request_status_update = InstanceUtil.get_last_status_transition(
1320
+ instance, IMInstance.REQUESTED
1321
+ )
1322
+ failed_status_update = InstanceUtil.get_last_status_transition(
1323
+ instance, IMInstance.ALLOCATION_FAILED
1324
+ )
1325
+ failed_time = (
1326
+ failed_status_update.timestamp_ns if failed_status_update else 0
1327
+ )
1328
+ request_time = (
1329
+ request_status_update.timestamp_ns if request_status_update else 0
1330
+ )
1331
+ autoscaling_state.failed_instance_requests.append(
1332
+ FailedInstanceRequest(
1333
+ ray_node_type_name=instance.instance_type,
1334
+ start_ts=int(request_time // 1e9),
1335
+ failed_ts=int(
1336
+ failed_time // 1e9,
1337
+ ),
1338
+ reason=failed_status_update.details,
1339
+ count=1,
1340
+ )
1341
+ )
1342
+
1343
+ @staticmethod
1344
+ def _handle_stuck_requested_instance(
1345
+ instance: IMInstance, timeout_s: int, max_num_retry_request_to_allocate: int
1346
+ ) -> Optional[IMInstanceUpdateEvent]:
1347
+ """
1348
+ Fail the cloud instance allocation if it's stuck in the REQUESTED state.
1349
+
1350
+ Args:
1351
+ instance: The instance to handle.
1352
+ timeout_s: The timeout in seconds.
1353
+ max_num_retry_request_to_allocate: The maximum number of times an instance
1354
+ could be requested to allocate.
1355
+
1356
+ Returns:
1357
+ Instance update to ALLOCATION_FAILED: if the instance allocation failed
1358
+ with errors.
1359
+ None: if there's no update.
1360
+
1361
+ """
1362
+ if not InstanceUtil.has_timeout(instance, timeout_s):
1363
+ # Not timeout yet, be patient.
1364
+ return None
1365
+
1366
+ all_request_times_ns = sorted(
1367
+ InstanceUtil.get_status_transition_times_ns(
1368
+ instance, select_instance_status=IMInstance.REQUESTED
1369
+ )
1370
+ )
1371
+
1372
+ # Fail the allocation if we have tried too many times.
1373
+ if len(all_request_times_ns) > max_num_retry_request_to_allocate:
1374
+ return IMInstanceUpdateEvent(
1375
+ instance_id=instance.instance_id,
1376
+ new_instance_status=IMInstance.ALLOCATION_FAILED,
1377
+ details=(
1378
+ "failed to allocate cloud instance after "
1379
+ f"{len(all_request_times_ns)} attempts > "
1380
+ f"max_num_retry_request_to_allocate={max_num_retry_request_to_allocate}" # noqa
1381
+ ),
1382
+ )
1383
+
1384
+ # Retry the allocation if we could by transitioning to QUEUED again.
1385
+ return IMInstanceUpdateEvent(
1386
+ instance_id=instance.instance_id,
1387
+ new_instance_status=IMInstance.QUEUED,
1388
+ details=f"queue again to launch after timeout={timeout_s}s",
1389
+ )
1390
+
1391
+ @staticmethod
1392
+ def _handle_stuck_instance(
1393
+ instance: IMInstance,
1394
+ timeout_s: int,
1395
+ new_status: IMInstance.InstanceStatus,
1396
+ **update_kwargs: Dict,
1397
+ ) -> Optional[IMInstanceUpdateEvent]:
1398
+ """
1399
+ Fail the instance if it's stuck in the status for too long.
1400
+
1401
+ Args:
1402
+ instance: The instance to handle.
1403
+ timeout_s: The timeout in seconds.
1404
+ new_status: The new status to transition to.
1405
+ update_kwargs: The update kwargs for InstanceUpdateEvent
1406
+
1407
+ Returns:
1408
+ Instance update to the new status: if the instance is stuck in the status
1409
+ for too long.
1410
+ None: if there's no update.
1411
+
1412
+ """
1413
+ if not InstanceUtil.has_timeout(instance, timeout_s):
1414
+ # Not timeout yet, be patient.
1415
+ return None
1416
+
1417
+ return IMInstanceUpdateEvent(
1418
+ instance_id=instance.instance_id,
1419
+ new_instance_status=new_status,
1420
+ details=f"timeout={timeout_s}s at status "
1421
+ f"{IMInstance.InstanceStatus.Name(instance.status)}",
1422
+ **update_kwargs,
1423
+ )
1424
+
1425
+ @staticmethod
1426
+ def _handle_extra_cloud_instances(
1427
+ instance_manager: InstanceManager,
1428
+ non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
1429
+ ray_nodes: List[NodeState],
1430
+ ):
1431
+ """
1432
+ For extra cloud instances (i.e. cloud instances that are non terminated as
1433
+ returned by cloud provider, but not managed by the instance manager), we
1434
+ will create new IM instances with ALLOCATED status.
1435
+
1436
+ Such instances could either be:
1437
+ 1. Leaked instances that are incorrectly started by the cloud instance
1438
+ provider, and they would be terminated eventually if they fail to
1439
+ transition to RAY_RUNNING by stuck instances reconciliation, or they
1440
+ would join the ray cluster and be terminated when the cluster scales down.
1441
+ 2. Instances that are started by the cloud instance provider intentionally
1442
+ but not yet discovered by the instance manager. This could happen for
1443
+ a. Head node that's started before the autoscaler.
1444
+ b. Worker nodes that's started by the cloud provider upon users'
1445
+ actions: i.e. KubeRay scaling up the cluster with ray cluster config
1446
+ change.
1447
+ 3. Ray nodes with cloud instance id not in the cloud provider. This could
1448
+ happen if there's delay in the Ray's state (i.e. cloud instance already
1449
+ terminated, but the ray node is still not dead yet).
1450
+
1451
+ Args:
1452
+ instance_manager: The instance manager to reconcile.
1453
+ non_terminated_cloud_instances: The non-terminated cloud instances from
1454
+ the cloud provider.
1455
+ ray_nodes: The ray cluster's states of ray nodes.
1456
+ """
1457
+ Reconciler._handle_extra_cloud_instances_from_cloud_provider(
1458
+ instance_manager, non_terminated_cloud_instances
1459
+ )
1460
+ Reconciler._handle_extra_cloud_instances_from_ray_nodes(
1461
+ instance_manager, ray_nodes
1462
+ )
1463
+
1464
    @staticmethod
    def _handle_extra_cloud_instances_from_cloud_provider(
        instance_manager: InstanceManager,
        non_terminated_cloud_instances: Dict[CloudInstanceId, CloudInstance],
    ):
        """
        For extra cloud instances that are not managed by the instance manager but
        are running in the cloud provider, we will create new IM instances with
        ALLOCATED status.

        Args:
            instance_manager: The instance manager to reconcile.
            non_terminated_cloud_instances: The non-terminated cloud instances from
                the cloud provider.
        """
        updates = {}

        instances, version = Reconciler._get_im_instances(instance_manager)
        # Cloud instance ids the IM already knows about.
        cloud_instance_ids_managed_by_im = {
            instance.cloud_instance_id
            for instance in instances
            if instance.cloud_instance_id
        }

        # Find the extra cloud instances that are not managed by the instance manager.
        for cloud_instance_id, cloud_instance in non_terminated_cloud_instances.items():
            if cloud_instance_id in cloud_instance_ids_managed_by_im:
                continue
            # Adopt the instance: upsert a brand-new IM instance in ALLOCATED
            # status, keyed by a freshly generated instance id.
            updates[cloud_instance_id] = IMInstanceUpdateEvent(
                instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
                cloud_instance_id=cloud_instance_id,
                new_instance_status=IMInstance.ALLOCATED,
                node_kind=cloud_instance.node_kind,
                instance_type=cloud_instance.node_type,
                details=(
                    "allocated unmanaged cloud instance :"
                    f"{cloud_instance.cloud_instance_id} "
                    f"({NodeKind.Name(cloud_instance.node_kind)}) from cloud provider"
                ),
                upsert=True,
            )
        Reconciler._update_instance_manager(instance_manager, version, updates)
1506
+
1507
    @staticmethod
    def _handle_extra_cloud_instances_from_ray_nodes(
        instance_manager: InstanceManager, ray_nodes: List[NodeState]
    ):
        """
        For extra cloud instances reported by Ray but not managed by the instance
        manager, we will create new IM instances with ALLOCATED status.

        Args:
            instance_manager: The instance manager to reconcile.
            ray_nodes: The ray cluster's states of ray nodes.
        """
        updates = {}

        instances, version = Reconciler._get_im_instances(instance_manager)
        # Cloud instance ids the IM already knows about.
        cloud_instance_ids_managed_by_im = {
            instance.cloud_instance_id
            for instance in instances
            if instance.cloud_instance_id
        }

        for ray_node in ray_nodes:
            # Ray nodes without a cloud instance id are handled (or warned
            # about) elsewhere; nothing to adopt here.
            if not ray_node.instance_id:
                continue

            cloud_instance_id = ray_node.instance_id
            if cloud_instance_id in cloud_instance_ids_managed_by_im:
                continue

            # Adopt the instance: upsert a brand-new IM instance in ALLOCATED
            # status, keyed by a freshly generated instance id.
            is_head = is_head_node(ray_node)
            updates[cloud_instance_id] = IMInstanceUpdateEvent(
                instance_id=InstanceUtil.random_instance_id(),  # Assign a new id.
                cloud_instance_id=cloud_instance_id,
                new_instance_status=IMInstance.ALLOCATED,
                node_kind=NodeKind.HEAD if is_head else NodeKind.WORKER,
                instance_type=ray_node.ray_node_type_name,
                details=(
                    "allocated unmanaged worker cloud instance from ray node: "
                    f"{binary_to_hex(ray_node.node_id)}"
                ),
                upsert=True,
            )

        Reconciler._update_instance_manager(instance_manager, version, updates)
1551
+
1552
+ @staticmethod
1553
+ def _report_metrics(
1554
+ instance_manager: InstanceManager,
1555
+ autoscaling_config: AutoscalingConfig,
1556
+ metrics_reporter: Optional[AutoscalerMetricsReporter] = None,
1557
+ ):
1558
+ if not metrics_reporter:
1559
+ return
1560
+
1561
+ instances, _ = Reconciler._get_im_instances(instance_manager)
1562
+ node_type_configs = autoscaling_config.get_node_type_configs()
1563
+
1564
+ metrics_reporter.report_instances(instances, node_type_configs)
1565
+ metrics_reporter.report_resources(instances, node_type_configs)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/instance_manager/storage.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from abc import ABCMeta, abstractmethod
3
+ from collections import defaultdict, namedtuple
4
+ from threading import Lock
5
+ from typing import Dict, List, Optional, Tuple
6
+
7
# Result of a store mutation: `success` is whether the mutation was applied;
# `version` is the new storage version on success, or the current (unchanged)
# version on failure.
StoreStatus = namedtuple("StoreStatus", ["success", "version"])
# A stored value paired with the storage version at which it was last written.
VersionedValue = namedtuple("VersionedValue", ["value", "version"])
9
+
10
+
11
class Storage(metaclass=ABCMeta):
    """Interface for a storage backend that stores the state of nodes in the cluster.

    The storage is thread-safe.

    The storage is versioned, which means that each successful state change to the
    storage will bump the version number. The version number can be used to
    implement optimistic concurrency control.

    Each entry in the storage table is also versioned. The version number of an entry
    is the last version number when the entry is updated.
    """

    @abstractmethod
    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Batch update the storage table. This method is atomic.

        Args:
            table: The name of the table.
            mutation: A dictionary of key-value pairs to be updated.
            deletion: A list of keys to be deleted.
            expected_storage_version: The expected storage version. The
                update will fail if the version does not match the
                current storage version.

        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version).
                Otherwise, returns (False, current_version).
        """
        raise NotImplementedError("batch_update() has to be implemented")

    @abstractmethod
    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry in the storage table.

        Args:
            table: The name of the table.
            key: The key of the entry.
            value: The value of the entry.
            expected_entry_version: The expected version of the entry.
                The update will fail if the version does not match the current
                version of the entry.
            insert_only: If True, the update will
                fail if the entry already exists.

        Returns:
            StoreStatus: A tuple of (success, version). If the update is
                successful, returns (True, new_version). Otherwise,
                returns (False, current_version).
        """
        raise NotImplementedError("update() has to be implemented")

    @abstractmethod
    def get_all(self, table: str) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get all entries of a table.

        Args:
            table: The name of the table.

        Returns:
            A tuple of (entries, storage_version) where entries maps each key
            to its (value, entry_version) pair and storage_version is the
            current storage version.
        """
        raise NotImplementedError("get_all() has to be implemented")

    @abstractmethod
    def get(
        self, table: str, keys: List[str]
    ) -> Tuple[Dict[str, Tuple[str, int]], int]:
        """Get a list of entries from the storage table.

        Args:
            table: The name of the table.
            keys: A list of keys to be retrieved. If the list is empty,
                all entries in the table will be returned.

        Returns:
            Tuple[Dict[str, VersionedValue], int]: A tuple of
                (entries, storage_version). The entries is a dictionary of
                (key, (value, entry_version)) pairs. The entry_version is the
                version of the entry when it was last updated. The
                storage_version is the current storage version.
        """
        raise NotImplementedError("get() has to be implemented")

    @abstractmethod
    def get_version(self) -> int:
        """Get the current storage version.

        Returns:
            int: The current storage version.
        """
        raise NotImplementedError("get_version() has to be implemented")
108
+
109
+
110
class InMemoryStorage(Storage):
    """An in-memory implementation of the Storage interface.

    This implementation is not durable: all state lives in process memory.
    Every operation is guarded by a single lock, making the class thread-safe.
    """

    def __init__(self):
        # Monotonically increasing storage version; bumped on every successful
        # mutation.
        self._version = 0
        # table name -> {key: VersionedValue(value, entry_version)}
        self._tables = defaultdict(dict)
        self._lock = Lock()

    def batch_update(
        self,
        table: str,
        mutation: Optional[Dict[str, str]] = None,
        deletion: Optional[List[str]] = None,
        expected_storage_version: Optional[int] = None,
    ) -> StoreStatus:
        """Atomically upsert `mutation` and remove `deletion` from a table.

        NOTE: the version-check parameter is named ``expected_storage_version``
        to match the ``Storage.batch_update`` interface (it was previously
        ``expected_version``, which broke keyword callers coded against the
        interface).

        Args:
            table: The name of the table.
            mutation: Key-value pairs to upsert.
            deletion: Keys to delete; missing keys are ignored.
            expected_storage_version: If set, the update fails unless it
                matches the current storage version.

        Returns:
            StoreStatus(True, new_version) on success, otherwise
            StoreStatus(False, current_version).
        """
        mutation = mutation or {}
        deletion = deletion or []
        with self._lock:
            if (
                expected_storage_version is not None
                and expected_storage_version != self._version
            ):
                return StoreStatus(False, self._version)
            self._version += 1
            self._tables[table].update(
                {
                    key: VersionedValue(value, self._version)
                    for key, value in mutation.items()
                }
            )
            for deleted_key in deletion:
                self._tables[table].pop(deleted_key, None)
            return StoreStatus(True, self._version)

    def update(
        self,
        table: str,
        key: str,
        value: str,
        expected_entry_version: Optional[int] = None,
        expected_storage_version: Optional[int] = None,
        insert_only: bool = False,
    ) -> StoreStatus:
        """Update a single entry in a table.

        Args:
            table: The name of the table.
            key: The key of the entry.
            value: The value of the entry.
            expected_entry_version: If set, the update fails unless it matches
                the entry's current version (missing entries have version -1).
            expected_storage_version: If set, the update fails unless it
                matches the current storage version.
            insert_only: If True, the update fails if the entry already exists.

        Returns:
            StoreStatus(True, new_version) on success, otherwise
            StoreStatus(False, current_version).
        """
        with self._lock:
            if (
                expected_storage_version is not None
                and expected_storage_version != self._version
            ):
                return StoreStatus(False, self._version)
            if insert_only and key in self._tables[table]:
                return StoreStatus(False, self._version)
            # Missing entries are treated as version -1.
            _, entry_version = self._tables[table].get(key, (None, -1))
            if (
                expected_entry_version is not None
                and expected_entry_version != entry_version
            ):
                return StoreStatus(False, self._version)
            self._version += 1
            self._tables[table][key] = VersionedValue(value, self._version)
            return StoreStatus(True, self._version)

    def get_all(self, table: str) -> Tuple[Dict[str, VersionedValue], int]:
        """Return a deep copy of all entries plus the current storage version."""
        with self._lock:
            return (copy.deepcopy(self._tables[table]), self._version)

    def get(self, table: str, keys: List[str]) -> Tuple[Dict[str, VersionedValue], int]:
        """Return the requested entries plus the current storage version.

        An empty `keys` list returns the whole table. Missing keys are simply
        absent from the result.
        """
        if not keys:
            return self.get_all(table)
        with self._lock:
            entries = self._tables.get(table, {})
            # Deep-copy for consistency with get_all() so callers never hold
            # references into the internal table.
            result = {
                key: copy.deepcopy(entries[key]) for key in keys if key in entries
            }
            # Fixed: previously returned StoreStatus(result, version), misusing
            # the (success, version) namedtuple. Return a plain
            # (entries, version) tuple matching the declared interface and
            # get_all().
            return (result, self._version)

    def get_version(self) -> int:
        """Return the current storage version (single int read; no lock needed)."""
        return self._version
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/metrics_reporter.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from typing import Dict, List
3
+
4
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
5
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
6
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
7
+ from ray.autoscaler.v2.schema import NodeType
8
+ from ray.core.generated.instance_manager_pb2 import Instance as IMInstance
9
+
10
+
11
class AutoscalerMetricsReporter:
    """Publishes autoscaler node and resource metrics to Prometheus.

    Aggregates the instance manager's view of the cluster into per-node-type
    node-count metrics and per-resource totals on the wrapped
    AutoscalerPrometheusMetrics object.
    """

    def __init__(self, prom_metrics: AutoscalerPrometheusMetrics) -> None:
        self._prom_metrics = prom_metrics

    @staticmethod
    def _empty_status_count() -> Dict[str, int]:
        """A zeroed status-count bucket for one node type."""
        return {"pending": 0, "running": 0, "terminating": 0, "terminated": 0}

    def report_instances(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_nodes: Nodes that are launching/pending ray start
            - active_nodes: Active nodes (nodes running ray)
            - recently_failed_nodes: Nodes that are being terminated.
            - stopped_nodes: Nodes that are terminated.
        """
        # Map of instance type to a dict of status to count.
        # Fixed: the annotation previously read `Dict[NodeType : Dict[str, int]]`
        # which is a slice expression, not a two-parameter generic.
        status_count_by_type: Dict[NodeType, Dict[str, int]] = {
            instance_type: self._empty_status_count()
            for instance_type in node_type_configs
        }

        for instance in instances:
            # Robustness: an instance whose type is absent from the current
            # config (e.g. after a config change) is still counted instead of
            # raising KeyError.
            status_count = status_count_by_type.setdefault(
                instance.instance_type, self._empty_status_count()
            )
            if InstanceUtil.is_ray_pending(instance.status):
                status_count["pending"] += 1
            elif InstanceUtil.is_ray_running(instance.status):
                status_count["running"] += 1
            elif instance.status == IMInstance.TERMINATING:
                status_count["terminating"] += 1
            elif instance.status == IMInstance.TERMINATED:
                status_count["terminated"] += 1

        for instance_type, status_count in status_count_by_type.items():
            self._prom_metrics.pending_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["pending"])

            self._prom_metrics.active_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["running"])

            self._prom_metrics.recently_failed_nodes.labels(
                SessionName=self._prom_metrics.session_name, NodeType=instance_type
            ).set(status_count["terminating"])

            self._prom_metrics.stopped_nodes.inc(status_count["terminated"])

    def report_resources(
        self,
        instances: List[IMInstance],
        node_type_configs: Dict[NodeType, NodeTypeConfig],
    ):
        """
        Record autoscaler metrics for:
            - pending_resources: Pending resources
            - cluster_resources: Cluster resources (resources running on the cluster)
        """
        pending_resources = defaultdict(float)
        cluster_resources = defaultdict(float)

        def _add_resources(resource_map, node_type, count):
            # Robustness: skip node types unknown to the current config rather
            # than raising KeyError.
            config = node_type_configs.get(node_type)
            if config is None:
                return
            for resource_name, resource_value in config.resources.items():
                resource_map[resource_name] += resource_value * count

        for instance in instances:
            if InstanceUtil.is_ray_pending(instance.status):
                _add_resources(pending_resources, instance.instance_type, 1)
            elif InstanceUtil.is_ray_running(instance.status):
                _add_resources(cluster_resources, instance.instance_type, 1)

        for resource_name, resource_value in pending_resources.items():
            self._prom_metrics.pending_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)

        for resource_name, resource_value in cluster_resources.items():
            self._prom_metrics.cluster_resources.labels(
                SessionName=self._prom_metrics.session_name, resource=resource_name
            ).set(resource_value)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/monitor.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Autoscaler monitoring loop daemon.
2
+
3
+ See autoscaler._private/monitor.py for the legacy implementation. All the legacy flags
4
+ are supported here, but the new implementation uses the new autoscaler v2.
5
+ """
6
+
7
+ import argparse
8
+ import logging
9
+ import os
10
+ import sys
11
+ import time
12
+ from typing import Optional
13
+
14
+ import ray
15
+ import ray._private.ray_constants as ray_constants
16
+ import ray._private.utils
17
+ from ray._private.event.event_logger import get_event_logger
18
+ from ray._private.ray_logging import setup_component_logger
19
+ from ray._private.usage.usage_lib import record_extra_usage_tag
20
+ from ray._private.worker import SCRIPT_MODE
21
+ from ray._raylet import GcsClient
22
+ from ray.autoscaler._private.constants import (
23
+ AUTOSCALER_METRIC_PORT,
24
+ AUTOSCALER_UPDATE_INTERVAL_S,
25
+ )
26
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
27
+ from ray.autoscaler.v2.autoscaler import Autoscaler
28
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
29
+ from ray.autoscaler.v2.instance_manager.config import (
30
+ FileConfigReader,
31
+ IConfigReader,
32
+ ReadOnlyProviderConfigReader,
33
+ )
34
+ from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter
35
+ from ray.core.generated.autoscaler_pb2 import AutoscalingState
36
+ from ray.core.generated.event_pb2 import Event as RayEvent
37
+ from ray.core.generated.usage_pb2 import TagKey
38
+
39
+ try:
40
+ import prometheus_client
41
+ except ImportError:
42
+ prometheus_client = None
43
+
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
class AutoscalerMonitor:
    """Autoscaling monitor.

    This process periodically collects stats from the GCS and triggers
    autoscaler updates.

    TODO:
    We should also handle autoscaler failures properly in the future.
    Right now, we don't restart autoscaler if it fails (internal reconciliation
    however, should not fail the autoscaler process).
    With the Reconciler able to handle extra cloud instances, we could in fact
    recover the autoscaler process from reconciliation.
    """

    def __init__(
        self,
        address: str,
        config_reader: IConfigReader,
        log_dir: Optional[str] = None,
        monitor_ip: Optional[str] = None,
    ):
        """Connect to the GCS, set up metrics/event logging, build the Autoscaler.

        Args:
            address: The GCS address (ip:port).
            config_reader: Source of the autoscaling config.
            log_dir: Directory for structured event logs; if unset (or the
                event logger fails to initialize), no events are emitted.
            monitor_ip: IP of the machine hosting this process; when set, the
                metrics address is written to the GCS internal KV and the
                Prometheus HTTP server is started.
        """
        # Record v2 usage (we do this as early as possible to capture usage)
        record_autoscaler_v2_usage(GcsClient(address))

        self.gcs_address = address
        worker = ray._private.worker.global_worker
        # TODO: eventually plumb ClusterID through to here
        self.gcs_client = GcsClient(address=self.gcs_address)

        if monitor_ip:
            # Publish the metrics endpoint under a well-known internal KV key
            # (presumably consumed by metrics discovery — confirm with readers
            # of "AutoscalerMetricsAddress").
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            self.gcs_client.internal_kv_put(
                b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
            )
        self._session_name = self._get_session_name(self.gcs_client)
        logger.info(f"session_name: {self._session_name}")
        worker.set_mode(SCRIPT_MODE)
        head_node_ip = self.gcs_address.split(":")[0]

        self.autoscaler = None
        if log_dir:
            try:
                ray_event_logger = get_event_logger(
                    RayEvent.SourceType.AUTOSCALER, log_dir
                )
                self.event_logger = AutoscalerEventLogger(ray_event_logger)
            except Exception:
                # Best-effort: run without an event logger if setup fails.
                self.event_logger = None
        else:
            self.event_logger = None

        prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
        self.metric_reporter = AutoscalerMetricsReporter(prom_metrics)

        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT
                    )
                )
                # Bind to loopback only when the head node itself is loopback.
                kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    registry=prom_metrics.registry,
                    **kwargs,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server."
                )
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        self.autoscaler = Autoscaler(
            session_name=self._session_name,
            config_reader=config_reader,
            gcs_client=self.gcs_client,
            event_logger=self.event_logger,
            metrics_reporter=self.metric_reporter,
        )

    @staticmethod
    def _get_session_name(gcs_client: GcsClient) -> Optional[str]:
        """Obtain the session name from the GCS.

        If the GCS doesn't respond, session name is considered None.
        In this case, the metrics reported from the monitor won't have
        the correct session name.
        """
        session_name = gcs_client.internal_kv_get(
            b"session_name",
            ray_constants.KV_NAMESPACE_SESSION,
            timeout=10,
        )

        if session_name:
            session_name = session_name.decode()

        return session_name

    @staticmethod
    def _report_autoscaling_state(
        gcs_client: GcsClient, autoscaling_state: AutoscalingState
    ):
        """Report the autoscaling state to the GCS.

        Best-effort: failures are logged and never propagate to the loop.
        """
        try:
            gcs_client.report_autoscaling_state(autoscaling_state.SerializeToString())
        except Exception:
            logger.exception("Error reporting autoscaling state to GCS.")

    def _run(self):
        """Run the monitor loop (never returns normally)."""

        while True:
            autoscaling_state = self.autoscaler.update_autoscaling_state()
            if autoscaling_state:
                # report autoscaling state
                self._report_autoscaling_state(self.gcs_client, autoscaling_state)
            else:
                logger.warning("No autoscaling state to report.")

            # Wait for a autoscaler update interval before processing the next
            # round of messages.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)

    def run(self):
        """Entry point: run the loop, logging any fatal error before re-raising."""
        try:
            self._run()
        except Exception:
            logger.exception("Error in monitor loop")
            raise
185
+
186
+
187
def record_autoscaler_v2_usage(gcs_client: GcsClient) -> None:
    """
    Record usage for autoscaler v2.

    Best-effort telemetry tag: any failure is logged and swallowed so that
    usage recording can never prevent monitor startup.
    """
    try:
        record_extra_usage_tag(TagKey.AUTOSCALER_VERSION, "v2", gcs_client)
    except Exception:
        logger.exception("Error recording usage for autoscaler v2.")
195
+
196
+
197
+ if __name__ == "__main__":
198
+ parser = argparse.ArgumentParser(
199
+ description=("Parse GCS server for the monitor to connect to.")
200
+ )
201
+ parser.add_argument(
202
+ "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
203
+ )
204
+ parser.add_argument(
205
+ "--autoscaling-config",
206
+ required=False,
207
+ type=str,
208
+ help="the path to the autoscaling config file",
209
+ )
210
+ parser.add_argument(
211
+ "--logging-level",
212
+ required=False,
213
+ type=str,
214
+ default=ray_constants.LOGGER_LEVEL,
215
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
216
+ help=ray_constants.LOGGER_LEVEL_HELP,
217
+ )
218
+ parser.add_argument(
219
+ "--logging-format",
220
+ required=False,
221
+ type=str,
222
+ default=ray_constants.LOGGER_FORMAT,
223
+ help=ray_constants.LOGGER_FORMAT_HELP,
224
+ )
225
+ parser.add_argument(
226
+ "--logging-filename",
227
+ required=False,
228
+ type=str,
229
+ default=ray_constants.MONITOR_LOG_FILE_NAME,
230
+ help="Specify the name of log file, "
231
+ "log to stdout if set empty, default is "
232
+ f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
233
+ )
234
+ parser.add_argument(
235
+ "--logs-dir",
236
+ required=True,
237
+ type=str,
238
+ help="Specify the path of the temporary directory used by Ray processes.",
239
+ )
240
+ parser.add_argument(
241
+ "--logging-rotate-bytes",
242
+ required=False,
243
+ type=int,
244
+ default=ray_constants.LOGGING_ROTATE_BYTES,
245
+ help="Specify the max bytes for rotating "
246
+ "log file, default is "
247
+ f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
248
+ )
249
+ parser.add_argument(
250
+ "--logging-rotate-backup-count",
251
+ required=False,
252
+ type=int,
253
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
254
+ help="Specify the backup count of rotated log file, default is "
255
+ f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
256
+ )
257
+ parser.add_argument(
258
+ "--monitor-ip",
259
+ required=False,
260
+ type=str,
261
+ default=None,
262
+ help="The IP address of the machine hosting the monitor process.",
263
+ )
264
+
265
+ args = parser.parse_args()
266
+ setup_component_logger(
267
+ logging_level=args.logging_level,
268
+ logging_format=args.logging_format,
269
+ log_dir=args.logs_dir,
270
+ filename=args.logging_filename,
271
+ max_bytes=args.logging_rotate_bytes,
272
+ backup_count=args.logging_rotate_backup_count,
273
+ )
274
+
275
+ logger.info(
276
+ f"Starting autoscaler v2 monitor using ray installation: {ray.__file__}"
277
+ )
278
+ logger.info(f"Ray version: {ray.__version__}")
279
+ logger.info(f"Ray commit: {ray.__commit__}")
280
+ logger.info(f"AutoscalerMonitor started with command: {sys.argv}")
281
+
282
+ gcs_address = args.gcs_address
283
+ if gcs_address is None:
284
+ raise ValueError("--gcs-address must be set!")
285
+
286
+ if not args.autoscaling_config:
287
+ logger.info("No autoscaling config provided: use read only node provider.")
288
+ config_reader = ReadOnlyProviderConfigReader(gcs_address)
289
+ else:
290
+ autoscaling_config = os.path.expanduser(args.autoscaling_config)
291
+ config_reader = FileConfigReader(
292
+ config_file=autoscaling_config, skip_content_hash=True
293
+ )
294
+
295
+ monitor = AutoscalerMonitor(
296
+ gcs_address,
297
+ config_reader,
298
+ log_dir=args.logs_dir,
299
+ monitor_ip=args.monitor_ip,
300
+ )
301
+
302
+ monitor.run()
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/scheduler.py ADDED
@@ -0,0 +1,1642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import time
4
+ import uuid
5
+ from abc import ABC, abstractmethod
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ from ray._private.protobuf_compat import message_to_dict
12
+ from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES
13
+ from ray.autoscaler._private.resource_demand_scheduler import (
14
+ UtilizationScore,
15
+ _fits,
16
+ _inplace_subtract,
17
+ )
18
+ from ray.autoscaler.v2.event_logger import AutoscalerEventLogger
19
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
20
+ from ray.autoscaler.v2.instance_manager.config import NodeTypeConfig
21
+ from ray.autoscaler.v2.schema import AutoscalerInstance, NodeType
22
+ from ray.autoscaler.v2.utils import ProtobufUtil, ResourceRequestUtil
23
+ from ray.core.generated.autoscaler_pb2 import (
24
+ ClusterResourceConstraint,
25
+ GangResourceRequest,
26
+ ResourceRequest,
27
+ ResourceRequestByCount,
28
+ )
29
+ from ray.core.generated.instance_manager_pb2 import (
30
+ Instance,
31
+ LaunchRequest,
32
+ NodeKind,
33
+ TerminationRequest,
34
+ )
35
+
36
+ # ============= Resource Scheduling Service API =======================
37
+ #
38
+ # ResourceSchedulerService is a service that schedules resource bundles
39
+ # to nodes. It's used by the autoscaler to schedule resource bundles
40
+ # to determine the desired cluster size to satisfy the current resource
41
+ # demands.
42
+ #
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
@dataclass
class SchedulingRequest:
    """Input to the resource scheduler.

    Bundles the resource demands (requests, gang requests, constraints),
    cluster limits, and the current instances from which the scheduler
    computes the target cluster shape.
    """

    # If outdated node check through launch config is disabled.
    disable_launch_config_check: bool
    # Available node type configs.
    node_type_configs: Dict[NodeType, NodeTypeConfig] = field(default_factory=dict)
    # Max number of worker nodes.
    max_num_nodes: Optional[int] = None
    # Idle timeout in seconds.
    idle_timeout_s: Optional[float] = None
    # TODO: This prob could be refactored into the ClusterStatus data class later.
    # The current ray resource requests.
    resource_requests: List[ResourceRequestByCount] = field(default_factory=list)
    # The Gang resource requests.
    gang_resource_requests: List[GangResourceRequest] = field(default_factory=list)
    # Cluster resource constraints (from ray.autoscaler.sdk.request_resources()).
    cluster_resource_constraints: List[ClusterResourceConstraint] = field(
        default_factory=list
    )
    # The current instances known to the autoscaler.
    current_instances: List[AutoscalerInstance] = field(default_factory=list)
67
+
68
+
69
@dataclass
class SchedulingReply:
    """Output of the resource scheduler.

    Describes how the cluster should change (launches/terminations) and which
    of the inputs could not be satisfied.
    """

    # Instances to launch.
    to_launch: List[LaunchRequest] = field(default_factory=list)
    # Instances to terminate.
    to_terminate: List[TerminationRequest] = field(default_factory=list)
    # The infeasible resource bundles.
    infeasible_resource_requests: List[ResourceRequest] = field(default_factory=list)
    # The infeasible gang resource bundles.
    infeasible_gang_resource_requests: List[GangResourceRequest] = field(
        default_factory=list
    )
    # The infeasible cluster resource constraints.
    infeasible_cluster_resource_constraints: List[ClusterResourceConstraint] = field(
        default_factory=list
    )
85
+
86
+
87
class IResourceScheduler(ABC):
    """
    Interface for a resource scheduler.

    Implements the `instance_manager.proto ResourceSchedulerService` interface.
    """

    @abstractmethod
    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """
        Given the resource requests and the current cluster state, calculate the
        target cluster shape by trying to schedule the resource requests on the
        nodes.

        Args:
            request: The demands, constraints, and current instances to
                schedule against.

        Returns:
            A SchedulingReply with the launches/terminations to perform and
            any infeasible requests.
        """
        pass
102
+
103
+
104
class SchedulingNodeStatus(Enum):
    """
    The status of a scheduling node (`SchedulingNode`)
    """

    # Newly added by the ResourceDemandScheduler; should be launched.
    TO_LAUNCH = "TO_LAUNCH"
    # Schedulable: the node is either already running ray or pending to run
    # ray (e.g. an autoscaler instance is being launched). Either way, it can
    # accept new resource requests/resource constraints.
    SCHEDULABLE = "SCHEDULABLE"
    # Marked for termination by the ResourceDemandScheduler.
    TO_TERMINATE = "TO_TERMINATE"
117
+
118
+
119
class ResourceRequestSource(Enum):
    """
    The source of the resource request.
    """

    # Demand coming from the running workload: ray tasks/actors, placement
    # groups, etc.
    PENDING_DEMAND = "PENDING_DEMAND"
    # Explicit cluster resource constraints, i.e. requests made through
    # ray.autoscaler.sdk.request_resources().
    CLUSTER_RESOURCE_CONSTRAINT = "CLUSTER_RESOURCE_CONSTRAINT"
130
+
131
+
132
+ @dataclass
133
+ class SchedulingNode:
134
+ """
135
+ A abstraction of a node that can be scheduled on by the resource scheduler.
136
+
137
+ A scheduling node is expected to be used as:
138
+
139
+ node = SchedulingNode.new(instance, node_configs)
140
+ remaining, score = node.try_schedule(requests)
141
+
142
+ .... do something with the score ....
143
+
144
+ NOTE:
145
+ One could also extend the scheduling behavior by overriding `try_schedule`
146
+ """
147
+
148
+ # Node type name.
149
+ node_type: NodeType
150
+ # Status
151
+ status: SchedulingNodeStatus
152
+ # Resource requests scheduled on this nodes for different sources.
153
+ sched_requests: Dict[ResourceRequestSource, List[ResourceRequest]] = field(
154
+ default_factory=lambda: defaultdict(list)
155
+ )
156
+ # Available resources for different sources of requests.
157
+ available_resources_for_sched: Dict[
158
+ ResourceRequestSource, Dict[str, float]
159
+ ] = field(default_factory=dict)
160
+ # The node's current resource capacity.
161
+ total_resources: Dict[str, float] = field(default_factory=dict)
162
+ # Node's labels, including static or dynamic labels.
163
+ labels: Dict[str, str] = field(default_factory=dict)
164
+ # Observability descriptive message for why the node was launched in the
165
+ # first place.
166
+ launch_reason: Optional[str] = None
167
+ # Termination request, none when the node is not being terminated.
168
+ termination_request: Optional[TerminationRequest] = None
169
+ # The instance id of the IM(Instance Manager) instance. None if the node
170
+ # is not yet in IM.
171
+ im_instance_id: Optional[str] = None
172
+ # The ray node id of the ray node. None if the node is not included in
173
+ # ray cluster's GCS report yet (not running ray yet).
174
+ ray_node_id: Optional[str] = None
175
+ # Idle duration in ms. Default not idle.
176
+ idle_duration_ms: int = 0
177
+ # Launch config hash.
178
+ launch_config_hash: Optional[str] = None
179
+ # node kind.
180
+ node_kind: NodeKind = NodeKind.WORKER
181
+
182
+ def __init__(
183
+ self,
184
+ node_type: NodeType,
185
+ total_resources: Dict[str, float],
186
+ available_resources: Dict[str, float],
187
+ labels: Dict[str, str],
188
+ status: SchedulingNodeStatus,
189
+ im_instance_id: str = "",
190
+ ray_node_id: str = "",
191
+ idle_duration_ms: int = 0,
192
+ launch_config_hash: str = "",
193
+ node_kind: NodeKind = NodeKind.WORKER,
194
+ termination_request: Optional[TerminationRequest] = None,
195
+ ):
196
+ self.node_type = node_type
197
+ self.total_resources = total_resources
198
+ self.available_resources_for_sched = {
199
+ ResourceRequestSource.PENDING_DEMAND: dict(available_resources),
200
+ ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: dict(total_resources),
201
+ }
202
+ self.sched_requests = {
203
+ ResourceRequestSource.PENDING_DEMAND: [],
204
+ ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT: [],
205
+ }
206
+ self.labels = labels
207
+ self.status = status
208
+ self.im_instance_id = im_instance_id
209
+ self.ray_node_id = ray_node_id
210
+ self.idle_duration_ms = idle_duration_ms
211
+ self.launch_config_hash = launch_config_hash
212
+ self.node_kind = node_kind
213
+ self.termination_request = termination_request
214
+
215
+ def get_available_resources(self, resource_request_source: ResourceRequestSource):
216
+ """Get the available resources for the given resource request source."""
217
+ return self.available_resources_for_sched[resource_request_source]
218
+
219
+ def get_sched_requests(self, resource_request_source: ResourceRequestSource):
220
+ """Get the resource requests for the given resource request source."""
221
+ return self.sched_requests[resource_request_source]
222
+
223
+ def add_sched_request(
224
+ self,
225
+ request: ResourceRequest,
226
+ resource_request_source: ResourceRequestSource,
227
+ ):
228
+ """
229
+ Add the resource requests to the node.
230
+
231
+ Args:
232
+ request: The resource request to be added.
233
+ resource_request_source: The source of the resource request.
234
+ """
235
+ self.sched_requests[resource_request_source].append(request)
236
+
237
    @staticmethod
    def new(
        instance: AutoscalerInstance,
        node_type_configs: Dict[NodeType, NodeTypeConfig],
        disable_launch_config_check: bool,
    ) -> Optional["SchedulingNode"]:
        """
        Create a new scheduling node from an autoscaler instance.

        It creates:
            - None if the instance is not schedulable by IM.
            - A schedulable node if the instance is running ray or pending to
              run ray, so it should be considered in the scheduling process.
            - A TO_TERMINATE node (with a termination request attached) if the
              instance's node type no longer exists in the configs.

        Args:
            instance: The instance.
            node_type_configs: The node type configs.
            disable_launch_config_check: If outdated node check through launch
                config is disabled.
        """
        if not SchedulingNode.is_schedulable(instance):
            return None

        if instance.im_instance.status == Instance.RAY_RUNNING:
            assert instance.ray_node is not None, (
                "ray node should not be None "
                f"when the instance is running ray: instance={instance}"
            )
            # A running ray node: build the scheduling view from the live
            # ray node state rather than the static node type config.
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources=dict(instance.ray_node.total_resources),
                # Available resources for scheduling requests of different
                # sources.
                available_resources=dict(instance.ray_node.available_resources),
                # Use ray node's dynamic labels.
                labels=dict(instance.ray_node.dynamic_labels),
                status=SchedulingNodeStatus.SCHEDULABLE,
                im_instance_id=instance.im_instance.instance_id,
                ray_node_id=instance.im_instance.node_id,
                idle_duration_ms=instance.ray_node.idle_duration_ms,
                launch_config_hash=instance.im_instance.launch_config_hash,
                node_kind=instance.im_instance.node_kind,
            )

        # This is an instance pending to run ray. Initialize a schedulable node
        # from the node type config.
        node_config = node_type_configs.get(instance.im_instance.instance_type, None)
        if node_config is None:
            if disable_launch_config_check:
                # We are not terminating outdated nodes.
                logger.info(
                    f"Node config for {instance.im_instance.instance_type} is missing, "
                    "but we are not terminating the outdated node because "
                    "`disable_launch_config_check` is True in "
                    "the autoscaler's provider config."
                )
                return None

            # Configs might have been updated, and no more
            # node_type_configs for this node type. We should terminate it.
            # NOTE(review): node_kind is hardcoded to WORKER here rather than
            # taken from the instance — presumably a head node can never be
            # outdated-terminated; confirm.
            return SchedulingNode(
                node_type=instance.im_instance.instance_type,
                total_resources={},
                available_resources={},
                labels={},
                status=SchedulingNodeStatus.TO_TERMINATE,
                im_instance_id=instance.im_instance.instance_id,
                termination_request=TerminationRequest(
                    id=str(uuid.uuid4()),
                    instance_id=instance.im_instance.instance_id,
                    cause=TerminationRequest.Cause.OUTDATED,
                    instance_type=instance.im_instance.instance_type,
                ),
                node_kind=NodeKind.WORKER,
            )

        return SchedulingNode.from_node_config(
            node_config,
            SchedulingNodeStatus.SCHEDULABLE,
            node_kind=instance.im_instance.node_kind,
            im_instance_id=instance.im_instance.instance_id,
        )
321
+
322
+ @staticmethod
323
+ def is_schedulable(instance: AutoscalerInstance) -> bool:
324
+ """
325
+ Check if the instance is schedulable by IM.
326
+
327
+ Args:
328
+ instance: The instance.
329
+
330
+ Returns:
331
+ True if the instance is schedulable by IM.
332
+ """
333
+ if instance.im_instance is None:
334
+ # We will skip any instances that are not yet in IM which
335
+ # could be
336
+ # 1. an out-of-band ray node
337
+ # 2. an cloud instance running ray not yet discovered
338
+ # by the IM's Reconciler
339
+ # 3. an cloud instance already terminated but ray state
340
+ # still lagging behind.
341
+ #
342
+ # In all of these cases, the instance is not schedulable or
343
+ # shouldn't be managed by IM, so we don't consider them.
344
+ return False
345
+
346
+ # These are the statuses where there's a running ray node or
347
+ # could eventually run ray.
348
+ if InstanceUtil.is_ray_running_reachable(instance.im_instance.status):
349
+ return True
350
+
351
+ return False
352
+
353
+ @staticmethod
354
+ def from_node_config(
355
+ node_config: NodeTypeConfig,
356
+ status: SchedulingNodeStatus,
357
+ node_kind: NodeKind,
358
+ im_instance_id: Optional[str] = None,
359
+ ) -> "SchedulingNode":
360
+ """
361
+ Create a scheduling node from a node config.
362
+
363
+ Args:
364
+ node_config: The node config.
365
+ status: The status of the node.
366
+ node_kind: The node kind.
367
+ im_instance_id: The instance id of the im instance.
368
+ node_kind: The node kind.
369
+ """
370
+ return SchedulingNode(
371
+ node_type=node_config.name,
372
+ total_resources=dict(node_config.resources),
373
+ available_resources=dict(node_config.resources),
374
+ labels=dict(node_config.labels),
375
+ status=status,
376
+ im_instance_id=im_instance_id,
377
+ node_kind=node_kind,
378
+ )
379
+
380
    def __post_init__(self):
        # A node without a node type is never valid.
        # NOTE(review): __post_init__ is only invoked by the dataclass-generated
        # __init__; this class defines a hand-written __init__, so this hook
        # likely never runs — confirm intent.
        assert self.node_type, "node_type should be set"
382
+
383
+ def try_schedule(
384
+ self,
385
+ requests: List[ResourceRequest],
386
+ resource_request_source: ResourceRequestSource,
387
+ ) -> Tuple[List[ResourceRequest], UtilizationScore]:
388
+ """
389
+ Try to schedule the resource requests on this node.
390
+
391
+ This modifies the node's available resources if the requests are schedulable.
392
+ The requests are scheduled one by one in the sorted order, and no
393
+ backtracking is done.
394
+
395
+ Args:
396
+ requests: The resource requests to be scheduled.
397
+ resource_request_source: The source of the resource request, i.e.
398
+ pending demands from ray actors/tasks or cluster resource constraints.
399
+
400
+ Returns:
401
+ A tuple of:
402
+ - list of remaining requests that cannot be scheduled on this node.
403
+ - the utilization score for this node with respect to the current
404
+ resource requests being scheduled.
405
+ """
406
+ # Track the resource requests that cannot be scheduled on this node.
407
+ unschedulable_requests = []
408
+
409
+ # Sort the requests and try schedule them one by one.
410
+ for r in requests:
411
+ if not self._try_schedule_one(r, resource_request_source):
412
+ unschedulable_requests.append(r)
413
+
414
+ score = self._compute_score(resource_request_source)
415
+
416
+ return unschedulable_requests, score
417
+
418
    def _compute_score(
        self, resource_request_source: ResourceRequestSource
    ) -> UtilizationScore:
        """
        Compute the utilization score for this node with respect to the current
        resource requests being scheduled.

        A "higher" score means that this node is more suitable for scheduling
        the current scheduled resource requests.

        The score is a tuple of 4 values:
            1. Whether this node is a GPU node and the current resource request
               has GPU requirements:
                   False: if this node is a GPU node and the requests placed
                   onto it have no GPU requirements (discourage wasting GPUs).
                   True: otherwise.
            2. The number of scheduled resource types that this node actually
               provides.
            3. The minimum of the per-resource weighted utilization values,
               where each value is capacity * utilization**3 (cubing favors
               nodes that are already highly utilized).
            4. The average of those weighted utilization values.

        NOTE:
            This function is adapted from _resource_based_utilization_scorer
            from autoscaler v1.

        TODO(rickyx,jjyao): We should also consider node labels for
        scoring. For example, if a node has a label that matches the affinity
        label of the resource request, we should give it a higher score.

        TODO(rickyx): add pluggable scoring functions here.

        Returns:
            A utilization score for this node.
        """

        sched_requests = self.get_sched_requests(resource_request_source)
        available_resources = self.get_available_resources(resource_request_source)

        # Count how many of the scheduled resource types this node provides.
        num_matching_resource_types = 0
        sched_resource_types = set()
        for req in sched_requests:
            for resource_name, v in req.resources_bundle.items():
                if v > 0:
                    sched_resource_types.add(resource_name)

        for sched_resource_type in sched_resource_types:
            if sched_resource_type in self.total_resources:
                num_matching_resource_types += 1

        # Compute the weighted utilization value for each resource type.
        util_by_resources = []
        for k, v in self.total_resources.items():
            if v == 0:
                # Skip any zero values.
                continue
            if k in available_resources:
                util = (v - available_resources.get(k, 0)) / v
                assert util >= 0 and util <= 1, f"Invalid utilization: {util}"
                # Weight by capacity and cube the utilization so that nearly
                # full nodes score much higher than lightly used ones.
                util_by_resources.append(v * (util**3))

        # Prefer not to launch a GPU node if there aren't any GPU requirements in the
        # resource bundle.
        gpu_ok = True
        if AUTOSCALER_CONSERVE_GPU_NODES:
            # TODO: we should also generalize this optimization for accelerators.
            # https://github.com/ray-project/ray/issues/43079
            is_gpu_node = self.total_resources.get("GPU", 0) > 0
            any_gpu_requests = any("GPU" in r.resources_bundle for r in sched_requests)
            if is_gpu_node and not any_gpu_requests:
                gpu_ok = False

        # Prioritize avoiding gpu nodes for non-gpu workloads first,
        # then prioritize matching multiple resource types,
        # then prioritize using all resources,
        # then prioritize overall balance of multiple resources.
        return (
            gpu_ok,
            num_matching_resource_types,
            min(util_by_resources) if util_by_resources else 0,
            float(sum(util_by_resources)) / len(util_by_resources)
            if util_by_resources
            else 0,
        )
502
+
503
+ def _try_schedule_one(
504
+ self, request: ResourceRequest, resource_request_source: ResourceRequestSource
505
+ ) -> bool:
506
+ """
507
+ Try to schedule one resource request on this node. The request could be from
508
+ various sources, specified by `resource_request_source`.
509
+
510
+ Args:
511
+ request: The resource request to be scheduled.
512
+ resource_request_source: The source of the resource request, i.e.
513
+ pending demands from ray actors/tasks or cluster resource constraints.
514
+
515
+ Returns:
516
+ True if the resource request is scheduled on this node.
517
+ """
518
+
519
+ # Check if there's placement constraints that are not satisfied.
520
+ for constraint in request.placement_constraints:
521
+ if constraint.HasField("anti_affinity"):
522
+ anti_affinity = constraint.anti_affinity
523
+ if (
524
+ anti_affinity.label_name in self.labels
525
+ and anti_affinity.label_value
526
+ == self.labels[anti_affinity.label_name]
527
+ ):
528
+ # The node already has a label that matches the anti-affinity
529
+ return False
530
+
531
+ # We don't need to check for affinity constraints here since
532
+ # we have already combined resource requests with the affinity
533
+ # constraints into the same request at `combine_requests_with_affinity`.
534
+ pass
535
+
536
+ available_resources_dict = self.get_available_resources(resource_request_source)
537
+
538
+ # Check if there's enough resources to schedule the request.
539
+ if not _fits(available_resources_dict, dict(request.resources_bundle)):
540
+ return False
541
+
542
+ # Schedule the request, update resources
543
+ _inplace_subtract(available_resources_dict, dict(request.resources_bundle))
544
+
545
+ # Add the request to the node.
546
+ self.add_sched_request(request, resource_request_source)
547
+
548
+ # Update the dynamic labels if there's any
549
+ for constraint in request.placement_constraints:
550
+ # We don't need to check for affinity constraints here since
551
+ # we have already combined resource requests with the affinity
552
+ # constraints into the same request at `combine_requests_with_affinity`.
553
+ # We don't need node labels for enforcing affinity.
554
+ if constraint.HasField("anti_affinity"):
555
+ anti_affinity = constraint.anti_affinity
556
+ self._add_label(anti_affinity.label_name, anti_affinity.label_value)
557
+
558
+ return True
559
+
560
+ def _add_label(self, label_name: str, label_value: str):
561
+ """
562
+ Add a label to the node.
563
+ This assumes a label key can only have one value.
564
+ """
565
+ assert (
566
+ self.labels.get(label_name) is None
567
+ or self.labels[label_name] == label_value
568
+ ), (
569
+ f"Label {label_name} already exists with value "
570
+ f"{self.labels[label_name]}, cannot set to "
571
+ f"{label_value}"
572
+ )
573
+ self.labels[label_name] = label_value
574
+
575
    def __repr__(self) -> str:
        # Verbose debug representation: dumps both per-source resource views
        # and both per-source scheduled request lists.
        return (
            "SchedulingNode(node_type={node_type}, "
            "node_kind={node_kind}, "
            "instance_id={instance_id},"
            "ray_node_id={ray_node_id},"
            "idle_duration_ms={idle_duration_ms},"
            "termination_request={termination_request},"
            "status={status}, "
            "total_resources={total_resources}, "
            "available_resources_for_demand={available_resources_for_demand}, "
            "available_resources_for_cluster_resource_constraints="
            "{available_resources_for_cluster_resource_constraints},"
            "labels={labels}, launch_reason={launch_reason}), "
            "sched_requests_for_demand={sched_requests_for_demand}), "
            "sched_requests_for_cluster_resource_constraints="
            "{sched_requests_for_cluster_resources_constraint})"
        ).format(
            node_type=self.node_type,
            node_kind=self.node_kind,
            instance_id=self.im_instance_id,
            ray_node_id=self.ray_node_id,
            idle_duration_ms=self.idle_duration_ms,
            # Protobuf messages are rendered as dicts for readability.
            termination_request=str(message_to_dict(self.termination_request))
            if self.termination_request
            else None,
            status=self.status,
            total_resources=self.total_resources,
            available_resources_for_demand=self.available_resources_for_sched[
                ResourceRequestSource.PENDING_DEMAND
            ],
            available_resources_for_cluster_resource_constraints=self.available_resources_for_sched[  # noqa
                ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
            ],
            labels=self.labels,
            launch_reason=self.launch_reason,
            sched_requests_for_demand="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[ResourceRequestSource.PENDING_DEMAND]
            ),
            sched_requests_for_cluster_resources_constraint="|".join(
                str(message_to_dict(r))
                for r in self.sched_requests[
                    ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT
                ]
            ),
        )
622
+
623
+
624
+ class ResourceDemandScheduler(IResourceScheduler):
625
+ """
626
+ A resource demand scheduler that schedules resource requests based on the
627
+ following rules:
628
+ 1. Enforce the minimal count of nodes for each worker node type.
629
+ 2. Enforce the cluster resource constraints.
630
+ 3. Schedule the gang resource requests.
631
+ 4. Schedule the tasks/actor resource requests
632
+ """
633
+
634
    def __init__(self, event_logger: Optional[AutoscalerEventLogger] = None):
        # Optional logger used by schedule() to emit cluster scheduling events
        # after each pass; when None, no events are emitted.
        self._event_logger = event_logger
636
+
637
+ @dataclass
638
+ class ScheduleContext:
639
+ """
640
+ Encapsulates the context for processing one scheduling request.
641
+
642
+ This exposes functions to read and write the scheduling nodes, to prevent
643
+ accidental modification of the internal state.
644
+ """
645
+
646
+ # The node type configs for this scheduling request.
647
+ _node_type_configs: Dict[NodeType, NodeTypeConfig]
648
+ # If outdated node check through launch config is disabled.
649
+ _disable_launch_config_check: bool
650
+ # The max number of nodes for the entire cluster.
651
+ _max_num_nodes: Optional[int] = None
652
+ # The idle timeout in seconds.
653
+ _idle_timeout_s: Optional[float] = None
654
+ # The current schedulable nodes (including pending nodes and pending requests).
655
+ _nodes: List[SchedulingNode] = field(default_factory=list)
656
+ # The number of nodes by node types available for launching based on the max
657
+ # number of workers in the config. This takes into account any pending/running
658
+ # nodes.
659
+ _node_type_available: Dict[NodeType, int] = field(default_factory=dict)
660
+
661
+ def __init__(
662
+ self,
663
+ nodes: List[SchedulingNode],
664
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
665
+ disable_launch_config_check: bool,
666
+ max_num_nodes: Optional[int] = None,
667
+ idle_timeout_s: Optional[float] = None,
668
+ ):
669
+ self._nodes = nodes
670
+ self._node_type_configs = node_type_configs
671
+ self._node_type_available = self._compute_available_node_types(
672
+ nodes, node_type_configs
673
+ )
674
+ self._max_num_nodes = max_num_nodes
675
+ self._idle_timeout_s = idle_timeout_s
676
+ self._disable_launch_config_check = disable_launch_config_check
677
+
678
+ @classmethod
679
+ def from_schedule_request(
680
+ cls, req: SchedulingRequest
681
+ ) -> "ResourceDemandScheduler.ScheduleContext":
682
+ """
683
+ Create a schedule context from a schedule request.
684
+ It will populate the context with the existing nodes and the available node
685
+ types from the config.
686
+
687
+ Args:
688
+ req: The scheduling request. The caller should make sure the
689
+ request is valid.
690
+ """
691
+
692
+ nodes = []
693
+ node_type_configs = req.node_type_configs
694
+
695
+ # Initialize the scheduling nodes.
696
+ for instance in req.current_instances:
697
+ node = SchedulingNode.new(
698
+ instance, node_type_configs, req.disable_launch_config_check
699
+ )
700
+ if node:
701
+ nodes.append(node)
702
+
703
+ return cls(
704
+ nodes=nodes,
705
+ node_type_configs=node_type_configs,
706
+ disable_launch_config_check=req.disable_launch_config_check,
707
+ max_num_nodes=req.max_num_nodes,
708
+ idle_timeout_s=req.idle_timeout_s,
709
+ )
710
+
711
+ @staticmethod
712
+ def _compute_available_node_types(
713
+ nodes: List[SchedulingNode],
714
+ node_type_configs: Dict[NodeType, NodeTypeConfig],
715
+ ) -> Dict[NodeType, int]:
716
+ """
717
+ Compute the number of nodes by node types available for launching based on
718
+ the max number of workers in the config.
719
+ Args:
720
+ nodes: The current existing nodes.
721
+ node_type_configs: The node type configs.
722
+ Returns:
723
+ A dict of node types and the number of nodes available for launching.
724
+ """
725
+ node_type_available: Dict[NodeType, int] = defaultdict(int)
726
+ node_type_existing: Dict[NodeType, int] = defaultdict(int)
727
+ for node in nodes:
728
+ node_type_existing[node.node_type] += 1
729
+
730
+ for (
731
+ node_type,
732
+ node_type_config,
733
+ ) in node_type_configs.items():
734
+ node_type_available[
735
+ node_type
736
+ ] = node_type_config.max_worker_nodes - node_type_existing.get(
737
+ node_type, 0
738
+ )
739
+
740
+ return node_type_available
741
+
742
+ def get_nodes(self) -> List[SchedulingNode]:
743
+ """
744
+ Get the current nodes with filter.
745
+
746
+ Returns:
747
+ A list of nodes.
748
+ """
749
+ nodes = copy.deepcopy(self._nodes)
750
+ return nodes
751
+
752
+ def get_node_type_available(self) -> Dict[NodeType, int]:
753
+ return copy.deepcopy(self._node_type_available)
754
+
755
+ def get_cluster_shape(self) -> Dict[NodeType, int]:
756
+ cluster_shape = defaultdict(int)
757
+ for node in self._nodes:
758
+ if node.status == SchedulingNodeStatus.TO_TERMINATE:
759
+ # Skip the nodes that are to be terminated.
760
+ continue
761
+
762
+ cluster_shape[node.node_type] += 1
763
+ return cluster_shape
764
+
765
+ def get_idle_timeout_s(self) -> Optional[float]:
766
+ return self._idle_timeout_s
767
+
768
+ def update(self, new_nodes: List[SchedulingNode]) -> None:
769
+ """
770
+ Update the context with the new nodes.
771
+ """
772
+ self._nodes = new_nodes
773
+
774
+ # Update the available node types.
775
+ self._node_type_available = self._compute_available_node_types(
776
+ self._nodes, self._node_type_configs
777
+ )
778
+
779
+ def get_max_num_nodes(self) -> Optional[int]:
780
+ """
781
+ Get the max number of nodes for the entire cluster.
782
+ """
783
+ return self._max_num_nodes
784
+
785
+ def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
786
+ return self._node_type_configs
787
+
788
+ def __str__(self) -> str:
789
+ return "ScheduleContext({} nodes, node_type_available={})".format(
790
+ len(self._nodes), dict(self._node_type_available)
791
+ )
792
+
793
+ def get_launch_requests(self) -> List[LaunchRequest]:
794
+ """
795
+ Get the launch requests for the nodes that are to be launched.
796
+ """
797
+ launch_by_type = defaultdict(int)
798
+ for node in self._nodes:
799
+ if node.status == SchedulingNodeStatus.TO_LAUNCH:
800
+ launch_by_type[node.node_type] += 1
801
+
802
+ launch_requests = []
803
+ for instance_type, count in launch_by_type.items():
804
+ launch_requests.append(
805
+ LaunchRequest(
806
+ instance_type=instance_type,
807
+ count=count,
808
+ id=str(uuid.uuid4()),
809
+ request_ts_ms=time.time_ns() // 1000,
810
+ )
811
+ )
812
+ return launch_requests
813
+
814
+ def get_terminate_requests(
815
+ self,
816
+ ) -> List[TerminationRequest]:
817
+ """
818
+ Get the terminate requests for the nodes that are to be terminated.
819
+ """
820
+ return [
821
+ node.termination_request
822
+ for node in self._nodes
823
+ if node.termination_request is not None
824
+ ]
825
+
826
    def schedule(self, request: SchedulingRequest) -> SchedulingReply:
        """
        Run one scheduling pass: enforce constraints in priority order, then
        schedule gang and plain resource requests, and finally compute the
        launch/terminate decisions.
        """
        logger.debug(
            "Scheduling for request: resource_request={}, gang_resource_request={}, "
            "cluster_constraint={}".format(
                ResourceRequestUtil.to_dict_list(request.resource_requests),
                ProtobufUtil.to_dict_list(request.gang_resource_requests),
                ProtobufUtil.to_dict_list(request.cluster_resource_constraints),
            )
        )

        ctx = ResourceDemandScheduler.ScheduleContext.from_schedule_request(request)

        # Terminate outdated nodes (launch config no longer matches).
        ResourceDemandScheduler._terminate_outdated_nodes(ctx)

        # Enforce the minimal count of nodes for each worker node type.
        ResourceDemandScheduler._enforce_min_workers_per_type(ctx)

        # Enforce the max worker nodes count per node type.
        ResourceDemandScheduler._enforce_max_workers_per_type(ctx)

        # Enforce the max worker nodes count globally.
        ResourceDemandScheduler._enforce_max_workers_global(ctx)

        # Enforce the cluster resource constraints.
        infeasible_constraints = ResourceDemandScheduler._enforce_resource_constraints(
            ctx, request.cluster_resource_constraints
        )

        # Schedule the gang resource requests.
        infeasible_gang_requests = (
            ResourceDemandScheduler._sched_gang_resource_requests(
                ctx, request.gang_resource_requests
            )
        )

        # Schedule the tasks/actor resource requests
        infeasible_requests = ResourceDemandScheduler._sched_resource_requests(
            ctx,
            ResourceRequestUtil.ungroup_by_count(request.resource_requests),
        )

        # Shutdown any idle nodes that's not needed (e.g. no resource constraints.
        # not needed by min_worker count, etc.)
        ResourceDemandScheduler._enforce_idle_termination(ctx)

        # Compute the number of nodes to launch.
        reply = SchedulingReply(
            infeasible_resource_requests=infeasible_requests,
            infeasible_gang_resource_requests=infeasible_gang_requests,
            infeasible_cluster_resource_constraints=infeasible_constraints,
            to_launch=ctx.get_launch_requests(),
            to_terminate=ctx.get_terminate_requests(),
        )

        if self._event_logger is not None:
            # Event logging is best-effort: a logging failure must not fail
            # the scheduling pass.
            try:
                self._event_logger.log_cluster_scheduling_update(
                    launch_requests=reply.to_launch,
                    terminate_requests=reply.to_terminate,
                    infeasible_requests=infeasible_requests,
                    infeasible_gang_requests=infeasible_gang_requests,
                    infeasible_cluster_resource_constraints=infeasible_constraints,
                    cluster_shape=ctx.get_cluster_shape(),
                    node_type_configs=ctx.get_node_type_configs(),
                )
            except Exception:
                logger.exception("Failed to emit event logs.")

        return reply
896
+
897
    @staticmethod
    def _enforce_max_workers_per_type(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for each node type, marking any
        excess nodes TO_TERMINATE and committing the result back to ctx.
        """

        # Get all the nodes by type
        all_nodes = ctx.get_nodes()

        non_terminating_nodes_by_type = defaultdict(list)
        terminating_nodes = []
        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes_by_type[node.node_type].append(node)

        # Step 1. Enforce the max number of workers for each node type.
        for node_type in non_terminating_nodes_by_type.keys():
            non_terminate_nodes_of_type = non_terminating_nodes_by_type[node_type]
            # NOTE(review): assumes every non-terminating node's type exists in
            # the configs (unknown types were already marked TO_TERMINATE by
            # SchedulingNode.new) — a missing key here would raise KeyError.
            node_config = ctx.get_node_type_configs()[node_type]
            num_max_nodes_per_type = node_config.max_worker_nodes
            num_extra_nodes = len(non_terminate_nodes_of_type) - num_max_nodes_per_type

            if num_extra_nodes <= 0:
                # No extra nodes for this type, continue.
                continue

            # Terminate the nodes
            (
                to_terminate,
                remained_nodes,
            ) = ResourceDemandScheduler._select_nodes_to_terminate(
                non_terminate_nodes_of_type,
                num_extra_nodes,
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
                max_num_nodes_per_type=num_max_nodes_per_type,
            )

            non_terminating_nodes_by_type[node_type] = remained_nodes
            terminating_nodes.extend(to_terminate)

        non_terminating_nodes = []
        for nodes in non_terminating_nodes_by_type.values():
            non_terminating_nodes.extend(nodes)

        # Update the context
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes per type."

        ctx.update(terminating_nodes + non_terminating_nodes)

        if terminating_nodes:
            logger.debug(
                f"Terminating {len(terminating_nodes)} "
                "nodes for per node type max num node's constraints."
            )
958
    @staticmethod
    def _enforce_max_workers_global(
        ctx: "ResourceDemandScheduler.ScheduleContext",
    ) -> None:
        """
        Enforce the max number of workers for the entire cluster, marking any
        excess nodes TO_TERMINATE and committing the result back to ctx.
        """
        all_nodes = ctx.get_nodes()

        terminating_nodes = []
        non_terminating_nodes = []

        for node in all_nodes:
            if node.status == SchedulingNodeStatus.TO_TERMINATE:
                terminating_nodes.append(node)
            else:
                non_terminating_nodes.append(node)

        num_max_nodes = ctx.get_max_num_nodes()

        # NOTE(review): `if num_max_nodes` treats max_num_nodes == 0 the same
        # as "no limit". If 0 is a legal configuration this should be
        # `if num_max_nodes is not None` — confirm intended semantics.
        num_to_terminate = (
            max(len(non_terminating_nodes) - num_max_nodes, 0) if num_max_nodes else 0
        )

        if num_to_terminate <= 0:
            # No extra nodes needed to terminate.
            return

        # Terminate the nodes
        (
            to_terminate_nodes,
            non_terminating_nodes,
        ) = ResourceDemandScheduler._select_nodes_to_terminate(
            non_terminating_nodes,
            num_to_terminate,
            TerminationRequest.Cause.MAX_NUM_NODES,
            max_num_nodes=num_max_nodes,
        )

        # NOTE(review): _select_nodes_to_terminate never selects the head
        # node, so this assert can fire when the quota cannot be met without
        # terminating the head — confirm that cannot occur here.
        assert len(to_terminate_nodes) == num_to_terminate, (
            "Terminating {} nodes, failed to terminate {} nodes to "
            "satisfy max_num_nodes={}".format(
                len(to_terminate_nodes),
                num_to_terminate - len(to_terminate_nodes),
                num_max_nodes,
            )
        )

        # Update the context
        terminating_nodes.extend(to_terminate_nodes)
        assert len(all_nodes) == len(
            terminating_nodes + non_terminating_nodes
        ), "The number of nodes should be the same after enforcing max nodes."

        all_nodes = terminating_nodes + non_terminating_nodes
        ctx.update(all_nodes)
1014
+
1015
    @staticmethod
    def _select_nodes_to_terminate(
        nodes: List[SchedulingNode],
        num_to_terminate: int,
        cause: TerminationRequest.Cause,
        max_num_nodes: Optional[int] = None,
        max_num_nodes_per_type: Optional[int] = None,
    ) -> Tuple[List[SchedulingNode], List[SchedulingNode]]:
        """
        Select 'num_to_terminate' of nodes to be terminated
        from the 'nodes' list. It should never select a head node.

        NOTE: the input `nodes` list is mutated in place (sorted, and the
        head node is popped out and re-appended to the remainder).

        Args:
            nodes: The nodes to be terminated.
            num_to_terminate: The number of nodes to be terminated.
            cause: The cause of the termination. Should be one of
                TerminationRequest.Cause.MAX_NUM_NODES or
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

            max_num_nodes: The max number of nodes for the entire cluster only
                used when the cause is TerminationRequest.Cause.MAX_NUM_NODES.
            max_num_nodes_per_type: The max number of nodes for each node type.
                Only used when the cause is
                TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE.

        Returns:
            A tuple of:
                - The terminated nodes.
                - The remained nodes.
        """

        # Sort the nodes for termination.
        nodes.sort(key=ResourceDemandScheduler._sort_nodes_for_termination)

        # Remove the head node from the list.
        head_node = None
        for i, node in enumerate(nodes):
            if node.node_kind == NodeKind.HEAD:
                # Remove the head node from the list.
                head_node = nodes.pop(i)
                break

        terminated_nodes, remained_nodes = (
            nodes[:num_to_terminate],
            # The head could be None if there's no head node being reported yet
            # from the ray cluster.
            nodes[num_to_terminate:] + ([head_node] if head_node else []),
        )

        assert cause in [
            TerminationRequest.Cause.MAX_NUM_NODES,
            TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE,
        ], "Other termination causes don't have to select nodes for termination."

        for node in terminated_nodes:
            node.status = SchedulingNodeStatus.TO_TERMINATE
            node.termination_request = TerminationRequest(
                id=str(uuid.uuid4()),
                instance_id=node.im_instance_id,
                ray_node_id=node.ray_node_id,
                cause=cause,
                instance_type=node.node_type,
                details=(
                    f"Terminating node due to {TerminationRequest.Cause.Name(cause)}: "
                    f"max_num_nodes={max_num_nodes}, "
                    f"max_num_nodes_per_type={max_num_nodes_per_type}"
                ),
            )
            if cause == TerminationRequest.Cause.MAX_NUM_NODES:
                node.termination_request.max_num_nodes = max_num_nodes
            elif cause == TerminationRequest.Cause.MAX_NUM_NODE_PER_TYPE:
                node.termination_request.max_num_nodes_per_type = max_num_nodes_per_type
            else:
                # Defensive: unreachable while the assert above holds, but the
                # assert is stripped under `python -O`.
                raise ValueError("Unknown termination cause: {}".format(cause))

        return terminated_nodes, remained_nodes
1091
+
1092
+ @staticmethod
1093
+ def _sort_nodes_for_termination(node: SchedulingNode) -> Tuple:
1094
+ """
1095
+ Sort the nodes for termination increasingly by:
1096
+
1097
+ 1. First if ray hasn't been started yet
1098
+ 2. Then if the nodes are idle
1099
+ 3. Then with lower resources util nodes first.
1100
+
1101
+ Such that nodes sorted earlier will be terminated first.
1102
+ """
1103
+
1104
+ running_ray = len(node.ray_node_id) > 0
1105
+ # Reverse the idle duration such that the nodes with the largest idle duration
1106
+ # will be terminated first.
1107
+ idle_dur = -1 * node.idle_duration_ms
1108
+ available_resources = node.get_available_resources(
1109
+ ResourceRequestSource.PENDING_DEMAND
1110
+ )
1111
+
1112
+ utils_per_resources = {}
1113
+ for resource, total in node.total_resources.items():
1114
+ if total <= 0:
1115
+ continue
1116
+ utils_per_resources[resource] = (
1117
+ total - available_resources.get(resource, 0)
1118
+ ) / total
1119
+
1120
+ avg_util = (
1121
+ sum(utils_per_resources.values()) / len(utils_per_resources)
1122
+ if utils_per_resources
1123
+ else 0
1124
+ )
1125
+
1126
+ return (running_ray, idle_dur, avg_util)
1127
+
1128
+ @staticmethod
1129
+ def _enforce_min_workers_per_type(
1130
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1131
+ ) -> None:
1132
+ """
1133
+ Enforce the minimal count of nodes for each worker node type.
1134
+ """
1135
+
1136
+ # Count the existing nodes by type
1137
+ count_by_node_type = ctx.get_cluster_shape()
1138
+
1139
+ new_nodes = []
1140
+ # Launch new nodes to satisfy min count for each node type.
1141
+ for (
1142
+ node_type,
1143
+ node_type_config,
1144
+ ) in ctx.get_node_type_configs().items():
1145
+ cur_count = count_by_node_type.get(node_type, 0)
1146
+ min_count = node_type_config.min_worker_nodes
1147
+ if cur_count < min_count:
1148
+ logger.info(
1149
+ f"Adding {min_count - cur_count} nodes to satisfy min count for "
1150
+ f"node type: {node_type}."
1151
+ )
1152
+ new_nodes.extend(
1153
+ [
1154
+ SchedulingNode.from_node_config(
1155
+ copy.deepcopy(node_type_config),
1156
+ status=SchedulingNodeStatus.TO_LAUNCH,
1157
+ node_kind=NodeKind.WORKER,
1158
+ )
1159
+ ]
1160
+ * (min_count - cur_count)
1161
+ )
1162
+ # NOTE: we assume the aggregated number of min workers across all node types
1163
+ # should not exceed any globally enforced max_num_nodes
1164
+
1165
+ # Add the new nodes to the existing nodes and update the context.
1166
+ ctx.update(new_nodes + ctx.get_nodes())
1167
+
1168
+ @staticmethod
1169
+ def _enforce_resource_constraints(
1170
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1171
+ constraints: List[ClusterResourceConstraint],
1172
+ ) -> List[ClusterResourceConstraint]:
1173
+ """
1174
+ Enforce the cluster resource constraints.
1175
+
1176
+ Args:
1177
+ ctx: The schedule context.
1178
+ constraints: The cluster resource constraints.
1179
+
1180
+ Returns:
1181
+ A list of infeasible constraints.
1182
+
1183
+ Notes:
1184
+ It's different from the other scheduling functions since it doesn't actually
1185
+ schedule any resource requests. Instead, it asks if the cluster could be
1186
+ upscale to a certain shape to fulfill the constraints.
1187
+ """
1188
+
1189
+ # NOTE: we currently only have 1 constraint from a cluster, but
1190
+ # we may have multiple in the future.
1191
+ assert len(constraints) <= 1, "Max 1 cluster resource constraint is supported."
1192
+ if len(constraints) == 0:
1193
+ # No cluster resource constraints - nothing needs to be done.
1194
+ return []
1195
+
1196
+ constraint = constraints[0]
1197
+ # Flatten the requests for iterating through.
1198
+ requests = ResourceRequestUtil.ungroup_by_count(constraint.resource_requests)
1199
+
1200
+ # Pass the empty nodes to schedule.
1201
+ scheduled_nodes, infeasible = ResourceDemandScheduler._try_schedule(
1202
+ ctx,
1203
+ requests,
1204
+ resource_request_source=ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT,
1205
+ )
1206
+
1207
+ if infeasible:
1208
+ # Unable to satisfy the constraint.
1209
+ return [constraint]
1210
+
1211
+ ctx.update(scheduled_nodes)
1212
+ return []
1213
+
1214
    @staticmethod
    def _sched_resource_requests(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        requests: List[ResourceRequest],
    ) -> List[ResourceRequest]:
        """
        Schedule the resource requests from pending ray task/actor demand.

        Args:
            ctx: The schedule context.
            requests: The resource requests to schedule.

        Returns:
            A list of infeasible resource requests that could not be placed
            on existing or newly launched nodes.
        """
        nodes, infeasible = ResourceDemandScheduler._try_schedule(
            ctx, requests, resource_request_source=ResourceRequestSource.PENDING_DEMAND
        )

        # Even if some requests are infeasible, commit whatever was
        # successfully scheduled to the context.
        ctx.update(nodes)

        return infeasible
1237
+
1238
    @staticmethod
    def _sched_gang_resource_requests(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        gang_requests: List[GangResourceRequest],
    ) -> List[GangResourceRequest]:
        """
        Schedule the gang resource requests.

        These requests should be scheduled atomically, i.e. either all of the resources
        requests in a gang request are scheduled or none of them are scheduled.

        For now, the gang resource requests represent Ray's placement groups, while it
        could be more general in the future:
            - For STRICT_PACK placement group requests, we combine them into a single
                request and try to schedule them together.
            - For STRICT_SPREAD placement groups requests, they should be scheduled on
                different nodes by leveraging on the node labels that are associated with
                the placement group.
                If there are requests from rescheduling placement groups due to node
                failures, these requests should not be scheduled on nodes with requests
                from the same placement group.


        Args:
            ctx: The schedule context.
            gang_requests: The gang resource requests.

        Returns:
            A list of infeasible gang resource requests.
        """

        def _sort_gang_resource_requests(req: GangResourceRequest) -> Tuple:
            """
            Key function for sorting the gang resource request by:
            1. the number of placement constraints in the gang request.
            2. the number of resource requests in the gang request.
            """
            total_placement_constraints = 0
            for resource_request in req.requests:
                total_placement_constraints += len(
                    resource_request.placement_constraints
                )

            return (total_placement_constraints, len(req.requests))

        infeasible_gang_requests = []
        # Try fulfilling the gang requests one by one, most-constrained first
        # (reverse=True on the sort key above).
        for gang_req in sorted(
            gang_requests, key=_sort_gang_resource_requests, reverse=True
        ):
            requests = gang_req.requests
            # Try to combine requests with affinity constraints into the same request.
            requests = ResourceRequestUtil.combine_requests_with_affinity(requests)

            nodes, infeasible = ResourceDemandScheduler._try_schedule(
                ctx, requests, ResourceRequestSource.PENDING_DEMAND
            )

            if infeasible:
                # Atomicity: if any piece of the gang is infeasible, skip the
                # whole gang request and leave the context untouched.
                infeasible_gang_requests.append(gang_req)
                continue

            # We are able to satisfy the whole gang and thus update the context.
            ctx.update(nodes)

        return infeasible_gang_requests
1306
+
1307
    @staticmethod
    def _try_schedule(
        ctx: "ResourceDemandScheduler.ScheduleContext",
        requests_to_sched: List[ResourceRequest],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[List[SchedulingNode], List[ResourceRequest]]:
        """
        Try to schedule the resource requests on the current context.

        It tries to schedule the requests on the existing nodes first, and
        then try to schedule the requests on new nodes if possible.

        Args:
            requests_to_sched: The resource requests to be scheduled.
            ctx: The current scheduling context.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource
                constraints.

        Returns:
            - List of scheduled nodes to that have part or all of the requests
                scheduled.
            - List of infeasible requests remained that cannot be scheduled.
        """
        # First sort the requests.
        def _sort_resource_request(req: ResourceRequest) -> Tuple:
            """
            Sort the resource requests by:
                1. The length of it's placement constraints.
                2. The number of resources it requests.
                3. The values of resources it requests.
                4. lexicographically for each resource (for stable ordering)

            This is a legacy sorting function for the autoscaler's binpacking
            algo - we do this so that we could have a deterministic scheduling
            results with reasonable fragmentation.
            """
            return (
                len(req.placement_constraints),
                len(req.resources_bundle.values()),
                sum(req.resources_bundle.values()),
                sorted(req.resources_bundle.items()),
            )

        # Largest/most-constrained requests first (reverse=True).
        requests_to_sched = sorted(
            requests_to_sched, key=_sort_resource_request, reverse=True
        )

        existing_nodes = ctx.get_nodes()
        node_type_available = ctx.get_node_type_available()

        # A list of nodes that are either:
        #   1. existing nodes in the cluster. or
        #   2. new nodes that are launched to satisfy the resource requests.
        target_nodes = []

        # Try scheduling resource requests with existing nodes first.
        # Each iteration greedily picks the best-scoring node and removes it
        # from the candidate pool.
        while len(requests_to_sched) > 0 and len(existing_nodes) > 0:
            (
                best_node,
                requests_to_sched,
                existing_nodes,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, existing_nodes, resource_request_source
            )
            if best_node is None:
                # No existing nodes can schedule any more requests.
                break

            target_nodes.append(best_node)

        # If there's any existing nodes left, we will add to the target nodes
        target_nodes.extend(existing_nodes)

        # Try scheduling resource requests with new nodes: seed the pool with
        # one launchable candidate per node type that still has quota.
        node_pools = [
            SchedulingNode.from_node_config(
                ctx.get_node_type_configs()[node_type],
                status=SchedulingNodeStatus.TO_LAUNCH,
                node_kind=NodeKind.WORKER,
            )
            for node_type, num_available in node_type_available.items()
            if num_available > 0
        ]
        while len(requests_to_sched) > 0 and len(node_pools) > 0:
            # Stop launching once the global max node count is reached.
            max_num_nodes = ctx.get_max_num_nodes()
            if max_num_nodes is not None and len(target_nodes) >= max_num_nodes:
                logger.debug(
                    "Max number of nodes reached: {}, "
                    "cannot launch more nodes.".format(max_num_nodes)
                )
                break

            (
                best_node,
                requests_to_sched,
                node_pools,
            ) = ResourceDemandScheduler._sched_best_node(
                requests_to_sched, node_pools, resource_request_source
            )
            if best_node is None:
                break

            target_nodes.append(best_node)
            # Refill the pool with another candidate of the same node type if
            # its per-type quota still allows launching one more.
            node_type_available[best_node.node_type] -= 1
            if node_type_available[best_node.node_type] > 0:
                node_pools.append(
                    SchedulingNode.from_node_config(
                        ctx.get_node_type_configs()[best_node.node_type],
                        status=SchedulingNodeStatus.TO_LAUNCH,
                        node_kind=NodeKind.WORKER,
                    )
                )

        return target_nodes, requests_to_sched
1425
+
1426
    @staticmethod
    def _sched_best_node(
        requests: List[ResourceRequest],
        nodes: List[SchedulingNode],
        resource_request_source: ResourceRequestSource,
    ) -> Tuple[SchedulingNode, List[ResourceRequest], List[SchedulingNode]]:
        """
        Schedule the requests on the best node.
        A simple greedy algorithm is used to schedule the requests:
            1. Try to schedule the requests on each node.
            2. Sort the nodes by a score
            3. Return the node with the highest score.

        The highest score node is updated with the scheduled requests, and the node is
        removed from the node list.

        Args:
            requests: The resource requests to be scheduled.
            nodes: The node candidates to be scheduled on. The nodes will be updated
                after the scheduling attempt, i.e. the node that is scheduled will be
                removed from the list.
            resource_request_source: The source of the resource request, i.e.
                pending demands from ray actors/tasks or cluster resource constraints.

        Returns:
            best_node: The best node to schedule the requests (None if no node
                could take any request).
            infeasible: The infeasible requests that cannot be scheduled on the best
                node.
            nodes: Remaining nodes after the best node is removed.
        """
        results = []

        # A temporary data class to store the scheduling result.
        @dataclass
        class ScheduleResult:
            # The node candidate after a scheduling attempt.
            node: SchedulingNode
            # The infeasible resource requests that are not scheduled.
            infeasible_requests: List[ResourceRequest]
            # The index of the node in the original node list.
            idx: int
            # the score of the scheduling node to compare with others.
            score: UtilizationScore

        # Deep-copy so trial scheduling mutates only the copies; indices in
        # nodes_copy line up 1:1 with the caller's `nodes` list.
        nodes_copy = copy.deepcopy(nodes)

        # Iterate through each node and modify the node's available resources
        # if the requests are schedulable.
        for idx, node in enumerate(nodes_copy):
            remaining, score = node.try_schedule(requests, resource_request_source)

            if len(remaining) == len(requests):
                # The node cannot schedule any of the requests.
                continue

            results.append(ScheduleResult(node, remaining, idx, score))

        # No nodes can schedule any of the requests.
        if len(results) == 0:
            logger.debug(
                "No nodes can schedule the requests: {}, for nodes: {}".format(
                    ResourceRequestUtil.to_dict_list(requests), nodes
                )
            )
            return None, requests, nodes

        # Sort the results by score.
        results = sorted(results, key=lambda r: r.score, reverse=True)
        best_result = results[0]

        # Remove the best node from the caller's list (by the index recorded
        # above); the mutated copy is what gets returned as best_node.
        nodes.pop(best_result.idx)
        logger.debug(
            "Best node: {}, score: {}, remaining requests: {}".format(
                best_result.node,
                best_result.score,
                ResourceRequestUtil.to_dict_list(best_result.infeasible_requests),
            )
        )
        return best_result.node, best_result.infeasible_requests, nodes
1506
+
1507
+ @staticmethod
1508
+ def _terminate_outdated_nodes(
1509
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1510
+ ) -> None:
1511
+ """
1512
+ Terminate the nodes that are outdated, i.e. the node type config has been
1513
+ updated or the node's launch config hash is outdated.
1514
+
1515
+ Args:
1516
+ ctx: The schedule context.
1517
+ """
1518
+ nodes = ctx.get_nodes()
1519
+
1520
+ if ctx._disable_launch_config_check:
1521
+ # Outdated nodes check through launch config check is disabled.
1522
+ return
1523
+
1524
+ for node in nodes:
1525
+ if node.status != SchedulingNodeStatus.SCHEDULABLE:
1526
+ # We don't need to care about the non-running nodes.
1527
+ continue
1528
+
1529
+ if node.node_kind == NodeKind.HEAD:
1530
+ # We should not be terminating the head node even if it's outdated.
1531
+ logger.warning(
1532
+ f"Head node {node.im_instance_id}(ray={node.ray_node_id}) is "
1533
+ "outdated with node config changes. "
1534
+ "Please check the node's config or restart the cluster or restart "
1535
+ "the head node. Autoscaler is not able to shutdown the outdated "
1536
+ "head node"
1537
+ )
1538
+ continue
1539
+ node_type = node.node_type
1540
+ node_type_config = ctx.get_node_type_configs().get(node_type)
1541
+ if node_type_config is None or (
1542
+ node_type_config.launch_config_hash
1543
+ and node_type_config.launch_config_hash != node.launch_config_hash
1544
+ ):
1545
+ # The node type config has been updated, and the node's launch config
1546
+ # hash is outdated.
1547
+ node.status = SchedulingNodeStatus.TO_TERMINATE
1548
+ node.termination_request = TerminationRequest(
1549
+ id=str(time.time_ns()),
1550
+ instance_id=node.im_instance_id,
1551
+ ray_node_id=node.ray_node_id,
1552
+ instance_type=node.node_type,
1553
+ cause=TerminationRequest.Cause.OUTDATED,
1554
+ details=f"node from {node.node_type} has outdated config",
1555
+ )
1556
+
1557
+ ctx.update(nodes)
1558
+
1559
+ @staticmethod
1560
+ def _enforce_idle_termination(
1561
+ ctx: "ResourceDemandScheduler.ScheduleContext",
1562
+ ) -> None:
1563
+ """
1564
+ Enforce the idle termination for the nodes that are not needed by the cluster
1565
+ resource constraints and idle for too long.
1566
+
1567
+ Args:
1568
+ ctx: The schedule context.
1569
+ """
1570
+ count_by_node_type = ctx.get_cluster_shape()
1571
+ node_type_configs = ctx.get_node_type_configs()
1572
+ terminate_nodes_by_type: Dict[NodeType, int] = defaultdict(int)
1573
+
1574
+ nodes = ctx.get_nodes()
1575
+ s_to_ms = 1000
1576
+ for node in nodes:
1577
+ if node.status != SchedulingNodeStatus.SCHEDULABLE:
1578
+ # We don't need to care about the non-running nodes.
1579
+ continue
1580
+
1581
+ if node.node_kind == NodeKind.HEAD:
1582
+ # The head node is not subject to idle termination.
1583
+ continue
1584
+
1585
+ idle_timeout_s = ctx.get_idle_timeout_s()
1586
+ # Override the scheduler idle_timeout_s if set for this node_type.
1587
+ node_type = node.node_type
1588
+ if node_type in node_type_configs:
1589
+ if node_type_configs[node_type].idle_timeout_s is not None:
1590
+ idle_timeout_s = node_type_configs[node_type].idle_timeout_s
1591
+ if idle_timeout_s is None:
1592
+ # No idle timeout is set, skip the idle termination.
1593
+ continue
1594
+
1595
+ if node.idle_duration_ms <= idle_timeout_s * s_to_ms:
1596
+ # The node is not idle for too long, skip it.
1597
+ continue
1598
+
1599
+ if node.sched_requests[ResourceRequestSource.CLUSTER_RESOURCE_CONSTRAINT]:
1600
+ # The node is needed by the resource constraints.
1601
+ # Skip it.
1602
+ if node.idle_duration_ms > ctx.get_idle_timeout_s() * s_to_ms:
1603
+ logger.debug(
1604
+ "Node {} (idle for {} secs) is needed by the cluster resource "
1605
+ "constraints, skip idle termination.".format(
1606
+ node.ray_node_id, node.idle_duration_ms / s_to_ms
1607
+ )
1608
+ )
1609
+ continue
1610
+
1611
+ # Honor the min_worker_nodes setting for the node type.
1612
+ min_count = 0
1613
+ if node_type in node_type_configs:
1614
+ min_count = node_type_configs[node_type].min_worker_nodes
1615
+ if (
1616
+ count_by_node_type.get(node_type, 0)
1617
+ - terminate_nodes_by_type[node_type]
1618
+ <= min_count
1619
+ ):
1620
+ logger.info(
1621
+ "Node {} (idle for {} secs) belongs to node_type {} and is "
1622
+ "required by min_worker_nodes, skipping idle termination.".format(
1623
+ node.ray_node_id, node.idle_duration_ms / s_to_ms, node_type
1624
+ )
1625
+ )
1626
+ continue
1627
+
1628
+ terminate_nodes_by_type[node.node_type] += 1
1629
+ # The node is idle for too long, terminate it.
1630
+ node.status = SchedulingNodeStatus.TO_TERMINATE
1631
+ node.termination_request = TerminationRequest(
1632
+ id=str(uuid.uuid4()),
1633
+ instance_id=node.im_instance_id,
1634
+ ray_node_id=node.ray_node_id,
1635
+ cause=TerminationRequest.Cause.IDLE,
1636
+ instance_type=node.node_type,
1637
+ idle_duration_ms=node.idle_duration_ms,
1638
+ details=f"idle for {node.idle_duration_ms/s_to_ms} secs > "
1639
+ f"timeout={idle_timeout_s} secs",
1640
+ )
1641
+
1642
+ ctx.update(nodes)
.venv/lib/python3.11/site-packages/ray/autoscaler/v2/schema.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from dataclasses import dataclass, field
3
+ from enum import Enum
4
+ from typing import Dict, List, Optional, Tuple
5
+
6
+ from ray.autoscaler.v2.instance_manager.common import InstanceUtil
7
+ from ray.core.generated.autoscaler_pb2 import NodeState, NodeStatus
8
+ from ray.core.generated.instance_manager_pb2 import Instance
9
+
10
+ # TODO(rickyx): once we have graceful shutdown, we could populate
11
+ # the failure detail with the actual termination message. As of now,
12
+ # we will use a more generic message to include cases such as:
13
+ # (idle termination, node death, crash, preemption, etc)
14
+ NODE_DEATH_CAUSE_RAYLET_DIED = "NodeTerminated"
15
+
16
+
17
+ # e.g., cpu_4_ondemand.
18
+ NodeType = str
19
+
20
+
21
@dataclass
class ResourceUsage:
    """Capacity vs. consumption of a single named resource (e.g. "CPU")."""

    # Resource name.
    resource_name: str = ""
    # Total resource.
    total: float = 0.0
    # Resource used.
    used: float = 0.0
29
+
30
+
31
@dataclass
class NodeUsage:
    """Per-node resource usage snapshot plus idle time."""

    # The node resource usage.
    usage: List[ResourceUsage]
    # How long the node has been idle.
    idle_time_ms: int
37
+
38
+
39
@dataclass
class NodeInfo:
    """Descriptive and usage information for one cluster node (alive, pending,
    or failed)."""

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # Cloud instance id.
    instance_id: str
    # Ip address of the node when alive.
    ip_address: str
    # The status of the node. Optional for pending nodes.
    node_status: Optional[str] = None
    # ray node id in hex. None if still pending.
    node_id: Optional[str] = None
    # Resource usage breakdown if node is running.
    resource_usage: Optional[NodeUsage] = None
    # Failure detail if the node failed.
    failure_detail: Optional[str] = None
    # Descriptive details.
    details: Optional[str] = None
    # Activity on the node.
    node_activity: Optional[List[str]] = None

    def total_resources(self) -> Dict[str, float]:
        """Return {resource_name: total}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.total for r in self.resource_usage.usage}

    def available_resources(self) -> Dict[str, float]:
        """Return {resource_name: total - used}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.total - r.used for r in self.resource_usage.usage}

    def used_resources(self) -> Dict[str, float]:
        """Return {resource_name: used}; empty dict if no usage is reported."""
        if self.resource_usage is None:
            return {}
        return {r.resource_name: r.used for r in self.resource_usage.usage}
76
+
77
+
78
@dataclass
class LaunchRequest:
    """A request to launch `count` instances of one node type, with its
    current state (pending or failed)."""

    class Status(Enum):
        # Launch request failed.
        FAILED = "FAILED"
        # Launch request is still in flight.
        PENDING = "PENDING"

    # The instance type name, e.g. p3.2xlarge
    instance_type_name: str
    # ray node type name.
    ray_node_type_name: str
    # count.
    count: int
    # State: (e.g. PENDING, FAILED)
    state: Status
    # When the launch request was made in unix timestamp in secs.
    request_ts_s: int
    # When the launch request failed unix timestamp in secs if failed.
    failed_ts_s: Optional[int] = None
    # Request details, e.g. error reason if the launch request failed.
    details: Optional[str] = None
98
+
99
+
100
@dataclass
class ResourceRequestByCount:
    """One resource bundle shape together with how many bundles share it."""

    # Bundles in the demand.
    bundle: Dict[str, float]
    # Number of bundles with the same shape.
    count: int

    def __str__(self) -> str:
        """Render as e.g. "[3 {'CPU': 1.0}]"."""
        return "[{} {}]".format(self.count, self.bundle)
109
+
110
+
111
@dataclass
class ResourceDemand:
    """Base class for a set of demanded resource bundles grouped by shape."""

    # The bundles in the demand with shape and count info.
    bundles_by_count: List[ResourceRequestByCount]
115
+
116
+
117
@dataclass
class PlacementGroupResourceDemand(ResourceDemand):
    """Resource demand from a placement group; parses its `details` string of
    the form "<pg_id>:<strategy>|<state>" into structured fields."""

    # Details string (parsed into below information)
    details: str
    # Placement group's id.
    pg_id: Optional[str] = None
    # Strategy, e.g. STRICT_SPREAD
    strategy: Optional[str] = None
    # Placement group's state, e.g. PENDING
    state: Optional[str] = None

    def __post_init__(self):
        """Best-effort parse of `details`; leaves fields None on mismatch."""
        if not self.details:
            return

        # Details in the format of <pg_id>:<strategy>|<state>, parse
        # it into the above fields.
        pattern = r"^.*:.*\|.*$"
        match = re.match(pattern, self.details)
        if not match:
            return

        # Split only on the first separator: a details string containing extra
        # ':' or '|' characters would otherwise raise ValueError on unpacking.
        pg_id, rest = self.details.split(":", 1)
        strategy, state = rest.split("|", 1)
        self.pg_id = pg_id
        self.strategy = strategy
        self.state = state
144
+
145
+
146
@dataclass
class RayTaskActorDemand(ResourceDemand):
    """Marker subclass of ResourceDemand for Ray task/actor demand."""

    pass
149
+
150
+
151
@dataclass
class ClusterConstraintDemand(ResourceDemand):
    """Marker subclass of ResourceDemand for cluster resource constraints."""

    pass
154
+
155
+
156
@dataclass
class ResourceDemandSummary:
    """Bucketed summary of all outstanding resource demand sources."""

    # Placement group demand.
    placement_group_demand: List[PlacementGroupResourceDemand] = field(
        default_factory=list
    )
    # Ray task actor demand.
    ray_task_actor_demand: List[RayTaskActorDemand] = field(default_factory=list)
    # Cluster constraint demand.
    cluster_constraint_demand: List[ClusterConstraintDemand] = field(
        default_factory=list
    )
168
+
169
+
170
@dataclass
class Stats:
    """Timing/version metadata about how the cluster status was gathered."""

    # How long it took to get the GCS request.
    # This is required when initializing the Stats since it should be calculated before
    # the request was made.
    gcs_request_time_s: float
    # How long it took to get all live instances from node provider.
    none_terminated_node_request_time_s: Optional[float] = None
    # How long for autoscaler to process the scaling decision.
    autoscaler_iteration_time_s: Optional[float] = None
    # The last seen autoscaler state version from Ray.
    autoscaler_version: Optional[str] = None
    # The last seen cluster state resource version.
    cluster_resource_state_version: Optional[str] = None
    # Request made time unix timestamp: when the data was pulled from GCS.
    request_ts_s: Optional[int] = None
186
+
187
+
188
@dataclass
class ClusterStatus:
    """Aggregated, report-friendly view of the autoscaler's cluster state."""

    # Healthy nodes information (non-idle)
    active_nodes: List[NodeInfo] = field(default_factory=list)
    # Idle node information
    idle_nodes: List[NodeInfo] = field(default_factory=list)
    # Pending launches.
    pending_launches: List[LaunchRequest] = field(default_factory=list)
    # Failed launches.
    failed_launches: List[LaunchRequest] = field(default_factory=list)
    # Pending nodes.
    pending_nodes: List[NodeInfo] = field(default_factory=list)
    # Failures
    failed_nodes: List[NodeInfo] = field(default_factory=list)
    # Resource usage summary for entire cluster.
    cluster_resource_usage: List[ResourceUsage] = field(default_factory=list)
    # Demand summary.
    resource_demands: ResourceDemandSummary = field(
        default_factory=ResourceDemandSummary
    )
    # Query metrics. NOTE: Stats requires gcs_request_time_s, so a bare
    # `default_factory=Stats` would raise TypeError whenever the default is
    # actually used; default to a zeroed Stats instead.
    stats: Stats = field(default_factory=lambda: Stats(gcs_request_time_s=0.0))

    def total_resources(self) -> Dict[str, float]:
        """Return {resource_name: total} across the whole cluster."""
        return {r.resource_name: r.total for r in self.cluster_resource_usage}

    def available_resources(self) -> Dict[str, float]:
        """Return {resource_name: total - used} across the whole cluster."""
        return {r.resource_name: r.total - r.used for r in self.cluster_resource_usage}

    # TODO(rickyx): we don't show infeasible requests as of now.
    # (They will just be pending forever as part of the demands)
    # We should show them properly in the future.
220
+
221
+
222
+ @dataclass
223
+ class AutoscalerInstance:
224
+ """
225
+ AutoscalerInstance represents an instance that's managed by the autoscaler.
226
+ This includes two states:
227
+ 1. the instance manager state: information of the underlying cloud instance.
228
+ 2. the ray node state, e.g. resources, ray node status.
229
+
230
+ The two states are linked by the cloud instance id, which should be set
231
+ when the ray node is started.
232
+ """
233
+
234
+ # The cloud instance id. It could be None if the instance hasn't been assigned
235
+ # a cloud instance id, e.g. the instance is still in QUEUED or REQUESTED status.
236
+ cloud_instance_id: Optional[str] = None
237
+
238
+ # The ray node state status. It could be None when no ray node is running
239
+ # or has run on the cloud instance: for example, ray is still being installed
240
+ # or the instance manager hasn't had a cloud instance assigned (e.g. QUEUED,
241
+ # REQUESTED).
242
+ ray_node: Optional[NodeState] = None
243
+
244
+ # The instance manager instance state. It would be None when the ray_node is not
245
+ # None.
246
+ # It could be None iff:
247
+ # 1. There's a ray node, but the instance manager hasn't discovered the
248
+ # cloud instance that's running this ray process yet. This could happen since
249
+ # the instance manager only discovers instances periodically.
250
+ #
251
+ # 2. There was a ray node running on the cloud instance, which was already stopped
252
+ # and removed from the instance manager state. But the ray state is still lagging
253
+ # behind.
254
+ #
255
+ # 3. There is a ray node that's unmanaged by the instance manager.
256
+ #
257
+ im_instance: Optional[Instance] = None
258
+
259
+ # | cloud_instance_id | ray_node | im_instance |
260
+ # |-------------------|----------|-------------|
261
+ # | None | None | None | Not possible.
262
+ # | None | None | not None | OK. An instance hasn't had ray running on it yet. # noqa E501
263
+ # | None | Not None | None | OK. Possible if the ray node is not started by autoscaler. # noqa E501
264
+ # | None | Not None | not None | Not possible - no way to link im instance with ray node. # noqa E501
265
+ # | not None | None | None | Not possible since cloud instance id is either part of im state or ray node. # noqa E501
266
+ # | not None | None | not None | OK. e.g. An instance that's not running ray yet. # noqa E501
267
+ # | not None | Not None | None | OK. See scenario 1, 2, 3 above.
268
+ # | not None | Not None | not None | OK. An instance that's running ray.
269
+ def validate(self) -> Tuple[bool, str]:
270
+ """Validate the autoscaler instance state.
271
+
272
+ Returns:
273
+ A tuple of (valid, error_msg) where:
274
+ - valid is whether the state is valid
275
+ - error_msg is the error message for the validation results.
276
+ """
277
+
278
+ state_combinations = {
279
+ # (cloud_instance_id is None, ray_node is None, im_instance is None): (valid, error_msg) # noqa E501
280
+ (True, True, True): (False, "Not possible"),
281
+ (True, True, False): (True, ""),
282
+ (True, False, True): (
283
+ True,
284
+ "There's a ray node w/o cloud instance id, must be started not "
285
+ "by autoscaler",
286
+ ),
287
+ (True, False, False): (
288
+ False,
289
+ "Not possible - no way to link im instance with ray node",
290
+ ),
291
+ (False, True, True): (
292
+ False,
293
+ "Not possible since cloud instance id is either part of "
294
+ "im state or ray node",
295
+ ),
296
+ (False, True, False): (True, ""),
297
+ (False, False, True): (True, ""),
298
+ (False, False, False): (True, ""),
299
+ }
300
+
301
+ valid, error_msg = state_combinations[
302
+ (
303
+ self.cloud_instance_id is None,
304
+ self.ray_node is None,
305
+ self.im_instance is None,
306
+ )
307
+ ]
308
+ if not valid:
309
+ return valid, error_msg
310
+
311
+ if self.im_instance is not None and self.ray_node is None:
312
+ # We don't see a ray node, but tracking an im instance.
313
+ if self.cloud_instance_id is None:
314
+ if InstanceUtil.is_cloud_instance_allocated(self.im_instance.status):
315
+ return (
316
+ False,
317
+ "instance should be in a status where cloud instance "
318
+ "is not allocated.",
319
+ )
320
+ else:
321
+ if not InstanceUtil.is_cloud_instance_allocated(
322
+ self.im_instance.status
323
+ ):
324
+ return (
325
+ False,
326
+ "instance should be in a status where cloud instance is "
327
+ "allocated.",
328
+ )
329
+
330
+ if self.ray_node is not None:
331
+ if self.cloud_instance_id != self.ray_node.instance_id:
332
+ return False, "cloud instance id doesn't match."
333
+
334
+ if self.im_instance is not None and self.cloud_instance_id is not None:
335
+ if self.cloud_instance_id != self.im_instance.cloud_instance_id:
336
+ return False, "cloud instance id doesn't match."
337
+
338
+ return True, ""
339
+
340
+ def is_ray_running(self) -> bool:
341
+ """Whether the ray node is running."""
342
+ return self.ray_node is not None and self.ray_node.status in [
343
+ NodeStatus.RUNNING,
344
+ NodeStatus.IDLE,
345
+ ]
346
+
347
+ def is_ray_stop(self) -> bool:
348
+ """Whether the ray node is stopped."""
349
+ return self.ray_node is None or self.ray_node.status in [
350
+ NodeStatus.DEAD,
351
+ ]