koichi12 commited on
Commit
293db81
·
verified ·
1 Parent(s): c590e6b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py +8 -0
  4. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py +0 -0
  12. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py +0 -0
  13. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json +130 -0
  17. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json +294 -0
  18. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py +208 -0
  19. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py +488 -0
  20. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py +1508 -0
  21. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py +825 -0
  22. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py +40 -0
  23. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py +652 -0
  24. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py +921 -0
  25. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py +1631 -0
  26. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py +140 -0
  27. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py +129 -0
  28. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py +75 -0
  29. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py +106 -0
  30. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py +0 -0
  31. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py +91 -0
  32. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py +246 -0
  33. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py +398 -0
  34. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py +37 -0
  35. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py +375 -0
  36. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py +15 -0
  37. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py +0 -0
  38. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py +121 -0
  43. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py +110 -0
  44. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py +304 -0
  45. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py +33 -0
  46. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py +719 -0
  47. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py +221 -0
  48. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py +165 -0
  49. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py +77 -0
  50. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py +292 -0
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.43 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os
from pathlib import Path

from ray.autoscaler import sdk

# Only the public SDK surface is re-exported from this package.
__all__ = ["sdk"]

# Absolute path of the directory containing this package.
# NOTE(review): consumers of this constant are not visible in this file.
AUTOSCALER_DIR_PATH = Path(os.path.abspath(os.path.dirname(__file__)))
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (560 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc ADDED
Binary file (4.84 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc ADDED
Binary file (19 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc ADDED
Binary file (1.99 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (13.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc ADDED
Binary file (1.32 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc ADDED
Binary file (9.59 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (26.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3
+ "contentVersion": "1.0.0.0",
4
+ "parameters": {
5
+ "clusterId": {
6
+ "type": "string",
7
+ "metadata": {
8
+ "description": "Unique string appended to resource names to isolate resources from different ray clusters."
9
+ }
10
+ },
11
+ "subnet": {
12
+ "type": "string",
13
+ "metadata": {
14
+ "description": "Subnet parameters."
15
+ }
16
+ },
17
+ "msiName": {
18
+ "type": "string",
19
+ "metadata": {
20
+ "description": "Managed service identity."
21
+ }
22
+ },
23
+ "msiResourceGroup": {
24
+ "type": "string",
25
+ "metadata": {
26
+ "description": "Managed service identity resource group."
27
+ }
28
+ },
29
+ "createMsi": {
30
+ "type": "bool",
31
+ "defaultValue": true
32
+ }
33
+ },
34
+ "variables": {
35
+ "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
36
+ "location": "[resourceGroup().location]",
37
+ "roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]",
38
+ "nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]",
39
+ "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
40
+ "vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]",
41
+ "subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]"
42
+ },
43
+ "resources": [
44
+ {
45
+ "condition": "[parameters('createMsi')]",
46
+ "type": "Microsoft.ManagedIdentity/userAssignedIdentities",
47
+ "apiVersion": "2018-11-30",
48
+ "location": "[variables('location')]",
49
+ "name": "[parameters('msiName')]"
50
+ },
51
+ {
52
+ "type": "Microsoft.Authorization/roleAssignments",
53
+ "apiVersion": "2020-08-01-preview",
54
+ "name": "[guid(variables('roleAssignmentName'))]",
55
+ "properties": {
56
+ "principalId": "[reference(resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName')), '2018-11-30').principalId]",
57
+ "roleDefinitionId": "[variables('contributor')]",
58
+ "scope": "[resourceGroup().id]",
59
+ "principalType": "ServicePrincipal"
60
+ },
61
+ "dependsOn": [
62
+ "[parameters('msiName')]"
63
+ ]
64
+ },
65
+ {
66
+ "type": "Microsoft.Network/networkSecurityGroups",
67
+ "apiVersion": "2019-02-01",
68
+ "name": "[variables('nsgName')]",
69
+ "location": "[variables('location')]",
70
+ "properties": {
71
+ "securityRules": [
72
+ {
73
+ "name": "SSH",
74
+ "properties": {
75
+ "priority": 1000,
76
+ "protocol": "TCP",
77
+ "access": "Allow",
78
+ "direction": "Inbound",
79
+ "sourceAddressPrefix": "*",
80
+ "sourcePortRange": "*",
81
+ "destinationAddressPrefix": "*",
82
+ "destinationPortRange": "22"
83
+ }
84
+ }
85
+ ]
86
+ }
87
+ },
88
+ {
89
+ "type": "Microsoft.Network/virtualNetworks",
90
+ "apiVersion": "2019-11-01",
91
+ "name": "[variables('vnetName')]",
92
+ "location": "[variables('location')]",
93
+ "properties": {
94
+ "addressSpace": {
95
+ "addressPrefixes": [
96
+ "[parameters('subnet')]"
97
+ ]
98
+ },
99
+ "subnets": [
100
+ {
101
+ "name": "[variables('subnetName')]",
102
+ "properties": {
103
+ "addressPrefix": "[parameters('subnet')]",
104
+ "networkSecurityGroup": {
105
+ "id": "[variables('nsg')]"
106
+ }
107
+ }
108
+ }
109
+ ]
110
+ },
111
+ "dependsOn": [
112
+ "[variables('nsg')]"
113
+ ]
114
+ }
115
+ ],
116
+ "outputs": {
117
+ "subnet": {
118
+ "type": "string",
119
+ "value": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vnetName'), variables('subnetName'))]"
120
+ },
121
+ "nsg": {
122
+ "type": "string",
123
+ "value": "[variables('nsg')]"
124
+ },
125
+ "msi": {
126
+ "type": "string",
127
+ "value": "[resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName'))]"
128
+ }
129
+ }
130
+ }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3
+ "contentVersion": "1.0.0.0",
4
+ "parameters": {
5
+ "vmName": {
6
+ "type": "string",
7
+ "metadata": {
8
+ "description": "The name of your Virtual Machine."
9
+ }
10
+ },
11
+ "adminUsername": {
12
+ "type": "string",
13
+ "metadata": {
14
+ "description": "Username for the Virtual Machine."
15
+ }
16
+ },
17
+ "publicKey": {
18
+ "type": "securestring",
19
+ "metadata": {
20
+ "description": "SSH Key for the Virtual Machine"
21
+ }
22
+ },
23
+ "imagePublisher": {
24
+ "type": "string",
25
+ "metadata": {
26
+ "description": "The publisher of the VM image"
27
+ }
28
+ },
29
+ "imageOffer": {
30
+ "type": "string",
31
+ "metadata": {
32
+ "description": "The offer of the VM image"
33
+ }
34
+ },
35
+ "imageSku": {
36
+ "type": "string",
37
+ "metadata": {
38
+ "description": "The sku of the VM image"
39
+ }
40
+ },
41
+ "imageVersion": {
42
+ "type": "string",
43
+ "metadata": {
44
+ "description": "The version of the VM image"
45
+ }
46
+ },
47
+ "vmSize": {
48
+ "type": "string",
49
+ "metadata": {
50
+ "description": "The size of the VM"
51
+ }
52
+ },
53
+ "vmTags": {
54
+ "type": "object",
55
+ "metadata": {
56
+ "description": "Tags for the VM"
57
+ }
58
+ },
59
+ "vmCount": {
60
+ "type": "int",
61
+ "metadata": {
62
+ "description": "Number of VMs to deploy"
63
+ }
64
+ },
65
+ "provisionPublicIp": {
66
+ "type": "bool",
67
+ "defaultValue": true,
68
+ "metadata": {
69
+ "description": "If true creates a public ip"
70
+ }
71
+ },
72
+ "priority": {
73
+ "type": "string",
74
+ "defaultValue": "Regular",
75
+ "metadata": {
76
+ "description": "Specifies the priority for the virtual machine."
77
+ }
78
+ },
79
+ "evictionPolicy": {
80
+ "type": "string",
81
+ "defaultValue": "Delete",
82
+ "metadata": {
83
+ "description": "Specifies the eviction policy for the virtual machine."
84
+ }
85
+ },
86
+ "billingProfile": {
87
+ "type": "object",
88
+ "defaultValue": {},
89
+ "metadata": {
90
+ "description": "Specifies the maximum price to pay for Azure Spot VM."
91
+ }
92
+ },
93
+ "msi": {
94
+ "type": "string",
95
+ "metadata": {
96
+ "description": "Managed service identity resource id."
97
+ }
98
+ },
99
+ "nsg": {
100
+ "type": "string",
101
+ "metadata": {
102
+ "description": "Network security group resource id."
103
+ }
104
+ },
105
+ "subnet": {
106
+ "type": "string",
107
+ "metadata": {
108
+ "description": "Subnet resource id."
109
+ }
110
+ },
111
+ "enableAcceleratedNetworking": {
112
+ "type": "bool",
113
+ "defaultValue": false,
114
+ "metadata": {
115
+ "description": "Whether to enable accelerated networking."
116
+ }
117
+ }
118
+ },
119
+ "variables": {
120
+ "location": "[resourceGroup().location]",
121
+ "networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
122
+ "networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
123
+ "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
124
+ "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
125
+ "osDiskType": "Standard_LRS",
126
+ "publicIpAddressName": "[concat(parameters('vmName'), '-ip')]"
127
+ },
128
+ "resources": [
129
+ {
130
+ "type": "Microsoft.Network/networkInterfaces",
131
+ "apiVersion": "2020-06-01",
132
+ "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
133
+ "location": "[variables('location')]",
134
+ "dependsOn": [
135
+ "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
136
+ ],
137
+ "copy": {
138
+ "name": "NICPublicCopy",
139
+ "count": "[parameters('vmCount')]"
140
+ },
141
+ "properties": {
142
+ "ipConfigurations": [
143
+ {
144
+ "name": "[variables('networkIpConfig')]",
145
+ "properties": {
146
+ "subnet": {
147
+ "id": "[parameters('subnet')]"
148
+ },
149
+ "privateIPAllocationMethod": "Dynamic",
150
+ "publicIpAddress": {
151
+ "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
152
+ }
153
+ }
154
+ }
155
+ ],
156
+ "networkSecurityGroup": {
157
+ "id": "[parameters('nsg')]"
158
+ },
159
+ "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
160
+ },
161
+ "condition": "[parameters('provisionPublicIp')]"
162
+ },
163
+ {
164
+ "type": "Microsoft.Network/networkInterfaces",
165
+ "apiVersion": "2020-06-01",
166
+ "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
167
+ "location": "[variables('location')]",
168
+ "copy": {
169
+ "name": "NICPrivateCopy",
170
+ "count": "[parameters('vmCount')]"
171
+ },
172
+ "properties": {
173
+ "ipConfigurations": [
174
+ {
175
+ "name": "[variables('networkIpConfig')]",
176
+ "properties": {
177
+ "subnet": {
178
+ "id": "[parameters('subnet')]"
179
+ },
180
+ "privateIPAllocationMethod": "Dynamic"
181
+ }
182
+ }
183
+ ],
184
+ "networkSecurityGroup": {
185
+ "id": "[parameters('nsg')]"
186
+ },
187
+ "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
188
+ },
189
+ "condition": "[not(parameters('provisionPublicIp'))]"
190
+ },
191
+ {
192
+ "type": "Microsoft.Network/publicIpAddresses",
193
+ "apiVersion": "2019-02-01",
194
+ "name": "[concat(variables('publicIpAddressName'), copyIndex())]",
195
+ "location": "[variables('location')]",
196
+ "properties": {
197
+ "publicIpAllocationMethod": "Static",
198
+ "publicIPAddressVersion": "IPv4"
199
+ },
200
+ "copy": {
201
+ "name": "PublicIpCopy",
202
+ "count": "[parameters('vmCount')]"
203
+ },
204
+ "sku": {
205
+ "name": "Basic",
206
+ "tier": "Regional"
207
+ },
208
+ "condition": "[parameters('provisionPublicIp')]"
209
+ },
210
+ {
211
+ "type": "Microsoft.Compute/virtualMachines",
212
+ "apiVersion": "2019-03-01",
213
+ "name": "[concat(parameters('vmName'), copyIndex())]",
214
+ "location": "[variables('location')]",
215
+ "dependsOn": [
216
+ "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
217
+ ],
218
+ "copy": {
219
+ "name": "VmCopy",
220
+ "count": "[parameters('vmCount')]"
221
+ },
222
+ "tags": "[parameters('vmTags')]",
223
+ "properties": {
224
+ "hardwareProfile": {
225
+ "vmSize": "[parameters('vmSize')]"
226
+ },
227
+ "storageProfile": {
228
+ "osDisk": {
229
+ "createOption": "fromImage",
230
+ "managedDisk": {
231
+ "storageAccountType": "[variables('osDiskType')]"
232
+ }
233
+ },
234
+ "imageReference": {
235
+ "publisher": "[parameters('imagePublisher')]",
236
+ "offer": "[parameters('imageOffer')]",
237
+ "sku": "[parameters('imageSku')]",
238
+ "version": "[parameters('imageVersion')]"
239
+ }
240
+ },
241
+ "networkProfile": {
242
+ "networkInterfaces": [
243
+ {
244
+ "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
245
+ }
246
+ ]
247
+ },
248
+ "osProfile": {
249
+ "computerName": "[concat(parameters('vmName'), copyIndex())]",
250
+ "adminUsername": "[parameters('adminUsername')]",
251
+ "adminPassword": "[parameters('publicKey')]",
252
+ "linuxConfiguration": {
253
+ "disablePasswordAuthentication": true,
254
+ "ssh": {
255
+ "publicKeys": [
256
+ {
257
+ "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
258
+ "keyData": "[parameters('publicKey')]"
259
+ }
260
+ ]
261
+ }
262
+ }
263
+ },
264
+ "priority": "[parameters('priority')]",
265
+ "evictionPolicy": "[if(equals(parameters('priority'), 'Spot'), parameters('evictionPolicy'), '')]",
266
+ "billingProfile": "[parameters('billingProfile')]"
267
+ },
268
+ "identity": {
269
+ "type": "UserAssigned",
270
+ "userAssignedIdentities": {
271
+ "[parameters('msi')]": {
272
+ }
273
+ }
274
+ }
275
+ }
276
+ ],
277
+ "outputs": {
278
+ "publicIp": {
279
+ "type": "array",
280
+ "copy": {
281
+ "count": "[parameters('vmCount')]",
282
+ "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
283
+ },
284
+ "condition": "[parameters('provisionPublicIp')]"
285
+ },
286
+ "privateIp": {
287
+ "type": "array",
288
+ "copy": {
289
+ "count": "[parameters('vmCount')]",
290
+ "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
291
+ }
292
+ }
293
+ }
294
+ }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import random
4
+ from hashlib import sha256
5
+ from pathlib import Path
6
+ from typing import Any, Callable
7
+
8
+ from azure.common.credentials import get_cli_profile
9
+ from azure.identity import AzureCliCredential
10
+ from azure.mgmt.resource import ResourceManagementClient
11
+ from azure.mgmt.resource.resources.models import DeploymentMode
12
+
13
# Length of the hex suffix (taken from a SHA-256 of the resource group name)
# appended to resource names to keep clusters distinct; see
# _configure_resource_group below.
UNIQUE_ID_LEN = 4

logger = logging.getLogger(__name__)
16
+
17
+
18
def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
    """Retrieve a callable function from an Azure SDK client object.

    Newer versions of the various client SDKs renamed function names to
    have a ``begin_`` prefix. This function supports both the old and new
    versions of the SDK by first trying the old name and falling back to
    the prefixed new name.

    Args:
        client: Azure SDK client (or operations group) object.
        function_name: Operation name without the ``begin_`` prefix.

    Returns:
        The resolved bound callable.

    Raises:
        AttributeError: If neither ``function_name`` nor
            ``begin_{function_name}`` exists on ``client``.
    """
    func = getattr(
        client, function_name, getattr(client, f"begin_{function_name}", None)
    )
    if func is None:
        # Bug fix: the original passed ``obj={client.__name__}`` which (a)
        # wrapped the value in a set literal and (b) assumed the *instance*
        # has a __name__ attribute, so the error path itself raised a
        # confusing secondary AttributeError. Use the type name instead.
        raise AttributeError(
            "'{obj}' object has no {func} or begin_{func} attribute".format(
                obj=type(client).__name__, func=function_name
            )
        )
    return func
36
+
37
+
38
def bootstrap_azure(config):
    """Fill in Azure-specific fields of the autoscaler config.

    Runs key-pair configuration first, then resource-group/deployment
    configuration, returning the augmented config.
    """
    return _configure_resource_group(_configure_key_pair(config))
42
+
43
+
44
def _configure_resource_group(config):
    """Create/update the cluster's Azure resource group and base deployment.

    Deploys ``azure-config-template.json`` (vnet/subnet, NSG, MSI) into the
    resource group named in ``config["provider"]`` and writes the resulting
    resource ids (``msi``, ``nsg``, ``subnet``) back into the provider
    config for later VM creation.
    """
    # TODO: look at availability sets
    # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
    subscription_id = config["provider"].get("subscription_id")
    if subscription_id is None:
        # Fall back to the subscription selected via `az account`.
        subscription_id = get_cli_profile().get_subscription_id()
    resource_client = ResourceManagementClient(AzureCliCredential(), subscription_id)
    config["provider"]["subscription_id"] = subscription_id
    logger.info("Using subscription id: %s", subscription_id)

    assert (
        "resource_group" in config["provider"]
    ), "Provider config must include resource_group field"
    resource_group = config["provider"]["resource_group"]

    assert (
        "location" in config["provider"]
    ), "Provider config must include location field"
    params = {"location": config["provider"]["location"]}

    if "tags" in config["provider"]:
        params["tags"] = config["provider"]["tags"]

    logger.info("Creating/Updating resource group: %s", resource_group)
    rg_create_or_update = get_azure_sdk_function(
        client=resource_client.resource_groups, function_name="create_or_update"
    )
    rg_create_or_update(resource_group_name=resource_group, parameters=params)

    # load the template file shipped next to this module
    current_path = Path(__file__).parent
    template_path = current_path.joinpath("azure-config-template.json")
    with open(template_path, "r") as template_fp:
        template = json.load(template_fp)

    logger.info("Using cluster name: %s", config["cluster_name"])

    # set unique id for resources in this cluster; deterministic (hash of the
    # resource group) unless the user provides one explicitly
    unique_id = config["provider"].get("unique_id")
    if unique_id is None:
        hasher = sha256()
        hasher.update(config["provider"]["resource_group"].encode("utf-8"))
        unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
    else:
        unique_id = str(unique_id)
    config["provider"]["unique_id"] = unique_id
    logger.info("Using unique id: %s", unique_id)
    cluster_id = "{}-{}".format(config["cluster_name"], unique_id)

    subnet_mask = config["provider"].get("subnet_mask")
    if subnet_mask is None:
        # choose a random subnet, skipping most common value of 0;
        # seeded with unique_id so the choice is stable across runs
        random.seed(unique_id)
        subnet_mask = "10.{}.0.0/16".format(random.randint(1, 254))
    logger.info("Using subnet mask: %s", subnet_mask)

    # Copy over properties from existing subnet.
    # Addresses issue (https://github.com/Azure/azure-quickstart-templates/issues/2786)
    # where existing subnet properties will get overwritten unless explicitly specified
    # during multiple deployments even if vnet/subnet do not change.
    # May eventually be fixed by passing empty subnet list if they already exist:
    # https://techcommunity.microsoft.com/t5/azure-networking-blog/azure-virtual-network-now-supports-updates-without-subnet/ba-p/4067952
    list_by_rg = get_azure_sdk_function(
        client=resource_client.resources, function_name="list_by_resource_group"
    )
    existing_vnets = list(
        list_by_rg(
            resource_group,
            f"substringof('{unique_id}', name) and "
            "resourceType eq 'Microsoft.Network/virtualNetworks'",
        )
    )
    if len(existing_vnets) > 0:
        vnid = existing_vnets[0].id
        get_by_id = get_azure_sdk_function(
            client=resource_client.resources, function_name="get_by_id"
        )
        subnet = get_by_id(vnid, resource_client.DEFAULT_API_VERSION).properties[
            "subnets"
        ][0]
        # Patch the template's vnet resource in place with the live subnet
        # properties so the redeploy does not clobber them.
        template_vnet = next(
            (
                rs
                for rs in template["resources"]
                if rs["type"] == "Microsoft.Network/virtualNetworks"
            ),
            None,
        )
        if template_vnet is not None:
            template_subnets = template_vnet["properties"].get("subnets")
            if template_subnets is not None:
                template_subnets[0]["properties"].update(subnet["properties"])

    # Get or create an MSI name and resource group.
    # Defaults to current resource group if not provided.
    use_existing_msi = (
        "msi_name" in config["provider"] and "msi_resource_group" in config["provider"]
    )
    msi_resource_group = config["provider"].get("msi_resource_group", resource_group)
    msi_name = config["provider"].get("msi_name", f"ray-{cluster_id}-msi")
    logger.info(
        "Using msi_name: %s from msi_resource_group: %s", msi_name, msi_resource_group
    )

    parameters = {
        "properties": {
            "mode": DeploymentMode.incremental,
            "template": template,
            "parameters": {
                "subnet": {"value": subnet_mask},
                "clusterId": {"value": cluster_id},
                "msiName": {"value": msi_name},
                "msiResourceGroup": {"value": msi_resource_group},
                "createMsi": {"value": not use_existing_msi},
            },
        }
    }

    create_or_update = get_azure_sdk_function(
        client=resource_client.deployments, function_name="create_or_update"
    )
    # Blocks until the ARM deployment completes, then reads its outputs.
    outputs = (
        create_or_update(
            resource_group_name=resource_group,
            deployment_name="ray-config",
            parameters=parameters,
        )
        .result()
        .properties.outputs
    )

    # append output resource ids to be used with vm creation
    config["provider"]["msi"] = outputs["msi"]["value"]
    config["provider"]["nsg"] = outputs["nsg"]["value"]
    config["provider"]["subnet"] = outputs["subnet"]["value"]

    return config
181
+
182
+
183
+ def _configure_key_pair(config):
184
+ ssh_user = config["auth"]["ssh_user"]
185
+ public_key = None
186
+ # search if the keys exist
187
+ for key_type in ["ssh_private_key", "ssh_public_key"]:
188
+ try:
189
+ key_path = Path(config["auth"][key_type]).expanduser()
190
+ except KeyError:
191
+ raise Exception("Config must define {}".format(key_type))
192
+ except TypeError:
193
+ raise Exception("Invalid config value for {}".format(key_type))
194
+
195
+ assert key_path.is_file(), "Could not find ssh key: {}".format(key_path)
196
+
197
+ if key_type == "ssh_public_key":
198
+ with open(key_path, "r") as f:
199
+ public_key = f.read()
200
+
201
+ for node_type in config["available_node_types"].values():
202
+ azure_arm_parameters = node_type["node_config"].setdefault(
203
+ "azure_arm_parameters", {}
204
+ )
205
+ azure_arm_parameters["adminUsername"] = ssh_user
206
+ azure_arm_parameters["publicKey"] = public_key
207
+
208
+ return config
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import time
4
+ from concurrent.futures import Future, ThreadPoolExecutor
5
+ from pathlib import Path
6
+ from threading import RLock
7
+ from uuid import uuid4
8
+
9
+ from azure.core.exceptions import ResourceNotFoundError
10
+ from azure.identity import DefaultAzureCredential
11
+ from azure.mgmt.compute import ComputeManagementClient
12
+ from azure.mgmt.network import NetworkManagementClient
13
+ from azure.mgmt.resource import ResourceManagementClient
14
+ from azure.mgmt.resource.resources.models import DeploymentMode
15
+
16
+ from ray.autoscaler._private._azure.config import (
17
+ bootstrap_azure,
18
+ get_azure_sdk_function,
19
+ )
20
+ from ray.autoscaler._private.constants import (
21
+ AUTOSCALER_NODE_START_WAIT_S,
22
+ AUTOSCALER_NODE_TERMINATE_WAIT_S,
23
+ MAX_PARALLEL_SHUTDOWN_WORKERS,
24
+ )
25
+ from ray.autoscaler.node_provider import NodeProvider
26
+ from ray.autoscaler.tags import (
27
+ NODE_KIND_HEAD,
28
+ TAG_RAY_CLUSTER_NAME,
29
+ TAG_RAY_LAUNCH_CONFIG,
30
+ TAG_RAY_NODE_KIND,
31
+ TAG_RAY_NODE_NAME,
32
+ TAG_RAY_USER_NODE_TYPE,
33
+ )
34
+
35
# Limits used when generating VM names.
# NOTE(review): their consumers are outside this chunk — confirm usage
# before changing either value.
VM_NAME_MAX_LEN = 64
UNIQUE_ID_LEN = 4

logger = logging.getLogger(__name__)
# Quiet Azure's per-request HTTP logging, which is emitted at INFO level.
azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
azure_logger.setLevel(logging.WARNING)
41
+
42
+
43
def synchronized(f):
    """Decorator serializing method calls through ``self.lock``.

    The decorated method acquires the instance's ``lock`` attribute for the
    duration of the call; the lock is released even if ``f`` raises.
    """

    def wrapper(self, *args, **kwargs):
        # `with` is equivalent to acquire/try/finally-release.
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
52
+
53
+
54
class AzureNodeProvider(NodeProvider):
    """Node Provider for Azure

    This provider assumes Azure credentials are set by running ``az login``
    and the default subscription is configured through ``az account``
    or set in the ``provider`` field of the autoscaler configuration.

    Nodes may be in one of three states: {pending, running, terminated}. Nodes
    appear immediately once started by ``create_node``, and transition
    immediately to terminated when ``terminate_node`` is called.
    """

    def __init__(self, provider_config, cluster_name):
        """Create Azure SDK clients and the local node/termination caches.

        Args:
            provider_config: ``provider`` section of the autoscaler config;
                must contain ``subscription_id`` and ``resource_group``.
            cluster_name: Name of the Ray cluster being managed.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        subscription_id = provider_config["subscription_id"]
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
        self.compute_client = ComputeManagementClient(credential, subscription_id)
        self.network_client = NetworkManagementClient(credential, subscription_id)
        self.resource_client = ResourceManagementClient(credential, subscription_id)

        # Guards cache updates; used via the @synchronized decorator below.
        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}

        # Cache terminating node operations
        self.terminating_nodes: dict[str, Future] = {}
        self.termination_executor = ThreadPoolExecutor(
            max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS
        )

    @synchronized
    def _get_filtered_nodes(self, tag_filters):
        """Return metadata for cluster VMs whose tags match ``tag_filters``.

        Side effects: refreshes ``self.cached_nodes`` from the Azure API and
        prunes finished entries from ``self.terminating_nodes``.
        """
        # add cluster name filter to only get nodes from this cluster
        cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name}

        def match_tags(tags):
            for k, v in cluster_tag_filters.items():
                if tags.get(k) != v:
                    return False
            return True

        vms = self.compute_client.virtual_machines.list(
            resource_group_name=self.provider_config["resource_group"]
        )

        nodes = [self._extract_metadata(vm) for vm in vms]
        self.cached_nodes = {node["name"]: node for node in nodes}

        # Update terminating nodes list by removing nodes that
        # have finished termination.
        self.terminating_nodes = {
            k: v for k, v in self.terminating_nodes.items() if not v.done()
        }

        return {k: v for k, v in self.cached_nodes.items() if match_tags(v["tags"])}

    def _extract_metadata(self, vm):
        """Build a metadata dict (name, tags, power status, NIC/IP info)
        for the given Azure VM model object."""
        # get tags
        metadata = {"name": vm.name, "tags": vm.tags, "status": ""}

        # get status
        resource_group = self.provider_config["resource_group"]
        try:
            instance = self.compute_client.virtual_machines.instance_view(
                resource_group_name=resource_group, vm_name=vm.name
            ).as_dict()
        except ResourceNotFoundError:
            # VM disappeared between list() and instance_view(); return the
            # partial metadata with an empty status.
            return metadata

        for status in instance["statuses"]:
            # If ProvisioningState is "failed" (e.g.,
            # ProvisioningState/failed/RetryableError), we can get a third
            # string here, so we need to limit to the first two outputs.
            code, state = status["code"].split("/")[:2]
            # skip provisioning status
            if code == "PowerState":
                metadata["status"] = state
                break

        # get ip data
        nic_id = vm.network_profile.network_interfaces[0].id
        metadata["nic_name"] = nic_id.split("/")[-1]
        nic = self.network_client.network_interfaces.get(
            resource_group_name=resource_group,
            network_interface_name=metadata["nic_name"],
        )
        ip_config = nic.ip_configurations[0]

        # Get public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        if not self.provider_config.get("use_internal_ips", False) or (
            self.provider_config.get("use_external_head_ip", False)
            and metadata["tags"][TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        ):
            public_ip_id = ip_config.public_ip_address.id
            metadata["public_ip_name"] = public_ip_id.split("/")[-1]
            public_ip = self.network_client.public_ip_addresses.get(
                resource_group_name=resource_group,
                public_ip_address_name=metadata["public_ip_name"],
            )
            metadata["external_ip"] = public_ip.ip_address

        metadata["internal_ip"] = ip_config.private_ip_address

        return metadata

    def stopped_nodes(self, tag_filters):
        """Return a list of stopped node ids filtered by the specified tags dict."""
        # Azure reports stopped-and-released VMs as "deallocating"/"deallocated".
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        return [k for k, v in nodes.items() if v["status"].startswith("deallocat")]

    def non_terminated_nodes(self, tag_filters):
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to nodes() to
        serve single-node queries (e.g. is_running(node_id)). This means that
        nodes() must be called again to refresh results.

        Examples:
            >>> from ray.autoscaler.tags import TAG_RAY_NODE_KIND
            >>> provider = ... # doctest: +SKIP
            >>> provider.non_terminated_nodes( # doctest: +SKIP
            ...     {TAG_RAY_NODE_KIND: "worker"})
            ["node-1", "node-2"]
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        # Nodes with an in-flight termination future are excluded even if
        # Azure has not yet reported them as deallocated.
        return [
            k
            for k, v in nodes.items()
            if not v["status"].startswith("deallocat") or k in self.terminating_nodes
        ]

    def is_running(self, node_id):
        """Return whether the specified node is running."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"] == "running"

    def is_terminated(self, node_id):
        """Return whether the specified node is terminated."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"].startswith("deallocat")

    def node_tags(self, node_id):
        """Returns the tags of the given node (string dict)."""
        return self._get_cached_node(node_id=node_id)["tags"]

    def external_ip(self, node_id):
        """Returns the external ip of the given node."""
        # Fall back to a fresh API lookup when the cached entry has no IP yet.
        ip = (
            self._get_cached_node(node_id=node_id)["external_ip"]
            or self._get_node(node_id=node_id)["external_ip"]
        )
        return ip

    def internal_ip(self, node_id):
        """Returns the internal ip (Ray ip) of the given node."""
        # Fall back to a fresh API lookup when the cached entry has no IP yet.
        ip = (
            self._get_cached_node(node_id=node_id)["internal_ip"]
            or self._get_node(node_id=node_id)["internal_ip"]
        )
        return ip

    def create_node(self, node_config, tags, count):
        """Create ``count`` nodes, restarting matching deallocated VMs first
        when ``cache_stopped_nodes`` is enabled; any remainder is created
        fresh via ``_create_node``."""
        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            # Only reuse a stopped VM if these tags match, so a node of the
            # wrong type/launch config is never resurrected for this request.
            VALIDITY_TAGS = [
                TAG_RAY_CLUSTER_NAME,
                TAG_RAY_NODE_KIND,
                TAG_RAY_LAUNCH_CONFIG,
                TAG_RAY_USER_NODE_TYPE,
            ]
            filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
            reuse_nodes = self.stopped_nodes(filters)[:count]
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
            )
            start = get_azure_sdk_function(
                client=self.compute_client.virtual_machines, function_name="start"
            )
            for node_id in reuse_nodes:
                start(resource_group_name=resource_group, vm_name=node_id).wait()
                self.set_node_tags(node_id, tags)
            count -= len(reuse_nodes)

        if count:
            self._create_node(node_config, tags, count)

    def _create_node(self, node_config, tags, count):
        """Creates a number of nodes within the namespace.

        Deploys ``count`` VMs via an ARM template deployment
        (``azure-vm-template.json``) and blocks until the deployment
        completes or times out.
        """
        resource_group = self.provider_config["resource_group"]

        # load the template file
        current_path = Path(__file__).parent
        template_path = current_path.joinpath("azure-vm-template.json")
        with open(template_path, "r") as template_fp:
            template = json.load(template_fp)

        # get the tags
        config_tags = node_config.get("tags", {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        # Azure VM names are length-limited; truncate after appending a
        # short random suffix for uniqueness.
        vm_name = "{node}-{unique_id}-{vm_id}".format(
            node=config_tags.get(TAG_RAY_NODE_NAME, "node"),
            unique_id=self.provider_config["unique_id"],
            vm_id=uuid4().hex[:UNIQUE_ID_LEN],
        )[:VM_NAME_MAX_LEN]

        template_params = node_config["azure_arm_parameters"].copy()
        template_params["vmName"] = vm_name
        # Provision public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        template_params["provisionPublicIp"] = not self.provider_config.get(
            "use_internal_ips", False
        ) or (
            self.provider_config.get("use_external_head_ip", False)
            and config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        )
        template_params["vmTags"] = config_tags
        template_params["vmCount"] = count
        template_params["msi"] = self.provider_config["msi"]
        template_params["nsg"] = self.provider_config["nsg"]
        template_params["subnet"] = self.provider_config["subnet"]

        parameters = {
            "properties": {
                "mode": DeploymentMode.incremental,
                "template": template,
                "parameters": {
                    key: {"value": value} for key, value in template_params.items()
                },
            }
        }

        # TODO: we could get the private/public ips back directly
        create_or_update = get_azure_sdk_function(
            client=self.resource_client.deployments, function_name="create_or_update"
        )
        create_or_update(
            resource_group_name=resource_group,
            deployment_name=vm_name,
            parameters=parameters,
        ).wait(timeout=AUTOSCALER_NODE_START_WAIT_S)

    @synchronized
    def set_node_tags(self, node_id, tags):
        """Sets the tag values (string dict) for the specified node."""
        node_tags = self._get_cached_node(node_id)["tags"]
        node_tags.update(tags)
        update = get_azure_sdk_function(
            client=self.compute_client.virtual_machines, function_name="update"
        )
        update(
            resource_group_name=self.provider_config["resource_group"],
            vm_name=node_id,
            parameters={"tags": node_tags},
        )
        # Keep the local cache consistent with the tags just pushed to Azure.
        self.cached_nodes[node_id]["tags"] = node_tags

    def terminate_node(self, node_id):
        """Terminates the specified node. This will delete the VM and
        associated resources (NIC, IP, Storage) for the specified node."""

        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            try:
                # stop machine and leave all resources
                logger.info(
                    f"Stopping instance {node_id}"
                    "(to fully terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"
                )
                stop = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="deallocate",
                )
                stop(resource_group_name=resource_group, vm_name=node_id)
            except Exception as e:
                logger.warning("Failed to stop VM: {}".format(e))

        # If node_id is in terminating nodes dict, it's already terminating
        # Otherwise, kick off termination and add it to the dict
        elif node_id not in self.terminating_nodes:
            self.terminating_nodes[node_id] = self.termination_executor.submit(
                self._delete_node_and_resources, resource_group, node_id
            )

    def _delete_node_and_resources(self, resource_group, node_id):
        """Delete the VM plus its dependent disks, NICs and public IPs.

        Runs on ``self.termination_executor``; every deletion is
        best-effort — failures are logged and the remaining resources are
        still attempted.
        """
        try:
            vm = self.compute_client.virtual_machines.get(
                resource_group_name=resource_group, vm_name=node_id
            )
        except ResourceNotFoundError as e:
            # Node no longer exists
            logger.warning("Failed to delete VM: {}".format(e))
            return

        # Gather dependent disks
        disks = set()
        if vm.storage_profile is not None and vm.storage_profile.data_disks is not None:
            for d in vm.storage_profile.data_disks:
                if d.name is not None:
                    disks.add(d.name)
        if (
            vm.storage_profile is not None
            and vm.storage_profile.os_disk is not None
            and vm.storage_profile.os_disk.name is not None
        ):
            disks.add(vm.storage_profile.os_disk.name)

        # Gather dependent NICs and public IPs
        nics = set()
        ips = set()
        if (
            vm.network_profile is not None
            and vm.network_profile.network_interfaces is not None
        ):
            for nint in vm.network_profile.network_interfaces:
                if nint.id is not None:
                    nic_name = nint.id.split("/")[-1]
                    nics.add(nic_name)
                    # Get public IP if not using internal IPs or if this is the
                    # head node and use_external_head_ip is True
                    if not self.provider_config.get("use_internal_ips", False) or (
                        self.provider_config.get("use_external_head_ip", False)
                        and vm.tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
                    ):
                        nic = self.network_client.network_interfaces.get(
                            resource_group_name=resource_group,
                            network_interface_name=nic_name,
                        )
                        if nic.ip_configurations is not None:
                            for ipc in nic.ip_configurations:
                                if ipc.public_ip_address.id is not None:
                                    ips.add(ipc.public_ip_address.id.split("/")[-1])

        # Delete VM
        # The VM must be gone before its NICs/disks can be released, so this
        # one blocks; the remaining deletions are polled against the same
        # overall deadline (started at `st`).
        st = time.monotonic()
        delete = get_azure_sdk_function(
            client=self.compute_client.virtual_machines,
            function_name="delete",
        )
        try:
            delete(resource_group_name=resource_group, vm_name=node_id).wait(
                timeout=AUTOSCALER_NODE_TERMINATE_WAIT_S
            )
        except Exception as e:
            logger.warning("Failed to delete VM: {}".format(e))

        # Delete disks (no need to wait for these, but gather the LROs for end)
        disk_lros = []
        delete = get_azure_sdk_function(
            client=self.compute_client.disks, function_name="delete"
        )
        for d in disks:
            try:
                disk_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        disk_name=d,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete disk: {}".format(e))

        # Delete NICs
        nic_lros = []
        delete = get_azure_sdk_function(
            client=self.network_client.network_interfaces, function_name="delete"
        )
        for n in nics:
            try:
                nic_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        network_interface_name=n,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete NIC: {}".format(e))

        # NICs must be fully deleted before their public IPs can be deleted,
        # so poll the NIC LROs (bounded by the overall deadline) first.
        while (
            not all(nlro.done() for nlro in nic_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

        # Delete Public IPs
        delete = get_azure_sdk_function(
            client=self.network_client.public_ip_addresses,
            function_name="delete",
        )
        ip_lros = []
        for ip in ips:
            try:
                ip_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        public_ip_address_name=ip,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete public IP: {}".format(e))

        # Wait (bounded) for the remaining disk and IP deletions to finish.
        while (
            not all(dlro.done() for dlro in disk_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)
        while (
            not all(iplro.done() for iplro in ip_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

    def _get_node(self, node_id):
        """Refresh the node cache from Azure and return metadata for
        ``node_id``. Raises KeyError if the node no longer exists."""
        self._get_filtered_nodes({})  # Side effect: updates cache
        return self.cached_nodes[node_id]

    def _get_cached_node(self, node_id):
        """Return cached metadata for ``node_id``, refreshing on a miss."""
        return self.cached_nodes.get(node_id) or self._get_node(node_id=node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Delegate cluster-config preparation to ``bootstrap_azure``."""
        return bootstrap_azure(cluster_config)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py ADDED
@@ -0,0 +1,1508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import math
4
+ import operator
5
+ import os
6
+ import queue
7
+ import subprocess
8
+ import threading
9
+ import time
10
+ from collections import Counter, defaultdict, namedtuple
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Tuple, Union
14
+
15
+ import yaml
16
+
17
+ import ray
18
+ import ray._private.ray_constants as ray_constants
19
+ from ray.autoscaler._private.constants import (
20
+ AUTOSCALER_HEARTBEAT_TIMEOUT_S,
21
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
22
+ AUTOSCALER_MAX_LAUNCH_BATCH,
23
+ AUTOSCALER_MAX_NUM_FAILURES,
24
+ AUTOSCALER_STATUS_LOG,
25
+ AUTOSCALER_UPDATE_INTERVAL_S,
26
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
27
+ DISABLE_NODE_UPDATERS_KEY,
28
+ FOREGROUND_NODE_LAUNCH_KEY,
29
+ WORKER_LIVENESS_CHECK_KEY,
30
+ )
31
+ from ray.autoscaler._private.event_summarizer import EventSummarizer
32
+ from ray.autoscaler._private.legacy_info_string import legacy_log_info_string
33
+ from ray.autoscaler._private.load_metrics import LoadMetrics
34
+ from ray.autoscaler._private.local.node_provider import (
35
+ LocalNodeProvider,
36
+ record_local_head_state_if_needed,
37
+ )
38
+ from ray.autoscaler._private.node_launcher import BaseNodeLauncher, NodeLauncher
39
+ from ray.autoscaler._private.node_provider_availability_tracker import (
40
+ NodeAvailabilitySummary,
41
+ NodeProviderAvailabilityTracker,
42
+ )
43
+ from ray.autoscaler._private.node_tracker import NodeTracker
44
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
45
+ from ray.autoscaler._private.providers import _get_node_provider
46
+ from ray.autoscaler._private.resource_demand_scheduler import (
47
+ ResourceDemandScheduler,
48
+ ResourceDict,
49
+ get_bin_pack_residual,
50
+ )
51
+ from ray.autoscaler._private.updater import NodeUpdaterThread
52
+ from ray.autoscaler._private.util import (
53
+ ConcurrentCounter,
54
+ NodeCount,
55
+ NodeID,
56
+ NodeIP,
57
+ NodeType,
58
+ NodeTypeConfigDict,
59
+ format_info_string,
60
+ hash_launch_conf,
61
+ hash_runtime_conf,
62
+ validate_config,
63
+ with_head_node_ip,
64
+ )
65
+ from ray.autoscaler.node_provider import NodeProvider
66
+ from ray.autoscaler.tags import (
67
+ NODE_KIND_HEAD,
68
+ NODE_KIND_UNMANAGED,
69
+ NODE_KIND_WORKER,
70
+ STATUS_UP_TO_DATE,
71
+ STATUS_UPDATE_FAILED,
72
+ TAG_RAY_FILE_MOUNTS_CONTENTS,
73
+ TAG_RAY_LAUNCH_CONFIG,
74
+ TAG_RAY_NODE_KIND,
75
+ TAG_RAY_NODE_STATUS,
76
+ TAG_RAY_RUNTIME_CONFIG,
77
+ TAG_RAY_USER_NODE_TYPE,
78
+ )
79
+ from ray.exceptions import RpcError
80
+
81
logger = logging.getLogger(__name__)

# Status of a node e.g. "up-to-date", see ray/autoscaler/tags.py
NodeStatus = str

# Tuple of modified fields for the given node_id returned by should_update
# that will be passed into a NodeUpdaterThread.
UpdateInstructions = namedtuple(
    "UpdateInstructions",
    ["node_id", "setup_commands", "ray_start_commands", "docker_config"],
)

# (node type config, number of nodes to launch, node type name) — the unit
# of work placed on the launch queue consumed by the NodeLauncher threads.
NodeLaunchData = Tuple[NodeTypeConfigDict, NodeCount, Optional[NodeType]]
+
95
+
96
@dataclass
class AutoscalerSummary:
    """Point-in-time summary of autoscaler state (nodes, launches, failures)
    used for status reporting."""

    # Count of active nodes per node type.
    active_nodes: Dict[NodeType, int]
    # Count of idle nodes per node type, when tracked.
    idle_nodes: Optional[Dict[NodeType, int]]
    # Nodes still coming up: (ip, node type, status string).
    pending_nodes: List[Tuple[NodeIP, NodeType, NodeStatus]]
    # In-flight launch requests per node type.
    pending_launches: Dict[NodeType, int]
    # Nodes whose setup/update failed: (ip, node type).
    failed_nodes: List[Tuple[NodeIP, NodeType]]
    node_availability_summary: NodeAvailabilitySummary = field(
        default_factory=lambda: NodeAvailabilitySummary({})
    )
    # A dictionary of node IP to a list of reasons the node is not idle.
    node_activities: Optional[Dict[str, Tuple[NodeIP, List[str]]]] = None
    pending_resources: Dict[str, int] = field(default_factory=lambda: {})
    # A mapping from node name (the same key as `usage_by_node`) to node type.
    # Optional for deployment modes which have the concept of node types and
    # backwards compatibility.
    node_type_mapping: Optional[Dict[str, str]] = None
    # Whether the autoscaler summary is v1 or v2.
    legacy: bool = False
+
116
+
117
class NonTerminatedNodes:
    """Snapshot of the cluster's non-terminated nodes, organized by kind."""

    def __init__(self, provider: NodeProvider):
        fetch_started = time.time()
        # Every node the provider does not consider terminated.
        self.all_node_ids = provider.non_terminated_nodes({})

        # Managed worker nodes (node kind "worker"):
        self.worker_ids: List[NodeID] = []
        # The head node (node kind "head")
        self.head_id: Optional[NodeID] = None

        for node_id in self.all_node_ids:
            kind = provider.node_tags(node_id)[TAG_RAY_NODE_KIND]
            if kind == NODE_KIND_WORKER:
                self.worker_ids.append(node_id)
            elif kind == NODE_KIND_HEAD:
                self.head_id = node_id

        # Note: For typical use-cases, self.all_node_ids == self.worker_ids +
        # [self.head_id]. The difference being in the case of unmanaged nodes.

        # Record how long the non_terminated_nodes call took. That call
        # typically maps to a "describe"/"list" request on the cluster
        # manager, which can be quite expensive. Processing time is included
        # because some clients paginate and evaluate the underlying API
        # calls lazily.
        self.non_terminated_nodes_time = time.time() - fetch_started
        logger.info(
            f"The autoscaler took {round(self.non_terminated_nodes_time, 3)}"
            " seconds to fetch the list of non-terminated nodes."
        )

    def remove_terminating_nodes(self, terminating_nodes: List[NodeID]) -> None:
        """Drop nodes that are being terminated from the tracked id lists."""
        self.worker_ids = [
            node_id
            for node_id in self.worker_ids
            if node_id not in terminating_nodes
        ]
        self.all_node_ids = [
            node_id
            for node_id in self.all_node_ids
            if node_id not in terminating_nodes
        ]
160
+
161
+
162
# Whether a worker should be kept based on the min_workers and
# max_workers constraints.
class KeepOrTerminate(Enum):
    # should keep the worker
    keep = 1
    # should terminate the worker
    terminate = 2
    # the worker can be terminated if needed
    decide_later = 3
169
+
170
+
171
+ class StandardAutoscaler:
172
+ """The autoscaling control loop for a Ray cluster.
173
+
174
+ There are two ways to start an autoscaling cluster: manually by running
175
+ `ray start --head --autoscaling-config=/path/to/config.yaml` on a instance
176
+ that has permission to launch other instances, or you can also use `ray up
177
+ /path/to/config.yaml` from your laptop, which will configure the right
178
+ AWS/Cloud roles automatically. See the Ray documentation
179
+ (https://docs.ray.io/en/latest/) for a full definition of autoscaling behavior.
180
+ StandardAutoscaler's `update` method is periodically called in
181
+ `monitor.py`'s monitoring loop.
182
+
183
+ StandardAutoscaler is also used to bootstrap clusters (by adding workers
184
+ until the cluster size that can handle the resource demand is met).
185
+ """
186
+
187
    def __init__(
        self,
        # TODO(ekl): require config reader to be a callable always.
        config_reader: Union[str, Callable[[], dict]],
        load_metrics: LoadMetrics,
        gcs_client: "ray._raylet.GcsClient",
        session_name: Optional[str] = None,
        max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
        max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
        process_runner: Any = subprocess,
        update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
        prefix_cluster_info: bool = False,
        event_summarizer: Optional[EventSummarizer] = None,
        prom_metrics: Optional[AutoscalerPrometheusMetrics] = None,
    ):
        """Create a StandardAutoscaler.

        Args:
            config_reader: Path to a Ray Autoscaler YAML, or a function to read
                and return the latest config.
            load_metrics: Provides metrics for the Ray cluster.
            session_name: The session name of the cluster this autoscaler
                is deployed.
            max_launch_batch: Max number of nodes to launch in one request.
            max_concurrent_launches: Max number of nodes that can be
                concurrently launched. This value and `max_launch_batch`
                determine the number of batches that are used to launch nodes.
            max_failures: Number of failures that the autoscaler will tolerate
                before exiting.
            process_runner: Subproc-like interface used by the CommandRunner.
            update_interval_s: Seconds between running the autoscaling loop.
            prefix_cluster_info: Whether to add the cluster name to info strs.
            event_summarizer: Utility to consolidate duplicated messages.
            prom_metrics: Prometheus metrics for autoscaler-related operations.
            gcs_client: client for interactions with the GCS. Used to drain nodes
                before termination.
        """

        if isinstance(config_reader, str):
            # Auto wrap with file reader.
            def read_fn():
                with open(config_reader) as f:
                    new_config = yaml.safe_load(f.read())
                return new_config

            self.config_reader = read_fn
        else:
            self.config_reader = config_reader

        self.node_provider_availability_tracker = NodeProviderAvailabilityTracker()
        # Prefix each line of info string with cluster name if True
        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        # Keep this before self.reset (if an exception occurs in reset
        # then prom_metrics must be instantitiated to increment the
        # exception counter)
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )  # noqa
        self.resource_demand_scheduler = None
        # NOTE(review): reset() presumably populates self.config (and
        # self.provider) read below — confirm against reset()'s definition.
        self.reset(errors_fatal=True)
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner
        self.event_summarizer = event_summarizer or EventSummarizer()

        # Map from node_id to NodeUpdater threads
        self.updaters: Dict[NodeID, NodeUpdaterThread] = {}
        self.num_failed_updates: Dict[NodeID, int] = defaultdict(int)
        self.num_successful_updates: Dict[NodeID, int] = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Keeps track of pending and running nodes
        self.non_terminated_nodes: Optional[NonTerminatedNodes] = None

        # Tracks nodes scheduled for termination
        self.nodes_to_terminate: List[NodeID] = []

        # Disable NodeUpdater threads if true.
        # Should be set to true in situations where another component, such as
        # a Kubernetes operator, is responsible for Ray setup on nodes.
        self.disable_node_updaters = self.config["provider"].get(
            DISABLE_NODE_UPDATERS_KEY, False
        )
        logger.info(f"{DISABLE_NODE_UPDATERS_KEY}:{self.disable_node_updaters}")

        # Disable launch config checking if true.
        # This is set in the fake_multinode situations where there isn't any
        # meaningful node "type" to enforce.
        self.disable_launch_config_check = self.config["provider"].get(
            DISABLE_LAUNCH_CONFIG_CHECK_KEY, False
        )
        logger.info(
            f"{DISABLE_LAUNCH_CONFIG_CHECK_KEY}:{self.disable_launch_config_check}"
        )

        # By default, the autoscaler launches nodes in batches asynchronously in
        # background threads.
        # When the following flag is set, that behavior is disabled, so that nodes
        # are launched in the main thread, all in one batch, blocking until all
        # NodeProvider.create_node calls have returned.
        self.foreground_node_launch = self.config["provider"].get(
            FOREGROUND_NODE_LAUNCH_KEY, False
        )
        logger.info(f"{FOREGROUND_NODE_LAUNCH_KEY}:{self.foreground_node_launch}")

        # By default, the autoscaler kills and/or tries to recover
        # a worker node if it hasn't produced a resource heartbeat in the last 30
        # seconds. The worker_liveness_check flag allows disabling this behavior in
        # settings where another component, such as a Kubernetes operator, is
        # responsible for healthchecks.
        self.worker_liveness_check = self.config["provider"].get(
            WORKER_LIVENESS_CHECK_KEY, True
        )
        logger.info(f"{WORKER_LIVENESS_CHECK_KEY}:{self.worker_liveness_check}")

        # Node launchers: either a single synchronous foreground launcher, or
        # a pool of background NodeLauncher threads fed by launch_queue.
        self.foreground_node_launcher: Optional[BaseNodeLauncher] = None
        self.launch_queue: Optional[queue.Queue[NodeLaunchData]] = None
        self.pending_launches = ConcurrentCounter()
        if self.foreground_node_launch:
            self.foreground_node_launcher = BaseNodeLauncher(
                provider=self.provider,
                pending=self.pending_launches,
                event_summarizer=self.event_summarizer,
                node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagree how to format this.
                session_name=session_name,
                node_types=self.available_node_types,
                prom_metrics=self.prom_metrics,
            )
        else:
            self.launch_queue = queue.Queue()
            # Enough launcher threads to cover max_concurrent_launches when
            # each thread handles at most max_launch_batch nodes.
            max_batches = math.ceil(max_concurrent_launches / float(max_launch_batch))
            for i in range(int(max_batches)):
                node_launcher = NodeLauncher(
                    provider=self.provider,
                    queue=self.launch_queue,
                    index=i,
                    pending=self.pending_launches,
                    event_summarizer=self.event_summarizer,
                    node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagreee how to format this.
                    session_name=session_name,
                    node_types=self.available_node_types,
                    prom_metrics=self.prom_metrics,
                )
                node_launcher.daemon = True
                node_launcher.start()

        # NodeTracker maintains soft state to track the number of recently
        # failed nodes. It is best effort only.
        self.node_tracker = NodeTracker()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on different
        # platform and the expansion would result in wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        self.gcs_client = gcs_client

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)
        logger.info("StandardAutoscaler: {}".format(self.config))
360
+
361
+ @property
362
+ def all_node_types(self) -> Set[str]:
363
+ return self.config["available_node_types"].keys()
364
+
365
+ def update(self):
366
+ try:
367
+ self.reset(errors_fatal=False)
368
+ self._update()
369
+ except Exception as e:
370
+ self.prom_metrics.update_loop_exceptions.inc()
371
+ logger.exception("StandardAutoscaler: Error during autoscaling.")
372
+ self.num_failures += 1
373
+ if self.num_failures > self.max_failures:
374
+ logger.critical("StandardAutoscaler: Too many errors, abort.")
375
+ raise e
376
+
377
    def _update(self):
        """Run a single autoscaling iteration.

        Refreshes the provider's node list, enforces config constraints by
        terminating nodes, runs/recovers node updaters, asks the resource
        demand scheduler for nodes to launch, and launches them.  Calls are
        throttled to at most once per ``update_interval_s``.
        """
        # For type checking, assert that these objects have been instantiated.
        assert self.provider
        assert self.resource_demand_scheduler

        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now

        # Query the provider to update the list of non-terminated nodes
        self.non_terminated_nodes = NonTerminatedNodes(self.provider)

        # Back off the update if the provider says it's not safe to proceed.
        if not self.provider.safe_to_scale():
            logger.info(
                "Backing off of autoscaler update."
                f" Will try again in {self.update_interval_s} seconds."
            )
            return

        # This will accumulate the nodes we need to terminate.
        self.nodes_to_terminate = []

        # Update running nodes gauge
        num_workers = len(self.non_terminated_nodes.worker_ids)
        self.prom_metrics.running_workers.set(num_workers)

        # Remove from LoadMetrics the ips unknown to the NodeProvider.
        self.load_metrics.prune_active_ips(
            active_ips=[
                self.provider.internal_ip(node_id)
                for node_id in self.non_terminated_nodes.all_node_ids
            ]
        )

        # Update status strings
        if AUTOSCALER_STATUS_LOG:
            logger.info(self.info_string())
        legacy_log_info_string(self, self.non_terminated_nodes.worker_ids)

        # Scale-down first, so freshly freed capacity is reflected before we
        # compute nodes to launch.
        if not self.provider.is_readonly():
            self.terminate_nodes_to_enforce_config_constraints(now)

        if self.disable_node_updaters:
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.terminate_unhealthy_nodes(now)
        else:
            self.process_completed_updates()
            self.update_nodes()
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.attempt_to_recover_unhealthy_nodes(now)
            self.set_prometheus_updater_data()

        # Dict[NodeType, int], List[ResourceDict]
        to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch(
            self.non_terminated_nodes.all_node_ids,
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests(),
            node_availability_summary=self.node_provider_availability_tracker.summary(),
        )
        self._report_pending_infeasible(unfulfilled)

        if not self.provider.is_readonly():
            self.launch_required_nodes(to_launch)

        # Execute optional end-of-update logic.
        # Keep this method call at the end of autoscaler._update().
        self.provider.post_process()

        # Record the amount of time the autoscaler took for
        # this _update() iteration.
        update_time = time.time() - self.last_update_time
        logger.info(
            f"The autoscaler took {round(update_time, 3)}"
            " seconds to complete the update iteration."
        )
        self.prom_metrics.update_time.observe(update_time)
467
+ def terminate_nodes_to_enforce_config_constraints(self, now: float):
468
+ """Terminates nodes to enforce constraints defined by the autoscaling
469
+ config.
470
+
471
+ (1) Terminates nodes in excess of `max_workers`.
472
+ (2) Terminates nodes idle for longer than `idle_timeout_minutes`.
473
+ (3) Terminates outdated nodes,
474
+ namely nodes whose configs don't match `node_config` for the
475
+ relevant node type.
476
+
477
+ Avoids terminating non-outdated nodes required by
478
+ autoscaler.sdk.request_resources().
479
+ """
480
+ # For type checking, assert that these objects have been instantitiated.
481
+ assert self.non_terminated_nodes
482
+ assert self.provider
483
+
484
+ last_used = self.load_metrics.ray_nodes_last_used_time_by_ip
485
+
486
+ idle_timeout_s = 60 * self.config["idle_timeout_minutes"]
487
+
488
+ last_used_cutoff = now - idle_timeout_s
489
+
490
+ # Sort based on last used to make sure to keep min_workers that
491
+ # were most recently used. Otherwise, _keep_min_workers_of_node_type
492
+ # might keep a node that should be terminated.
493
+ sorted_node_ids = self._sort_based_on_last_used(
494
+ self.non_terminated_nodes.worker_ids, last_used
495
+ )
496
+
497
+ # Don't terminate nodes needed by request_resources()
498
+ nodes_not_allowed_to_terminate: FrozenSet[NodeID] = {}
499
+ if self.load_metrics.get_resource_requests():
500
+ nodes_not_allowed_to_terminate = (
501
+ self._get_nodes_needed_for_request_resources(sorted_node_ids)
502
+ )
503
+
504
+ # Tracks counts of nodes we intend to keep for each node type.
505
+ node_type_counts = defaultdict(int)
506
+
507
+ def keep_node(node_id: NodeID) -> None:
508
+ assert self.provider
509
+ # Update per-type counts.
510
+ tags = self.provider.node_tags(node_id)
511
+ if TAG_RAY_USER_NODE_TYPE in tags:
512
+ node_type = tags[TAG_RAY_USER_NODE_TYPE]
513
+ node_type_counts[node_type] += 1
514
+
515
+ # Nodes that we could terminate, if needed.
516
+ nodes_we_could_terminate: List[NodeID] = []
517
+
518
+ for node_id in sorted_node_ids:
519
+ # Make sure to not kill idle node types if the number of workers
520
+ # of that type is lower/equal to the min_workers of that type
521
+ # or it is needed for request_resources().
522
+ should_keep_or_terminate, reason = self._keep_worker_of_node_type(
523
+ node_id, node_type_counts
524
+ )
525
+ if should_keep_or_terminate == KeepOrTerminate.terminate:
526
+ self.schedule_node_termination(node_id, reason, logger.info)
527
+ continue
528
+ if (
529
+ should_keep_or_terminate == KeepOrTerminate.keep
530
+ or node_id in nodes_not_allowed_to_terminate
531
+ ) and self.launch_config_ok(node_id):
532
+ keep_node(node_id)
533
+ continue
534
+
535
+ node_ip = self.provider.internal_ip(node_id)
536
+
537
+ if node_ip in last_used and last_used[node_ip] < last_used_cutoff:
538
+ self.schedule_node_termination(node_id, "idle", logger.info)
539
+ # Get the local time of the node's last use as a string.
540
+ formatted_last_used_time = time.asctime(
541
+ time.localtime(last_used[node_ip])
542
+ )
543
+ logger.info(f"Node last used: {formatted_last_used_time}.")
544
+ # Note that the current time will appear in the log prefix.
545
+ elif not self.launch_config_ok(node_id):
546
+ self.schedule_node_termination(node_id, "outdated", logger.info)
547
+ else:
548
+ keep_node(node_id)
549
+ nodes_we_could_terminate.append(node_id)
550
+
551
+ # Terminate nodes if there are too many
552
+ num_workers = len(self.non_terminated_nodes.worker_ids)
553
+ num_extra_nodes_to_terminate = (
554
+ num_workers - len(self.nodes_to_terminate) - self.config["max_workers"]
555
+ )
556
+
557
+ if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
558
+ logger.warning(
559
+ "StandardAutoscaler: trying to terminate "
560
+ f"{num_extra_nodes_to_terminate} nodes, while only "
561
+ f"{len(nodes_we_could_terminate)} are safe to terminate."
562
+ " Inconsistent config is likely."
563
+ )
564
+ num_extra_nodes_to_terminate = len(nodes_we_could_terminate)
565
+
566
+ # If num_extra_nodes_to_terminate is negative or zero,
567
+ # we would have less than max_workers nodes after terminating
568
+ # nodes_to_terminate and we do not need to terminate anything else.
569
+ if num_extra_nodes_to_terminate > 0:
570
+ extra_nodes_to_terminate = nodes_we_could_terminate[
571
+ -num_extra_nodes_to_terminate:
572
+ ]
573
+ for node_id in extra_nodes_to_terminate:
574
+ self.schedule_node_termination(node_id, "max workers", logger.info)
575
+
576
+ self.terminate_scheduled_nodes()
577
+
578
+ def schedule_node_termination(
579
+ self, node_id: NodeID, reason_opt: Optional[str], logger_method: Callable
580
+ ) -> None:
581
+ # For type checking, assert that this object has been instantitiated.
582
+ assert self.provider
583
+
584
+ if reason_opt is None:
585
+ raise Exception("reason should be not None.")
586
+ reason: str = reason_opt
587
+ node_ip = self.provider.internal_ip(node_id)
588
+ # Log, record an event, and add node_id to nodes_to_terminate.
589
+ logger_method(
590
+ "StandardAutoscaler: "
591
+ f"Terminating the node with id {node_id}"
592
+ f" and ip {node_ip}."
593
+ f" ({reason})"
594
+ )
595
+ self.event_summarizer.add(
596
+ "Removing {} nodes of type "
597
+ + self._get_node_type(node_id)
598
+ + " ({}).".format(reason),
599
+ quantity=1,
600
+ aggregate=operator.add,
601
+ )
602
+ self.nodes_to_terminate.append(node_id)
603
+
604
+ def terminate_scheduled_nodes(self):
605
+ """Terminate scheduled nodes and clean associated autoscaler state."""
606
+ # For type checking, assert that these objects have been instantitiated.
607
+ assert self.provider
608
+ assert self.non_terminated_nodes
609
+
610
+ if not self.nodes_to_terminate:
611
+ return
612
+
613
+ # Drain the nodes
614
+ self.drain_nodes_via_gcs(self.nodes_to_terminate)
615
+ # Terminate the nodes
616
+ self.provider.terminate_nodes(self.nodes_to_terminate)
617
+ for node in self.nodes_to_terminate:
618
+ self.node_tracker.untrack(node)
619
+ self.prom_metrics.stopped_nodes.inc()
620
+
621
+ # Update internal node lists
622
+ self.non_terminated_nodes.remove_terminating_nodes(self.nodes_to_terminate)
623
+
624
+ self.nodes_to_terminate = []
625
+
626
    def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]):
        """Send an RPC request to the GCS to drain (prepare for termination)
        the nodes with the given node provider ids.

        Best-effort: failures are logged and recorded in Prometheus but never
        raised, so they cannot interrupt the autoscaler update loop.

        note: The current implementation of DrainNode on the GCS side is to
        de-register and gracefully shut down the Raylets. In the future,
        the behavior may change to better reflect the name "Drain."
        See https://github.com/ray-project/ray/pull/19350.

        Args:
            provider_node_ids_to_drain: NodeProvider ids of nodes to drain.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        # The GCS expects Raylet ids in the request, rather than NodeProvider
        # ids. To get the Raylet ids of the nodes to we're draining, we make
        # the following translations of identifiers:
        # node provider node id -> ip -> raylet id

        # Convert node provider node ids to ips.
        node_ips = set()
        failed_ip_fetch = False
        for provider_node_id in provider_node_ids_to_drain:
            # If the provider's call to fetch ip fails, the exception is not
            # fatal. Log the exception and proceed.
            try:
                ip = self.provider.internal_ip(provider_node_id)
                node_ips.add(ip)
            except Exception:
                logger.exception(
                    "Failed to get ip of node with id"
                    f" {provider_node_id} during scale-down."
                )
                failed_ip_fetch = True
        if failed_ip_fetch:
            self.prom_metrics.drain_node_exceptions.inc()

        # Only attempt to drain connected nodes, i.e. nodes with ips in
        # LoadMetrics.
        connected_node_ips = node_ips & self.load_metrics.raylet_id_by_ip.keys()

        # Convert ips to Raylet ids.
        # (The assignment ip->raylet_id is well-defined under current
        # assumptions. See "use_node_id_as_ip" in monitor.py)
        raylet_ids_to_drain = {
            self.load_metrics.raylet_id_by_ip[ip] for ip in connected_node_ips
        }

        if not raylet_ids_to_drain:
            return

        logger.info(f"Draining {len(raylet_ids_to_drain)} raylet(s).")
        try:
            # A successful response indicates that the GCS has marked the
            # desired nodes as "drained." The cloud provider can then terminate
            # the nodes without the GCS printing an error.
            # Check if we succeeded in draining all of the intended nodes by
            # looking at the RPC response.
            drained_raylet_ids = set(
                self.gcs_client.drain_nodes(raylet_ids_to_drain, timeout=5)
            )
            failed_to_drain = raylet_ids_to_drain - drained_raylet_ids
            if failed_to_drain:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.error(f"Failed to drain {len(failed_to_drain)} raylet(s).")
        # If we get a gRPC error with an UNIMPLEMENTED code, fail silently.
        # This error indicates that the GCS is using Ray version < 1.8.0,
        # for which DrainNode is not implemented.
        except RpcError as e:
            # If the code is UNIMPLEMENTED, pass.
            if e.rpc_code == ray._raylet.GRPC_STATUS_CODE_UNIMPLEMENTED:
                pass
            # Otherwise, it's a plain old gRPC error and we should log it.
            else:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.exception("Failed to drain Ray nodes. Traceback follows.")
        except Exception:
            # We don't need to interrupt the autoscaler update with an
            # exception, but we should log what went wrong and record the
            # failure in Prometheus.
            self.prom_metrics.drain_node_exceptions.inc()
            logger.exception("Failed to drain Ray nodes. Traceback follows.")
707
+ def launch_required_nodes(self, to_launch: Dict[NodeType, int]) -> None:
708
+ if to_launch:
709
+ for node_type, count in to_launch.items():
710
+ self.launch_new_node(count, node_type=node_type)
711
+
712
+ def update_nodes(self):
713
+ """Run NodeUpdaterThreads to run setup commands, sync files,
714
+ and/or start Ray.
715
+ """
716
+ # Update nodes with out-of-date files.
717
+ # TODO(edoakes): Spawning these threads directly seems to cause
718
+ # problems. They should at a minimum be spawned as daemon threads.
719
+ # See https://github.com/ray-project/ray/pull/5903 for more info.
720
+ T = []
721
+ for node_id, setup_commands, ray_start_commands, docker_config in (
722
+ self.should_update(node_id)
723
+ for node_id in self.non_terminated_nodes.worker_ids
724
+ ):
725
+ if node_id is not None:
726
+ resources = self._node_resources(node_id)
727
+ labels = self._node_labels(node_id)
728
+ logger.debug(f"{node_id}: Starting new thread runner.")
729
+ T.append(
730
+ threading.Thread(
731
+ target=self.spawn_updater,
732
+ args=(
733
+ node_id,
734
+ setup_commands,
735
+ ray_start_commands,
736
+ resources,
737
+ labels,
738
+ docker_config,
739
+ ),
740
+ )
741
+ )
742
+ for t in T:
743
+ t.start()
744
+ for t in T:
745
+ t.join()
746
+
747
    def process_completed_updates(self):
        """Clean up completed NodeUpdaterThreads.

        Successful updaters bump success metrics and mark the node active in
        LoadMetrics; failed updaters bump failure metrics, and their nodes
        (if still present) are scheduled for termination.
        """
        completed_nodes = []
        for node_id, updater in self.updaters.items():
            if not updater.is_alive():
                completed_nodes.append(node_id)
        if completed_nodes:
            failed_nodes = []
            for node_id in completed_nodes:
                updater = self.updaters[node_id]
                if updater.exitcode == 0:
                    self.num_successful_updates[node_id] += 1
                    self.prom_metrics.successful_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.successful_recoveries.inc()
                    if updater.update_time:
                        self.prom_metrics.worker_update_time.observe(
                            updater.update_time
                        )
                    # Mark the node as active to prevent the node recovery
                    # logic immediately trying to restart Ray on the new node.
                    self.load_metrics.mark_active(self.provider.internal_ip(node_id))
                else:
                    failed_nodes.append(node_id)
                    self.num_failed_updates[node_id] += 1
                    self.prom_metrics.failed_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.failed_recoveries.inc()
                    self.node_tracker.untrack(node_id)
                del self.updaters[node_id]

            if failed_nodes:
                # Some nodes in failed_nodes may already have been terminated
                # during an update (for being idle after missing a heartbeat).

                # Update the list of non-terminated workers.
                for node_id in failed_nodes:
                    # Check if the node has already been terminated.
                    if node_id in self.non_terminated_nodes.worker_ids:
                        self.schedule_node_termination(
                            node_id, "launch failed", logger.error
                        )
                    else:
                        logger.warning(
                            f"StandardAutoscaler: {node_id}:"
                            " Failed to update node."
                            " Node has already been terminated."
                        )
                self.terminate_scheduled_nodes()
797
+ def set_prometheus_updater_data(self):
798
+ """Record total number of active NodeUpdaterThreads and how many of
799
+ these are being run to recover nodes.
800
+ """
801
+ self.prom_metrics.updating_nodes.set(len(self.updaters))
802
+ num_recovering = 0
803
+ for updater in self.updaters.values():
804
+ if updater.for_recovery:
805
+ num_recovering += 1
806
+ self.prom_metrics.recovering_nodes.set(num_recovering)
807
+
808
+ def _report_pending_infeasible(self, unfulfilled: List[ResourceDict]):
809
+ """Emit event messages for infeasible or unschedulable tasks.
810
+
811
+ This adds messages to the event summarizer for warning on infeasible
812
+ or "cluster full" resource requests.
813
+
814
+ Args:
815
+ unfulfilled: List of resource demands that would be unfulfilled
816
+ even after full scale-up.
817
+ """
818
+ # For type checking, assert that this object has been instantitiated.
819
+ assert self.resource_demand_scheduler
820
+ pending = []
821
+ infeasible = []
822
+ for bundle in unfulfilled:
823
+ placement_group = any(
824
+ "_group_" in k
825
+ or k == ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME
826
+ for k in bundle
827
+ )
828
+ if placement_group:
829
+ continue
830
+ if self.resource_demand_scheduler.is_feasible(bundle):
831
+ pending.append(bundle)
832
+ else:
833
+ infeasible.append(bundle)
834
+ if pending:
835
+ if self.load_metrics.cluster_full_of_actors_detected:
836
+ for request in pending:
837
+ self.event_summarizer.add_once_per_interval(
838
+ "Warning: The following resource request cannot be "
839
+ "scheduled right now: {}. This is likely due to all "
840
+ "cluster resources being claimed by actors. Consider "
841
+ "creating fewer actors or adding more nodes "
842
+ "to this Ray cluster.".format(request),
843
+ key="pending_{}".format(sorted(request.items())),
844
+ interval_s=30,
845
+ )
846
+ if infeasible:
847
+ for request in infeasible:
848
+ self.event_summarizer.add_once_per_interval(
849
+ "Error: No available node types can fulfill resource "
850
+ "request {}. Add suitable node types to this cluster to "
851
+ "resolve this issue.".format(request),
852
+ key="infeasible_{}".format(sorted(request.items())),
853
+ interval_s=30,
854
+ )
855
+
856
+ def _sort_based_on_last_used(
857
+ self, nodes: List[NodeID], last_used: Dict[str, float]
858
+ ) -> List[NodeID]:
859
+ """Sort the nodes based on the last time they were used.
860
+
861
+ The first item in the return list is the most recently used.
862
+ """
863
+ last_used_copy = copy.deepcopy(last_used)
864
+ # Add the unconnected nodes as the least recently used (the end of
865
+ # list). This prioritizes connected nodes.
866
+ least_recently_used = -1
867
+
868
+ def last_time_used(node_id: NodeID):
869
+ assert self.provider
870
+ node_ip = self.provider.internal_ip(node_id)
871
+ if node_ip not in last_used_copy:
872
+ return least_recently_used
873
+ else:
874
+ return last_used_copy[node_ip]
875
+
876
+ return sorted(nodes, key=last_time_used, reverse=True)
877
+
878
    def _get_nodes_needed_for_request_resources(
        self, sorted_node_ids: List[NodeID]
    ) -> FrozenSet[NodeID]:
        # TODO(ameer): try merging this with resource_demand_scheduler
        # code responsible for adding nodes for request_resources().
        """Returns the nodes NOT allowed to terminate due to request_resources().

        Bin-packs the outstanding request_resources() demands onto the head
        node plus the workers (most recently used first); any worker whose
        capacity was even partially consumed by that packing is protected.

        Args:
            sorted_node_ids: the node ids sorted based on last used (LRU last).

        Returns:
            FrozenSet[NodeID]: a set of nodes (node ids) that
                we should NOT terminate.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        nodes_not_allowed_to_terminate: Set[NodeID] = set()
        static_node_resources: Dict[
            NodeIP, ResourceDict
        ] = self.load_metrics.get_static_node_resources_by_ip()

        head_node_resources: ResourceDict = copy.deepcopy(
            self.available_node_types[self.config["head_node_type"]]["resources"]
        )
        # TODO(ameer): this is somewhat duplicated in
        # resource_demand_scheduler.py.
        if not head_node_resources:
            # Legacy yaml might include {} in the resources field.
            head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
            head_node_resources = static_node_resources.get(head_node_ip, {})

        max_node_resources: List[ResourceDict] = [head_node_resources]
        resource_demand_vector_worker_node_ids = []
        # Get max resources on all the non terminated nodes.
        for node_id in sorted_node_ids:
            tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in tags:
                node_type = tags[TAG_RAY_USER_NODE_TYPE]
                node_resources: ResourceDict = copy.deepcopy(
                    self.available_node_types[node_type]["resources"]
                )
                if not node_resources:
                    # Legacy yaml might include {} in the resources field.
                    node_ip = self.provider.internal_ip(node_id)
                    node_resources = static_node_resources.get(node_ip, {})
                max_node_resources.append(node_resources)
                resource_demand_vector_worker_node_ids.append(node_id)
        # Since it is sorted based on last used, we "keep" nodes that are
        # most recently used when we binpack. We assume get_bin_pack_residual
        # is following the given order here.
        used_resource_requests: List[ResourceDict]
        _, used_resource_requests = get_bin_pack_residual(
            max_node_resources, self.load_metrics.get_resource_requests()
        )
        # Remove the first entry (the head node).
        max_node_resources.pop(0)
        # Remove the first entry (the head node).
        used_resource_requests.pop(0)
        # After popping the head entries, index i in both lists corresponds to
        # resource_demand_vector_worker_node_ids[i].
        for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
            if (
                used_resource_requests[i] == max_node_resources[i]
                and max_node_resources[i]
            ):
                # No resources of the node were needed for request_resources().
                # max_node_resources[i] is an empty dict for legacy yamls
                # before the node is connected.
                pass
            else:
                nodes_not_allowed_to_terminate.add(node_id)
        return frozenset(nodes_not_allowed_to_terminate)
950
+ def _keep_worker_of_node_type(
951
+ self, node_id: NodeID, node_type_counts: Dict[NodeType, int]
952
+ ) -> Tuple[KeepOrTerminate, Optional[str]]:
953
+ """Determines if a worker should be kept based on the min_workers
954
+ and max_workers constraint of the worker's node_type.
955
+
956
+ Returns KeepOrTerminate.keep when both of the following hold:
957
+ (a) The worker's node_type is present among the keys of the current
958
+ config's available_node_types dict.
959
+ (b) Deleting the node would violate the min_workers constraint for that
960
+ worker's node_type.
961
+
962
+ Returns KeepOrTerminate.terminate when both the following hold:
963
+ (a) The worker's node_type is not present among the keys of the current
964
+ config's available_node_types dict.
965
+ (b) Keeping the node would violate the max_workers constraint for that
966
+ worker's node_type.
967
+
968
+ Return KeepOrTerminate.decide_later otherwise.
969
+
970
+ Args:
971
+ node_type_counts(Dict[NodeType, int]): The non_terminated node
972
+ types counted so far.
973
+ Returns:
974
+ KeepOrTerminate: keep if the node should be kept, terminate if the
975
+ node should be terminated, decide_later if we are allowed
976
+ to terminate it, but do not have to.
977
+ Optional[str]: reason for termination. Not None on
978
+ KeepOrTerminate.terminate, None otherwise.
979
+ """
980
+ # For type checking, assert that this object has been instantitiated.
981
+ assert self.provider
982
+
983
+ tags = self.provider.node_tags(node_id)
984
+ if TAG_RAY_USER_NODE_TYPE in tags:
985
+ node_type = tags[TAG_RAY_USER_NODE_TYPE]
986
+
987
+ min_workers = self.available_node_types.get(node_type, {}).get(
988
+ "min_workers", 0
989
+ )
990
+ max_workers = self.available_node_types.get(node_type, {}).get(
991
+ "max_workers", 0
992
+ )
993
+ if node_type not in self.available_node_types:
994
+ # The node type has been deleted from the cluster config.
995
+ # Allow terminating it if needed.
996
+ available_node_types = list(self.available_node_types.keys())
997
+ return (
998
+ KeepOrTerminate.terminate,
999
+ f"not in available_node_types: {available_node_types}",
1000
+ )
1001
+ new_count = node_type_counts[node_type] + 1
1002
+ if new_count <= min(min_workers, max_workers):
1003
+ return KeepOrTerminate.keep, None
1004
+ if new_count > max_workers:
1005
+ return KeepOrTerminate.terminate, "max_workers_per_type"
1006
+
1007
+ return KeepOrTerminate.decide_later, None
1008
+
1009
+ def _node_resources(self, node_id):
1010
+ node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
1011
+ if self.available_node_types:
1012
+ return self.available_node_types.get(node_type, {}).get("resources", {})
1013
+ else:
1014
+ return {}
1015
+
1016
+ def _node_labels(self, node_id):
1017
+ node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
1018
+ if self.available_node_types:
1019
+ return self.available_node_types.get(node_type, {}).get("labels", {})
1020
+ else:
1021
+ return {}
1022
+
1023
    def reset(self, errors_fatal=False):
        """Re-read the autoscaler config and rebuild dependent state.

        Re-runs the config reader, validates the result, recomputes runtime
        hashes, (re)creates the node provider if needed, and creates or
        reconfigures the ResourceDemandScheduler.

        Args:
            errors_fatal: If True, re-raise any error; otherwise log it and
                keep the previous state.
        """
        sync_continuously = False
        if hasattr(self, "config"):
            sync_continuously = self.config.get("file_mounts_sync_continuously", False)
        try:
            new_config = self.config_reader()
            if new_config != getattr(self, "config", None):
                try:
                    validate_config(new_config)
                except Exception as e:
                    self.prom_metrics.config_validation_exceptions.inc()
                    logger.debug(
                        "Cluster config validation failed. The version of "
                        "the ray CLI you launched this cluster with may "
                        "be higher than the version of ray being run on "
                        "the cluster. Some new features may not be "
                        "available until you upgrade ray on your cluster.",
                        exc_info=e,
                    )
                logger.debug(
                    f"New config after validation: {new_config},"
                    f" of type: {type(new_config)}"
                )
            (new_runtime_hash, new_file_mounts_contents_hash) = hash_runtime_conf(
                new_config["file_mounts"],
                new_config["cluster_synced_files"],
                [
                    new_config["worker_setup_commands"],
                    new_config["worker_start_ray_commands"],
                ],
                generate_file_mounts_contents_hash=sync_continuously,
            )
            self.config = new_config
            self.runtime_hash = new_runtime_hash
            self.file_mounts_contents_hash = new_file_mounts_contents_hash
            if not self.provider:
                self.provider = _get_node_provider(
                    self.config["provider"], self.config["cluster_name"]
                )

            # If using the LocalNodeProvider, make sure the head node is marked
            # non-terminated.
            if isinstance(self.provider, LocalNodeProvider):
                record_local_head_state_if_needed(self.provider)

            self.available_node_types = self.config["available_node_types"]
            upscaling_speed = self.config.get("upscaling_speed")
            aggressive = self.config.get("autoscaling_mode") == "aggressive"
            target_utilization_fraction = self.config.get("target_utilization_fraction")
            # Translate the legacy scaling knobs into upscaling_speed.
            if upscaling_speed:
                upscaling_speed = float(upscaling_speed)
            # TODO(ameer): consider adding (if users ask) an option of
            # initial_upscaling_num_workers.
            elif aggressive:
                upscaling_speed = 99999
                logger.warning(
                    "Legacy aggressive autoscaling mode "
                    "detected. Replacing it by setting upscaling_speed to "
                    "99999."
                )
            elif target_utilization_fraction:
                upscaling_speed = 1 / max(target_utilization_fraction, 0.001) - 1
                logger.warning(
                    "Legacy target_utilization_fraction config "
                    "detected. Replacing it by setting upscaling_speed to "
                    + "1 / target_utilization_fraction - 1."
                )
            else:
                upscaling_speed = 1.0
            if self.resource_demand_scheduler:
                # The node types are autofilled internally for legacy yamls,
                # overwriting the class will remove the inferred node resources
                # for legacy yamls.
                self.resource_demand_scheduler.reset_config(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )
            else:
                self.resource_demand_scheduler = ResourceDemandScheduler(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )

        except Exception as e:
            self.prom_metrics.reset_exceptions.inc()
            if errors_fatal:
                raise e
            else:
                logger.exception("StandardAutoscaler: Error parsing config.")
1119
    def launch_config_ok(self, node_id):
        """Return True if the node's recorded launch config is still current.

        Compares the launch-config hash stored in the node's tags against a
        hash freshly computed from the present cluster config; a mismatch
        means the node was launched with a stale config and should not be
        kept.
        """
        if self.disable_launch_config_check:
            # Operator explicitly opted out of config checking.
            return True
        node_tags = self.provider.node_tags(node_id)
        tag_launch_conf = node_tags.get(TAG_RAY_LAUNCH_CONFIG)
        node_type = node_tags.get(TAG_RAY_USER_NODE_TYPE)
        if node_type not in self.available_node_types:
            # The node type has been deleted from the cluster config.
            # Don't keep the node.
            return False

        # The `worker_nodes` field is deprecated in favor of per-node-type
        # node_configs. We allow it for backwards-compatibility.
        launch_config = copy.deepcopy(self.config.get("worker_nodes", {}))
        if node_type:
            launch_config.update(
                self.config["available_node_types"][node_type]["node_config"]
            )
        calculated_launch_hash = hash_launch_conf(launch_config, self.config["auth"])

        if calculated_launch_hash != tag_launch_conf:
            return False
        return True
1142
+
1143
    def files_up_to_date(self, node_id):
        """Return True if the node's runtime config and file mounts are current.

        Checks the runtime-config hash tag, and (only when the local
        file-mounts contents hash is known) the file-mounts contents hash tag,
        against the hashes computed for the current config.
        """
        node_tags = self.provider.node_tags(node_id)
        applied_config_hash = node_tags.get(TAG_RAY_RUNTIME_CONFIG)
        applied_file_mounts_contents_hash = node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
        if applied_config_hash != self.runtime_hash or (
            # A None contents hash means it hasn't been computed locally,
            # so it is not used as a staleness signal.
            self.file_mounts_contents_hash is not None
            and self.file_mounts_contents_hash != applied_file_mounts_contents_hash
        ):
            logger.info(
                "StandardAutoscaler: "
                "{}: Runtime state is ({},{}), want ({},{})".format(
                    node_id,
                    applied_config_hash,
                    applied_file_mounts_contents_hash,
                    self.runtime_hash,
                    self.file_mounts_contents_hash,
                )
            )
            return False
        return True
1163
+
1164
+ def heartbeat_on_time(self, node_id: NodeID, now: float) -> bool:
1165
+ """Determine whether we've received a heartbeat from a node within the
1166
+ last AUTOSCALER_HEARTBEAT_TIMEOUT_S seconds.
1167
+ """
1168
+ # For type checking, assert that this object has been instantitiated.
1169
+ assert self.provider
1170
+
1171
+ key = self.provider.internal_ip(node_id)
1172
+
1173
+ if key in self.load_metrics.last_heartbeat_time_by_ip:
1174
+ last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key]
1175
+ delta = now - last_heartbeat_time
1176
+ if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S:
1177
+ return True
1178
+ return False
1179
+
1180
    def terminate_unhealthy_nodes(self, now: float):
        """Schedule termination for nodes that missed their heartbeat deadline.

        Only up-to-date workers are considered; scheduled terminations are
        flushed at the end via terminate_scheduled_nodes().
        """
        # For type checking, assert that these objects have been instantiated.
        assert self.provider
        assert self.non_terminated_nodes

        for node_id in self.non_terminated_nodes.worker_ids:
            node_status = self.provider.node_tags(node_id)[TAG_RAY_NODE_STATUS]
            # We're not responsible for taking down
            # nodes with pending or failed status:
            if not node_status == STATUS_UP_TO_DATE:
                continue
            # This node is up-to-date. If it hasn't had the chance to produce
            # a heartbeat, fake the heartbeat now (see logic for completed node
            # updaters).
            ip = self.provider.internal_ip(node_id)
            if ip not in self.load_metrics.last_heartbeat_time_by_ip:
                self.load_metrics.mark_active(ip)
            # Heartbeat indicates node is healthy:
            if self.heartbeat_on_time(node_id, now):
                continue
            self.schedule_node_termination(
                node_id, "lost contact with raylet", logger.warning
            )
        self.terminate_scheduled_nodes()
1207
+
1208
+ def attempt_to_recover_unhealthy_nodes(self, now):
1209
+ for node_id in self.non_terminated_nodes.worker_ids:
1210
+ self.recover_if_needed(node_id, now)
1211
+
1212
    def recover_if_needed(self, node_id, now):
        """Restart Ray on a node that stopped heartbeating, via a recovery
        NodeUpdaterThread.

        No-op when the node cannot be updated (already updating, failed
        before, config mismatch) or when its heartbeat is on time. The
        recovery updater only re-runs the worker start commands; it syncs no
        files and runs no setup commands.
        """
        if not self.can_update(node_id):
            return
        if self.heartbeat_on_time(node_id, now):
            return

        logger.warning(
            "StandardAutoscaler: "
            "{}: No recent heartbeat, "
            "restarting Ray to recover...".format(node_id)
        )
        self.event_summarizer.add(
            "Restarting {} nodes of type "
            + self._get_node_type(node_id)
            + " (lost contact with raylet).",
            quantity=1,
            aggregate=operator.add,
        )
        head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=self.config["provider"],
            provider=self.provider,
            auth_config=self.config["auth"],
            cluster_name=self.config["cluster_name"],
            # Recovery intentionally skips file syncing and setup:
            file_mounts={},
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=with_head_node_ip(
                self.config["worker_start_ray_commands"], head_node_ip
            ),
            runtime_hash=self.runtime_hash,
            file_mounts_contents_hash=self.file_mounts_contents_hash,
            process_runner=self.process_runner,
            use_internal_ip=True,
            is_head_node=False,
            docker_config=self.config.get("docker"),
            node_resources=self._node_resources(node_id),
            node_labels=self._node_labels(node_id),
            for_recovery=True,
        )
        updater.start()
        self.updaters[node_id] = updater
1255
+
1256
+ def _get_node_type(self, node_id: str) -> str:
1257
+ # For type checking, assert that this object has been instantitiated.
1258
+ assert self.provider
1259
+
1260
+ node_tags = self.provider.node_tags(node_id)
1261
+ if TAG_RAY_USER_NODE_TYPE in node_tags:
1262
+ return node_tags[TAG_RAY_USER_NODE_TYPE]
1263
+ else:
1264
+ return "unknown_node_type"
1265
+
1266
    def _get_node_type_specific_fields(self, node_id: str, fields_key: str) -> Any:
        """Return config[fields_key], overridden by the node-type-specific
        value when the node's type declares one.

        Raises:
            ValueError: if the node carries a type tag that is no longer in
                available_node_types.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        fields = self.config[fields_key]
        node_tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in node_tags:
            node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
            if node_type not in self.available_node_types:
                raise ValueError(f"Unknown node type tag: {node_type}.")
            node_specific_config = self.available_node_types[node_type]
            if fields_key in node_specific_config:
                fields = node_specific_config[fields_key]
        return fields
1280
+
1281
+ def _get_node_specific_docker_config(self, node_id):
1282
+ if "docker" not in self.config:
1283
+ return {}
1284
+ docker_config = copy.deepcopy(self.config.get("docker", {}))
1285
+ node_specific_docker = self._get_node_type_specific_fields(node_id, "docker")
1286
+ docker_config.update(node_specific_docker)
1287
+ return docker_config
1288
+
1289
    def should_update(self, node_id):
        """Decide whether and how to update a node.

        Returns UpdateInstructions with the setup/ray-start commands to run,
        or an all-None instance when no update is needed (node busy, failed
        before, or already up to date).
        """
        if not self.can_update(node_id):
            return UpdateInstructions(None, None, None, None)  # no update

        status = self.provider.node_tags(node_id).get(TAG_RAY_NODE_STATUS)
        if status == STATUS_UP_TO_DATE and self.files_up_to_date(node_id):
            return UpdateInstructions(None, None, None, None)  # no update

        successful_updated = self.num_successful_updates.get(node_id, 0) > 0
        if successful_updated and self.config.get("restart_only", False):
            # Previously-updated node; only restart Ray, skip setup.
            setup_commands = []
            ray_start_commands = self.config["worker_start_ray_commands"]
        elif successful_updated and self.config.get("no_restart", False):
            # Previously-updated node; re-run setup but do not restart Ray.
            setup_commands = self._get_node_type_specific_fields(
                node_id, "worker_setup_commands"
            )
            ray_start_commands = []
        else:
            setup_commands = self._get_node_type_specific_fields(
                node_id, "worker_setup_commands"
            )
            ray_start_commands = self.config["worker_start_ray_commands"]

        docker_config = self._get_node_specific_docker_config(node_id)
        return UpdateInstructions(
            node_id=node_id,
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            docker_config=docker_config,
        )
1319
+
1320
    def spawn_updater(
        self,
        node_id,
        setup_commands,
        ray_start_commands,
        node_resources,
        node_labels,
        docker_config,
    ):
        """Start a background NodeUpdaterThread that brings the node up to
        date (file mounts, init/setup commands, Ray start) and register it in
        self.updaters."""
        logger.info(
            f"Creating new (spawn_updater) updater thread for node" f" {node_id}."
        )
        ip = self.provider.internal_ip(node_id)
        node_type = self._get_node_type(node_id)
        self.node_tracker.track(node_id, ip, node_type)
        head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=self.config["provider"],
            provider=self.provider,
            auth_config=self.config["auth"],
            cluster_name=self.config["cluster_name"],
            file_mounts=self.config["file_mounts"],
            initialization_commands=with_head_node_ip(
                self._get_node_type_specific_fields(node_id, "initialization_commands"),
                head_node_ip,
            ),
            setup_commands=with_head_node_ip(setup_commands, head_node_ip),
            ray_start_commands=with_head_node_ip(ray_start_commands, head_node_ip),
            runtime_hash=self.runtime_hash,
            file_mounts_contents_hash=self.file_mounts_contents_hash,
            is_head_node=False,
            cluster_synced_files=self.config["cluster_synced_files"],
            rsync_options={
                "rsync_exclude": self.config.get("rsync_exclude"),
                "rsync_filter": self.config.get("rsync_filter"),
            },
            process_runner=self.process_runner,
            use_internal_ip=True,
            docker_config=docker_config,
            node_resources=node_resources,
            node_labels=node_labels,
        )
        updater.start()
        self.updaters[node_id] = updater
1365
+
1366
+ def can_update(self, node_id):
1367
+ if self.disable_node_updaters:
1368
+ return False
1369
+ if node_id in self.updaters:
1370
+ return False
1371
+ if not self.launch_config_ok(node_id):
1372
+ return False
1373
+ if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry?
1374
+ return False
1375
+ logger.debug(
1376
+ f"{node_id} is not being updated and "
1377
+ "passes config check (can_update=True)."
1378
+ )
1379
+ return True
1380
+
1381
    def launch_new_node(self, count: int, node_type: str) -> None:
        """Request `count` new nodes of `node_type`.

        Either launches synchronously in the foreground (blocking) or enqueues
        batched launch requests for the background launcher threads.
        """
        logger.info("StandardAutoscaler: Queue {} new nodes for launch".format(count))
        self.pending_launches.inc(node_type, count)
        config = copy.deepcopy(self.config)
        if self.foreground_node_launch:
            assert self.foreground_node_launcher is not None
            # Launch in the main thread and block.
            self.foreground_node_launcher.launch_node(config, count, node_type)
        else:
            assert self.launch_queue is not None
            # Split into individual launch requests of the max batch size.
            while count > 0:
                # Enqueue launch data for the background NodeUpdater threads.
                self.launch_queue.put(
                    (config, min(count, self.max_launch_batch), node_type)
                )
                count -= self.max_launch_batch
1398
+
1399
    def kill_workers(self):
        """Terminate every worker node immediately and untrack them.

        Logged at error level because this is a destructive, unconditional
        operation.
        """
        logger.error("StandardAutoscaler: kill_workers triggered")
        nodes = self.workers()
        if nodes:
            self.provider.terminate_nodes(nodes)
            for node in nodes:
                self.node_tracker.untrack(node)
                self.prom_metrics.stopped_nodes.inc()
        logger.error("StandardAutoscaler: terminated {} node(s)".format(len(nodes)))
1408
+
1409
    def summary(self) -> Optional[AutoscalerSummary]:
        """Summarizes the active, pending, and failed node launches.

        An active node is a node whose raylet is actively reporting heartbeats.
        A pending node is non-active node whose node tag is uninitialized,
        waiting for ssh, syncing files, or setting up.
        If a node is not pending or active, it is failed.

        Returns:
            AutoscalerSummary: The summary, or None when the node list has
            not been fetched yet.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        if not self.non_terminated_nodes:
            return None
        active_nodes: Dict[NodeType, int] = Counter()
        pending_nodes = []
        failed_nodes = []
        non_failed = set()

        # ip -> node type, for all tagged, managed nodes seen below.
        node_type_mapping = {}

        for node_id in self.non_terminated_nodes.all_node_ids:
            ip = self.provider.internal_ip(node_id)
            node_tags = self.provider.node_tags(node_id)

            if not all(
                tag in node_tags
                for tag in (
                    TAG_RAY_NODE_KIND,
                    TAG_RAY_USER_NODE_TYPE,
                    TAG_RAY_NODE_STATUS,
                )
            ):
                # In some node providers, creation of a node and tags is not
                # atomic, so just skip it.
                continue

            if node_tags[TAG_RAY_NODE_KIND] == NODE_KIND_UNMANAGED:
                continue
            node_type = node_tags[TAG_RAY_USER_NODE_TYPE]

            node_type_mapping[ip] = node_type

            # TODO (Alex): If a node's raylet has died, it shouldn't be marked
            # as active.
            is_active = self.load_metrics.is_active(ip)
            if is_active:
                active_nodes[node_type] += 1
                non_failed.add(node_id)
            else:
                status = node_tags[TAG_RAY_NODE_STATUS]
                completed_states = [STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED]
                is_pending = status not in completed_states
                if is_pending:
                    pending_nodes.append((node_id, ip, node_type, status))
                    non_failed.add(node_id)

        # Anything the tracker knows about that is neither active nor
        # pending is reported as failed.
        failed_nodes = self.node_tracker.get_all_failed_node_info(non_failed)

        # The concurrent counter leaves some 0 counts in, so we need to
        # manually filter those out.
        pending_launches = {}
        for node_type, count in self.pending_launches.breakdown().items():
            if count:
                pending_launches[node_type] = count

        pending_resources = {}
        for node_resources in self.resource_demand_scheduler.calculate_node_resources(
            nodes=[node_id for node_id, _, _, _ in pending_nodes],
            pending_nodes=pending_launches,
            # We don't fill this field out because we're intentionally only
            # passing pending nodes (which aren't tracked by load metrics
            # anyways).
            unused_resources_by_ip={},
        )[0]:
            for key, value in node_resources.items():
                pending_resources[key] = value + pending_resources.get(key, 0)

        return AutoscalerSummary(
            # Convert active_nodes from counter to dict for later serialization
            active_nodes=dict(active_nodes),
            idle_nodes=None,
            pending_nodes=[
                (ip, node_type, status) for _, ip, node_type, status in pending_nodes
            ],
            pending_launches=pending_launches,
            failed_nodes=failed_nodes,
            node_availability_summary=self.node_provider_availability_tracker.summary(),
            pending_resources=pending_resources,
            node_type_mapping=node_type_mapping,
            legacy=True,
        )
1503
+
1504
+ def info_string(self):
1505
+ lm_summary = self.load_metrics.summary()
1506
+ autoscaler_summary = self.summary()
1507
+ assert autoscaler_summary
1508
+ return "\n" + format_info_string(lm_summary, autoscaler_summary)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logger implementing the Command Line Interface.
2
+
3
+ A replacement for the standard Python `logging` API
4
+ designed for implementing a better CLI UX for the cluster launcher.
5
+
6
+ Supports color, bold text, italics, underlines, etc.
7
+ (depending on TTY features)
8
+ as well as indentation and other structured output.
9
+ """
10
+ import inspect
11
+ import logging
12
+ import os
13
+ import sys
14
+ import time
15
+ from contextlib import contextmanager
16
+ from functools import wraps
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple
18
+
19
+ import click
20
+ import colorama
21
+
22
+ # Import ray first to use the bundled colorama
23
+ import ray # noqa: F401
24
+
25
+ if sys.platform == "win32":
26
+ import msvcrt
27
+ else:
28
+ import select
29
+
30
+
31
+ class _ColorfulMock:
32
+ def __init__(self):
33
+ # do not do any color work
34
+ self.identity = lambda x: x
35
+
36
+ self.colorful = self
37
+ self.colormode = None
38
+
39
+ self.NO_COLORS = None
40
+ self.ANSI_8_COLORS = None
41
+
42
+ def disable(self):
43
+ pass
44
+
45
+ @contextmanager
46
+ def with_style(self, x):
47
+ class IdentityClass:
48
+ def __getattr__(self, name):
49
+ return lambda y: y
50
+
51
+ yield IdentityClass()
52
+
53
+ def __getattr__(self, name):
54
+ if name == "with_style":
55
+ return self.with_style
56
+
57
+ return self.identity
58
+
59
+
60
# Prefer the real `colorful` package when available; otherwise fall back to
# the no-op mock so that styled-output calls still work unchanged.
try:
    import colorful as _cf
    from colorful.core import ColorfulString

    _cf.use_8_ansi_colors()
except ModuleNotFoundError:
    # We mock Colorful to restrict the colors used for consistency
    # anyway, so we also allow for not having colorful at all.
    # If the Ray Core dependency on colorful is ever removed,
    # the CliLogger code will still work.
    class ColorfulString:
        pass

    _cf = _ColorfulMock()
74
+
75
+
76
+ # We want to only allow specific formatting
77
+ # to prevent people from accidentally making bad looking color schemes.
78
+ #
79
+ # This is especially important since most will look bad on either light
80
+ # or dark themes.
81
class _ColorfulProxy:
    """Gatekeeper around `colorful` that only exposes an approved palette.

    Any callable attribute outside `_proxy_allowlist` raises, which keeps the
    CLI color scheme consistent (and readable on both light and dark themes).
    """

    _proxy_allowlist = [
        "disable",
        "reset",
        "bold",
        "italic",
        "underlined",
        # used instead of `gray` as `dimmed` adapts to
        # both light and dark themes
        "dimmed",
        "dodgerBlue",  # group
        "limeGreen",  # success
        "red",  # error
        "orange",  # warning
        "skyBlue",  # label
        "magenta",  # syntax highlighting key words and symbols
        "yellow",  # syntax highlighting strings
    ]

    def __getattr__(self, name):
        attr = getattr(_cf, name)
        if callable(attr) and name not in _ColorfulProxy._proxy_allowlist:
            raise ValueError(
                "Usage of the colorful method '" + name + "' is forbidden "
                "by the proxy to keep a consistent color scheme. "
                "Check `cli_logger.py` for allowed methods"
            )
        return attr
109
+
110
+
111
# Module-level singleton used for all styled output in this file.
cf = _ColorfulProxy()

# NOTE(review): strip=False presumably keeps ANSI escape sequences intact
# when output is redirected — confirm against colorama's init() semantics.
colorama.init(strip=False)
114
+
115
+
116
def _external_caller_info():
    """Get the info from the caller frame.

    Used to override the logging function and line number with the correct
    ones. See the comment on _patched_makeRecord for more info.

    Returns:
        Dict with the first out-of-this-file caller's "lineno" and
        base "filename".
    """

    frame = inspect.currentframe()
    caller = frame
    levels = 0
    # Walk outward until we leave this module's frames.
    # NOTE(review): if every frame on the stack were from this file,
    # caller.f_back would eventually be None and this would raise — in
    # practice there is always an external caller. `levels` is computed but
    # unused.
    while caller.f_code.co_filename == __file__:
        caller = caller.f_back
        levels += 1
    return {
        "lineno": caller.f_lineno,
        "filename": os.path.basename(caller.f_code.co_filename),
    }
133
+
134
+
135
def _format_msg(
    msg: str,
    *args: Any,
    no_format: bool = None,
    _tags: Dict[str, Any] = None,
    _numbered: Tuple[str, int, int] = None,
    **kwargs: Any,
):
    """Formats a message for printing.

    Renders `msg` using the built-in `str.format` and the passed-in
    `*args` and `**kwargs`.

    Args:
        *args (Any): `.format` arguments for `msg`.
        no_format (bool):
            If `no_format` is `True`,
            `.format` will not be called on the message.

            Useful if the output is user-provided or may otherwise
            contain an unexpected formatting string (e.g. "{}").
        _tags (Dict[str, Any]):
            key-value pairs to display at the end of
            the message in square brackets.

            If a tag is set to `True`, it is printed without the value,
            the presence of the tag treated as a "flag".

            E.g. `_format_msg("hello", _tags=dict(from=mom, signed=True))`
            `hello [from=Mom, signed]`
    _numbered (Tuple[str, int, int]):
            `(brackets, i, n)`

            The `brackets` string is composed of two "bracket" characters,
            `i` is the index, `n` is the total.

            The string `{i}/{n}` surrounded by the "brackets" is
            prepended to the message.

            This is used to number steps in a procedure, with different
            brackets specifying different major tasks.

            E.g. `_format_msg("hello", _numbered=("[]", 0, 5))`
            `[0/5] hello`

    Returns:
        The formatted message.
    """

    if isinstance(msg, str) or isinstance(msg, ColorfulString):
        tags_str = ""
        if _tags is not None:
            tags_list = []
            for k, v in _tags.items():
                # True -> bare flag, False -> omitted, anything else -> k=v.
                if v is True:
                    tags_list += [k]
                    continue
                if v is False:
                    continue

                tags_list += [k + "=" + v]
            if tags_list:
                tags_str = cf.reset(cf.dimmed(" [{}]".format(", ".join(tags_list))))

        numbering_str = ""
        if _numbered is not None:
            chars, i, n = _numbered
            numbering_str = cf.dimmed(chars[0] + str(i) + "/" + str(n) + chars[1]) + " "

        if no_format:
            # todo: throw if given args/kwargs?
            return numbering_str + msg + tags_str
        return numbering_str + msg.format(*args, **kwargs) + tags_str

    # Non-string message: fall back to a comma-joined str() rendering.
    if kwargs:
        raise ValueError("We do not support printing kwargs yet.")

    res = [msg, *args]
    res = [str(x) for x in res]
    return ", ".join(res)
215
+
216
+
217
+ # TODO: come up with a plan to unify logging.
218
+ # formatter = logging.Formatter(
219
+ # # TODO(maximsmol): figure out the required log level padding
220
+ # # width automatically
221
+ # fmt="[{asctime}] {levelname:6} {message}",
222
+ # datefmt="%x %X",
223
+ # # We want alignment on our level names
224
+ # style="{")
225
+
226
+
227
+ def _isatty():
228
+ """More robust check for interactive terminal/tty."""
229
+ try:
230
+ # https://stackoverflow.com/questions/6108330/
231
+ # checking-for-interactive-shell-in-a-python-script
232
+ return sys.__stdin__.isatty()
233
+ except Exception:
234
+ # sometimes this can fail due to closed output
235
+ # either way, no-tty is generally safe fallback.
236
+ return False
237
+
238
+
239
+ class _CliLogger:
240
+ """Singleton class for CLI logging.
241
+
242
+ Without calling 'cli_logger.configure', the CLILogger will default
243
+ to 'record' style logging.
244
+
245
+ Attributes:
246
+ color_mode (str):
247
+ Can be "true", "false", or "auto".
248
+
249
+ Enables or disables `colorful`.
250
+
251
+ If `color_mode` is "auto", is set to `not stdout.isatty()`
252
+ indent_level (int):
253
+ The current indentation level.
254
+
255
+ All messages will be indented by prepending `" " * indent_level`
256
+ vebosity (int):
257
+ Output verbosity.
258
+
259
+ Low verbosity will disable `verbose` and `very_verbose` messages.
260
+ """
261
+
262
+ color_mode: str
263
+ # color_mode: Union[Literal["auto"], Literal["false"], Literal["true"]]
264
+ indent_level: int
265
+ interactive: bool
266
+ VALID_LOG_STYLES = ("auto", "record", "pretty")
267
+
268
+ _autodetected_cf_colormode: int
269
+
270
    def __init__(self):
        # Current output indentation level (see `indented`).
        self.indent_level = 0

        # Defaults: quiet, auto color detection, "record"-style (non-pretty)
        # output, not interactive.
        self._verbosity = 0
        self._verbosity_overriden = False
        self._color_mode = "auto"
        self._log_style = "record"
        self.pretty = False
        self.interactive = False

        # store whatever colorful has detected for future use if
        # the color output is toggled (colorful detects # of supported colors,
        # so it has some non-trivial logic to determine this)
        self._autodetected_cf_colormode = cf.colorful.colormode
        self.set_format()
285
+
286
    def set_format(self, format_tmpl=None):
        """Install the logging.Formatter used for "record"-style output.

        Defaults to the autoscaler's LOGGER_FORMAT when no template is given
        (imported lazily to avoid an import cycle).
        """
        if not format_tmpl:
            from ray.autoscaler._private.constants import LOGGER_FORMAT

            format_tmpl = LOGGER_FORMAT
        self._formatter = logging.Formatter(format_tmpl)
292
+
293
    def configure(self, log_style=None, color_mode=None, verbosity=None):
        """Configures the logger according to values.

        Each argument is optional; only the ones provided are applied, and
        color detection is re-run at the end either way.
        """
        if log_style is not None:
            self._set_log_style(log_style)

        if color_mode is not None:
            self._set_color_mode(color_mode)

        if verbosity is not None:
            self._set_verbosity(verbosity)

        self.detect_colors()
305
+
306
    @property
    def log_style(self):
        # Current style: "auto", "record", or "pretty".
        return self._log_style

    def _set_log_style(self, x):
        """Configures interactivity and formatting."""
        self._log_style = x.lower()
        self.interactive = _isatty()

        if self._log_style == "auto":
            # Pretty output only when attached to a terminal.
            self.pretty = _isatty()
        elif self._log_style == "record":
            # Record style: plain log records, colors forced off.
            self.pretty = False
            self._set_color_mode("false")
        elif self._log_style == "pretty":
            self.pretty = True

    @property
    def color_mode(self):
        # "true", "false", or "auto".
        return self._color_mode

    def _set_color_mode(self, x):
        self._color_mode = x.lower()
        self.detect_colors()

    @property
    def verbosity(self):
        # Non-pretty (record) output shows everything unless the user
        # explicitly overrode the verbosity.
        if self._verbosity_overriden:
            return self._verbosity
        elif not self.pretty:
            return 999
        return self._verbosity

    def _set_verbosity(self, x):
        self._verbosity = x
        self._verbosity_overriden = True
342
+
343
    def detect_colors(self):
        """Update color output settings.

        Parse the `color_mode` string and optionally disable or force-enable
        color output
        (8-color ANSI if no terminal detected to be safe) in colorful.

        Raises:
            ValueError: for any color_mode other than "true"/"false"/"auto".
        """
        if self.color_mode == "true":
            if self._autodetected_cf_colormode != cf.NO_COLORS:
                cf.colormode = self._autodetected_cf_colormode
            else:
                cf.colormode = cf.ANSI_8_COLORS
            return
        if self.color_mode == "false":
            cf.disable()
            return
        if self.color_mode == "auto":
            # colorful autodetects tty settings
            return

        raise ValueError("Invalid log color setting: " + self.color_mode)
364
+
365
    def newline(self):
        """Print a line feed."""
        self.print("")

    def _print(
        self,
        msg: str,
        _level_str: str = "INFO",
        _linefeed: bool = True,
        end: str = None,
    ):
        """Proxy for printing messages.

        In pretty mode the message is indented; otherwise it is rendered as a
        standard log record (empty messages are dropped). WARNING/ERROR/PANIC
        go to stderr, everything else to stdout.

        Args:
            msg: Message to print.
            _level_str: Level name stamped onto record-style output.
            _linefeed (bool):
                If `_linefeed` is `False` no linefeed is printed at the
                end of the message.
            end: Passed through to `print` as its `end` argument.
        """
        if self.pretty:
            rendered_message = " " * self.indent_level + msg
        else:
            if msg.strip() == "":
                return
            caller_info = _external_caller_info()
            record = logging.LogRecord(
                name="cli",
                # We override the level name later
                # TODO(maximsmol): give approximate level #s to our log levels
                level=0,
                # The user-facing logs do not need this information anyway
                # and it would be very tedious to extract since _print
                # can be at varying depths in the call stack
                # TODO(maximsmol): do it anyway to be extra
                pathname=caller_info["filename"],
                lineno=caller_info["lineno"],
                msg=msg,
                args={},
                # No exception
                exc_info=None,
            )
            record.levelname = _level_str
            rendered_message = self._formatter.format(record)

        # We aren't using standard python logging convention, so we hardcode
        # the log levels for now.
        if _level_str in ["WARNING", "ERROR", "PANIC"]:
            stream = sys.stderr
        else:
            stream = sys.stdout

        if not _linefeed:
            stream.write(rendered_message)
            stream.flush()
            return

        kwargs = {"end": end}
        print(rendered_message, file=stream, **kwargs)
423
+
424
+ def indented(self):
425
+ """Context manager that starts an indented block of output."""
426
+ cli_logger = self
427
+
428
+ class IndentedContextManager:
429
+ def __enter__(self):
430
+ cli_logger.indent_level += 1
431
+
432
+ def __exit__(self, type, value, tb):
433
+ cli_logger.indent_level -= 1
434
+
435
+ return IndentedContextManager()
436
+
437
    def group(self, msg: str, *args: Any, **kwargs: Any):
        """Print a group title in a special color and start an indented block.

        For arguments, see `_format_msg`.
        """
        self.print(cf.dodgerBlue(msg), *args, **kwargs)

        return self.indented()

    def verbatim_error_ctx(self, msg: str, *args: Any, **kwargs: Any):
        """Context manager for printing multi-line error messages.

        Displays a start sequence "!!! {optional message}"
        and a matching end sequence "!!!".

        The string "!!!" can be used as a "tombstone" for searching.

        For arguments, see `_format_msg`.
        """
        cli_logger = self

        # NOTE: class name keeps its historical (misspelled) identifier.
        class VerbatimErorContextManager:
            def __enter__(self):
                cli_logger.error(cf.bold("!!! ") + "{}", msg, *args, **kwargs)

            def __exit__(self, type, value, tb):
                cli_logger.error(cf.bold("!!!"))

        return VerbatimErorContextManager()

    def labeled_value(self, key: str, msg: str, *args: Any, **kwargs: Any):
        """Displays a key-value pair with special formatting.

        Args:
            key: Label that is prepended to the message.

        For other arguments, see `_format_msg`.
        """
        self._print(cf.skyBlue(key) + ": " + _format_msg(cf.bold(msg), *args, **kwargs))
476
+
477
    def verbose(self, msg: str, *args: Any, **kwargs: Any):
        """Prints a message if verbosity is not 0.

        Emitted with the "VINFO" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self.print(msg, *args, _level_str="VINFO", **kwargs)

    def verbose_warning(self, msg, *args, **kwargs):
        """Prints a formatted warning if verbosity is not 0.

        Emitted with the "VWARN" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self._warning(msg, *args, _level_str="VWARN", **kwargs)

    def verbose_error(self, msg: str, *args: Any, **kwargs: Any):
        """Logs an error if verbosity is not 0.

        Emitted with the "VERR" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self._error(msg, *args, _level_str="VERR", **kwargs)

    def very_verbose(self, msg: str, *args: Any, **kwargs: Any):
        """Prints if verbosity is > 1.

        Emitted with the "VVINFO" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 1:
            self.print(msg, *args, _level_str="VVINFO", **kwargs)
508
+
509
+ def success(self, msg: str, *args: Any, **kwargs: Any):
510
+ """Prints a formatted success message.
511
+
512
+ For arguments, see `_format_msg`.
513
+ """
514
+ self.print(cf.limeGreen(msg), *args, _level_str="SUCC", **kwargs)
515
+
516
+ def _warning(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
517
+ """Prints a formatted warning message.
518
+
519
+ For arguments, see `_format_msg`.
520
+ """
521
+ if _level_str is None:
522
+ raise ValueError("Log level not set.")
523
+ self.print(cf.orange(msg), *args, _level_str=_level_str, **kwargs)
524
+
525
+ def warning(self, *args, **kwargs):
526
+ self._warning(*args, _level_str="WARN", **kwargs)
527
+
528
+ def _error(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
529
+ """Prints a formatted error message.
530
+
531
+ For arguments, see `_format_msg`.
532
+ """
533
+ if _level_str is None:
534
+ raise ValueError("Log level not set.")
535
+ self.print(cf.red(msg), *args, _level_str=_level_str, **kwargs)
536
+
537
    def error(self, *args, **kwargs):
        """Print a formatted error at the standard ERR level."""
        self._error(*args, _level_str="ERR", **kwargs)
539
+
540
    def panic(self, *args, **kwargs):
        """Print a formatted error at the PANIC level (used by `abort`)."""
        self._error(*args, _level_str="PANIC", **kwargs)
542
+
543
+ # Fine to expose _level_str here, since this is a general log function.
544
+ def print(
545
+ self,
546
+ msg: str,
547
+ *args: Any,
548
+ _level_str: str = "INFO",
549
+ end: str = None,
550
+ **kwargs: Any,
551
+ ):
552
+ """Prints a message.
553
+
554
+ For arguments, see `_format_msg`.
555
+ """
556
+ self._print(_format_msg(msg, *args, **kwargs), _level_str=_level_str, end=end)
557
+
558
    def info(self, msg: str, no_format=True, *args, **kwargs):
        """Print an info message, skipping `_format_msg` styling by default.

        NOTE(review): `no_format` is passed as a keyword before *args; extra
        positional arguments are still forwarded after it. Confirm this
        argument ordering is intentional before relying on positional use.
        """
        self.print(msg, no_format=no_format, *args, **kwargs)
560
+
561
+ def abort(
562
+ self, msg: Optional[str] = None, *args: Any, exc: Any = None, **kwargs: Any
563
+ ):
564
+ """Prints an error and aborts execution.
565
+
566
+ Print an error and throw an exception to terminate the program
567
+ (the exception will not print a message).
568
+ """
569
+ if msg is not None:
570
+ self._error(msg, *args, _level_str="PANIC", **kwargs)
571
+
572
+ if exc is not None:
573
+ raise exc
574
+
575
+ exc_cls = click.ClickException
576
+ if self.pretty:
577
+ exc_cls = SilentClickException
578
+
579
+ if msg is None:
580
+ msg = "Exiting due to cli_logger.abort()"
581
+ raise exc_cls(msg)
582
+
583
+ def doassert(self, val: bool, msg: str, *args: Any, **kwargs: Any):
584
+ """Handle assertion without throwing a scary exception.
585
+
586
+ Args:
587
+ val: Value to check.
588
+
589
+ For other arguments, see `_format_msg`.
590
+ """
591
+ if not val:
592
+ exc = None
593
+ if not self.pretty:
594
+ exc = AssertionError()
595
+
596
+ # TODO(maximsmol): rework asserts so that we get the expression
597
+ # that triggered the assert
598
+ # to do this, install a global try-catch
599
+ # for AssertionError and raise them normally
600
+ self.abort(msg, *args, exc=exc, **kwargs)
601
+
602
+ def render_list(self, xs: List[str], separator: str = cf.reset(", ")):
603
+ """Render a list of bolded values using a non-bolded separator."""
604
+ return separator.join([str(cf.bold(x)) for x in xs])
605
+
606
    def confirm(
        self,
        yes: bool,
        msg: str,
        *args: Any,
        _abort: bool = False,
        _default: bool = False,
        _timeout_s: Optional[float] = None,
        **kwargs: Any,
    ):
        """Display a confirmation dialog.

        Valid answers are "y/yes/true/1" and "n/no/false/0".

        Args:
            yes: If `yes` is `True` the dialog will default to "yes"
                and continue without waiting for user input.
            _abort (bool):
                If `_abort` is `True`,
                "no" means aborting the program.
            _default (bool):
                The default action to take if the user just presses enter
                with no input.
            _timeout_s (float):
                If user has no input within _timeout_s seconds, the default
                action is taken. None means no timeout.
        """
        should_abort = _abort
        default = _default

        # Refuse to block on stdin when not attached to an interactive
        # session and --yes was not given.
        if not self.interactive and not yes:
            # no formatting around --yes here since this is non-interactive
            self.error(
                "This command requires user confirmation. "
                "When running non-interactively, supply --yes to skip."
            )
            raise ValueError("Non-interactive confirm without --yes.")

        if default:
            yn_str = "Y/n"
        else:
            yn_str = "y/N"

        confirm_str = cf.underlined("Confirm [" + yn_str + "]:") + " "

        rendered_message = _format_msg(msg, *args, **kwargs)
        # the rendered message ends with ascii coding
        if rendered_message and not msg.endswith("\n"):
            rendered_message += " "

        # Width of the last rendered line, used to indent re-prompts so they
        # line up under the original question.
        msg_len = len(rendered_message.split("\n")[-1])
        complete_str = rendered_message + confirm_str

        # --yes short-circuits the dialog entirely.
        if yes:
            self._print(complete_str + "y " + cf.dimmed("[automatic, due to --yes]"))
            return True

        self._print(complete_str, _linefeed=False)

        res = None
        yes_answers = ["y", "yes", "true", "1"]
        no_answers = ["n", "no", "false", "0"]
        try:
            # Loop until a recognizable answer (or plain enter) is received.
            while True:
                if _timeout_s is None:
                    ans = sys.stdin.readline()
                elif sys.platform == "win32":
                    # Windows doesn't support select
                    # Poll the keyboard via msvcrt, echoing characters by
                    # hand, until enter is pressed or the timeout elapses.
                    start_time = time.time()
                    ans = ""
                    while True:
                        if (time.time() - start_time) >= _timeout_s:
                            self.newline()
                            ans = "\n"
                            break
                        elif msvcrt.kbhit():
                            ch = msvcrt.getwch()
                            if ch in ("\n", "\r"):
                                self.newline()
                                ans = ans + "\n"
                                break
                            elif ch == "\b":
                                if ans:
                                    ans = ans[:-1]
                                    # Emulate backspace erasing
                                    print("\b \b", end="", flush=True)
                            else:
                                ans = ans + ch
                                print(ch, end="", flush=True)
                        else:
                            time.sleep(0.1)
                else:
                    # POSIX: wait on stdin with select; an empty ready list
                    # means the timeout fired, which counts as plain enter.
                    ready, _, _ = select.select([sys.stdin], [], [], _timeout_s)
                    if not ready:
                        self.newline()
                        ans = "\n"
                    else:
                        ans = sys.stdin.readline()

                ans = ans.lower()

                # Plain enter (or a timeout) selects the default answer.
                if ans == "\n":
                    res = default
                    break

                ans = ans.strip()
                if ans in yes_answers:
                    res = True
                    break
                if ans in no_answers:
                    res = False
                    break

                # Unrecognized input: explain and re-prompt, aligned under
                # the original question.
                indent = " " * msg_len
                self.error(
                    "{}Invalid answer: {}. Expected {} or {}",
                    indent,
                    cf.bold(ans.strip()),
                    self.render_list(yes_answers, "/"),
                    self.render_list(no_answers, "/"),
                )
                self._print(indent + confirm_str, _linefeed=False)
        except KeyboardInterrupt:
            # Ctrl-C counts as choosing the default.
            self.newline()
            res = default

        if not res and should_abort:
            # todo: make sure we tell the user if they
            # need to do cleanup
            self._print("Exiting...")
            raise SilentClickException(
                "Exiting due to the response to confirm(should_abort=True)."
            )

        return res
741
+
742
+ def prompt(self, msg: str, *args, **kwargs):
743
+ """Prompt the user for some text input.
744
+
745
+ Args:
746
+ msg: The mesage to display to the user before the prompt.
747
+
748
+ Returns:
749
+ The string entered by the user.
750
+ """
751
+ complete_str = cf.underlined(msg)
752
+ rendered_message = _format_msg(complete_str, *args, **kwargs)
753
+ # the rendered message ends with ascii coding
754
+ if rendered_message and not msg.endswith("\n"):
755
+ rendered_message += " "
756
+ self._print(rendered_message, linefeed=False)
757
+
758
+ res = ""
759
+ try:
760
+ ans = sys.stdin.readline()
761
+ ans = ans.lower()
762
+ res = ans.strip()
763
+ except KeyboardInterrupt:
764
+ self.newline()
765
+
766
+ return res
767
+
768
+ def flush(self):
769
+ sys.stdout.flush()
770
+ sys.stderr.flush()
771
+
772
+
773
class SilentClickException(click.ClickException):
    """A `ClickException` that suppresses its own output.

    Some of our tooling relies on catching ClickException in particular.

    However the default prints a message, which is undesirable since we expect
    our code to log errors manually using `cli_logger.error()` to allow for
    colors and other formatting.
    """

    def __init__(self, message: str):
        super().__init__(message)

    def show(self, file=None):
        # Intentionally a no-op: the caller already logged the error.
        pass
788
+
789
+
790
# Shared logger instance used throughout the autoscaler CLI.
cli_logger = _CliLogger()

# Click options applied to every CLI command via `add_click_logging_options`.
CLICK_LOGGING_OPTIONS = [
    click.option(
        "--log-style",
        required=False,
        type=click.Choice(cli_logger.VALID_LOG_STYLES, case_sensitive=False),
        default="auto",
        help=(
            "If 'pretty', outputs with formatting and color. If 'record', "
            "outputs record-style without formatting. "
            "'auto' defaults to 'pretty', and disables pretty logging "
            "if stdin is *not* a TTY."
        ),
    ),
    click.option(
        "--log-color",
        required=False,
        type=click.Choice(["auto", "false", "true"], case_sensitive=False),
        default="auto",
        help=("Use color logging. Auto enables color logging if stdout is a TTY."),
    ),
    # Repeatable -v flag; `None` lets `configure` keep the current verbosity.
    click.option("-v", "--verbose", default=None, count=True),
]
814
+
815
+
816
def add_click_logging_options(f: Callable) -> Callable:
    """Decorate a click command with the shared logging options.

    Applies every option in `CLICK_LOGGING_OPTIONS` to `f`, then wraps it so
    the logging flags are consumed to configure `cli_logger` rather than
    forwarded to the command itself.
    """
    decorated = f
    for option in reversed(CLICK_LOGGING_OPTIONS):
        decorated = option(decorated)

    @wraps(decorated)
    def wrapper(*args, log_style=None, log_color=None, verbose=None, **kwargs):
        cli_logger.configure(log_style, log_color, verbose)
        return decorated(*args, **kwargs)

    return wrapper
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python

# This is an executable script that runs an example of every single CliLogger
# function for demonstration purposes. Primarily useful for tuning color and
# other formatting.

from ray.autoscaler._private.cli_logger import cf, cli_logger

cli_logger.configure(log_style="auto", verbosity=999)

cli_logger.print(cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
with cli_logger.group("Group"):
    cli_logger.print("Group contents")
# Fixed typo in the displayed label: "Verbtaim" -> "Verbatim".
with cli_logger.verbatim_error_ctx("Verbatim error"):
    cli_logger.print("Error contents")
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import subprocess
4
+ import sys
5
+ import tarfile
6
+ import tempfile
7
+ import threading
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from contextlib import contextmanager
10
+ from typing import List, Optional, Sequence, Tuple
11
+
12
+ import yaml
13
+
14
+ import ray # noqa: F401
15
+ from ray.autoscaler._private.cli_logger import cli_logger
16
+ from ray.autoscaler._private.providers import _get_node_provider
17
+ from ray.autoscaler.tags import NODE_KIND_HEAD, NODE_KIND_WORKER, TAG_RAY_NODE_KIND
18
+
19
+ # Import psutil after ray so the packaged version is used.
20
+ import psutil
21
+
22
# Cap on concurrent SSH sessions when collecting data from remote nodes.
MAX_PARALLEL_SSH_WORKERS = 8
# Fallbacks used by `_info_from_params` when no SSH user/key is supplied.
DEFAULT_SSH_USER = "ubuntu"
DEFAULT_SSH_KEYS = ["~/ray_bootstrap_key.pem", "~/.ssh/ray-autoscaler_2_us-west-2.pem"]
25
+
26
+
27
class CommandFailed(RuntimeError):
    """Base error for failures while collecting cluster dump data."""
29
+
30
+
31
class LocalCommandFailed(CommandFailed):
    """A data-collection command failed on the local node."""
33
+
34
+
35
class RemoteCommandFailed(CommandFailed):
    """A data-collection command failed on a remote node."""
37
+
38
+
39
class GetParameters:
    """Settings controlling which data categories a dump collects."""

    def __init__(
        self,
        logs: bool = True,
        debug_state: bool = True,
        pip: bool = True,
        processes: bool = True,
        processes_verbose: bool = True,
        processes_list: Optional[List[Tuple[str, bool]]] = None,
    ):
        # Collect Ray session logs.
        self.logs = logs
        # Collect the `debug_state.txt` file.
        self.debug_state = debug_state
        # Record installed pip packages.
        self.pip = pip
        # Capture information about running Ray processes.
        self.processes = processes
        # Show full process command lines instead of just the executable.
        self.processes_verbose = processes_verbose
        # Optional (keyword, filter_by_command_name) process filters.
        self.processes_list = processes_list
55
+
56
+
57
class Node:
    """Node (as in "machine")"""

    def __init__(
        self,
        host: str,
        ssh_user: str = "ubuntu",
        ssh_key: str = "~/ray_bootstrap_key.pem",
        docker_container: Optional[str] = None,
        is_head: bool = False,
    ):
        # Address used to reach the machine over SSH.
        self.host = host
        self.ssh_user = ssh_user
        self.ssh_key = ssh_key
        # Container name when Ray runs inside docker; None otherwise.
        self.docker_container = docker_container
        # Whether this machine is the cluster head node.
        self.is_head = is_head
73
+
74
+
75
class Archive:
    """Archive object to collect and compress files into a single file.

    Objects of this class can be passed around to different data collection
    functions. These functions can use the :meth:`subdir` method to add
    files to a sub directory of the archive.

    """

    def __init__(self, file: Optional[str] = None):
        # Target tar.gz path; a fresh temp file is created when none given.
        self.file = file or tempfile.mkstemp(prefix="ray_logs_", suffix=".tar.gz")[1]
        self.tar = None
        # Serializes `tar.add` calls from concurrent collection threads.
        self._lock = threading.Lock()

    @property
    def is_open(self):
        """Whether the underlying tarfile is currently open for writing."""
        return bool(self.tar)

    def open(self):
        """Open the target file as a gzip-compressed tar for writing."""
        self.tar = tarfile.open(self.file, "w:gz")

    def close(self):
        """Close the tarfile and mark this archive as closed."""
        self.tar.close()
        self.tar = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    @contextmanager
    def subdir(self, subdir: str, root: Optional[str] = "/"):
        """Open a context to add files to the archive.

        Example:

        .. code-block:: python

            with Archive("file.tar.gz") as archive:
                with archive.subdir("logfiles", root="/tmp/logs") as sd:
                    # Will be added as `logfiles/nested/file.txt`
                    sd.add("/tmp/logs/nested/file.txt")

        Args:
            subdir: Subdir to which to add files to. Calling the
                ``add(path)`` command will place files into the ``subdir``
                directory of the archive.
            root: Root path. Files without an explicit ``arcname``
                will be named relatively to this path.

        Yields:
            A context object that can be used to add files to the archive.
        """
        root = os.path.abspath(root)

        class _Context:
            @staticmethod
            def add(path: str, arcname: Optional[str] = None):
                path = os.path.abspath(path)
                arcname = arcname or os.path.join(subdir, os.path.relpath(path, root))

                # Fix: hold the lock with a context manager so it is released
                # even when `tar.add` raises. The previous bare
                # acquire()/release() pair left the lock held forever on
                # error, deadlocking the other collection threads.
                with self._lock:
                    self.tar.add(path, arcname=arcname)

        yield _Context()
143
+
144
+
145
+ ###
146
+ # Functions to gather logs and information on the local node
147
+ ###
148
+
149
+
150
def get_local_ray_logs(
    archive: Archive,
    exclude: Optional[Sequence[str]] = None,
    session_log_dir: str = "/tmp/ray/session_latest",
) -> Archive:
    """Copy local Ray log files into an archive.

    Args:
        archive: Archive object to add log files to.
        exclude: Regex patterns; files whose session-relative path matches
            any pattern are not included in the archive.
        session_log_dir: Path to the Ray session directory; logs are read
            from its ``logs`` subdirectory. Defaults to
            ``/tmp/ray/session_latest``.

    Returns:
        Open archive object.

    """
    if not archive.is_open:
        archive.open()

    patterns = exclude or []

    logs_dir = os.path.join(os.path.expanduser(session_log_dir), "logs")

    with archive.subdir("logs", root=logs_dir) as sd:
        for dirpath, _, filenames in os.walk(logs_dir):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                rel_path = os.path.relpath(full_path, start=logs_dir)
                # Skip files matching any exclude pattern.
                excluded = any(re.match(pattern, rel_path) for pattern in patterns)
                if not excluded:
                    sd.add(full_path)

    return archive
186
+
187
+
188
def get_local_debug_state(
    archive: Archive, session_dir: str = "/tmp/ray/session_latest"
) -> Archive:
    """Copy the local `debug_state.txt` file into an archive.

    Args:
        archive: Archive object to add the file to.
        session_dir: Path to the Ray session files. Defaults to
            ``/tmp/ray/session_latest``

    Returns:
        Open archive object.

    Raises:
        LocalCommandFailed: If the session has no `debug_state.txt`.
    """
    if not archive.is_open:
        archive.open()

    base_dir = os.path.expanduser(session_dir)
    debug_state_file = os.path.join(base_dir, "logs/debug_state.txt")

    if not os.path.exists(debug_state_file):
        raise LocalCommandFailed("No `debug_state.txt` file found.")

    with archive.subdir("", root=base_dir) as sd:
        sd.add(debug_state_file)

    return archive
215
+
216
+
217
def get_local_pip_packages(archive: Archive):
    """Write the currently installed pip packages into the archive.

    Args:
        archive: Archive object to add the package list to.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze

    # Dump the frozen requirements to a temp file, then archive it under the
    # name `pip_packages.txt`.
    with tempfile.NamedTemporaryFile("wt") as fp:
        for requirement in freeze.freeze():
            fp.writelines([requirement, "\n"])

        fp.flush()
        with archive.subdir("") as sd:
            sd.add(fp.name, "pip_packages.txt")

    return archive
243
+
244
+
245
def get_local_ray_processes(
    archive: Archive,
    processes: Optional[List[Tuple[str, bool]]] = None,
    verbose: bool = False,
):
    """Get the status of all the relevant ray processes.
    Args:
        archive: Archive object to add process info files to.
        processes: List of processes to get information on. The first
            element of the tuple is a string to filter by, and the second
            element is a boolean indicating if we should filter by command
            name (True) or command line including parameters (False)
        verbose: If True, show entire executable command line.
            If False, show just the first term.
    Returns:
        Open archive object.
    """
    if not processes:
        # local import to avoid circular dependencies
        from ray.autoscaler._private.constants import RAY_PROCESSES

        processes = RAY_PROCESSES

    # First pass: snapshot every process on the machine as
    # (summary_dict, raw_cmdline) pairs.
    process_infos = []
    for process in psutil.process_iter(["pid", "name", "cmdline", "status"]):
        try:
            with process.oneshot():
                cmdline = " ".join(process.cmdline())
                process_infos.append(
                    (
                        {
                            # Non-verbose mode keeps only the text before the
                            # first "--" flag, minus the trailing space.
                            "executable": cmdline
                            if verbose
                            else cmdline.split("--", 1)[0][:-1],
                            "name": process.name(),
                            "pid": process.pid,
                            "status": process.status(),
                        },
                        process.cmdline(),
                    )
                )
        except Exception as exc:
            # NOTE(review): any psutil failure (e.g. a process exiting
            # mid-iteration) aborts the whole collection -- confirm this
            # all-or-nothing behavior is intended.
            raise LocalCommandFailed(exc) from exc

    # Second pass: keep only processes matching one of the filters, keyed by
    # pid so each process appears at most once.
    relevant_processes = {}
    for process_dict, cmdline in process_infos:
        for keyword, filter_by_cmd in processes:
            if filter_by_cmd:
                corpus = process_dict["name"]
            else:
                corpus = subprocess.list2cmdline(cmdline)
            if keyword in corpus and process_dict["pid"] not in relevant_processes:
                relevant_processes[process_dict["pid"]] = process_dict

    # Serialize the surviving entries as YAML into a temp file and archive it.
    with tempfile.NamedTemporaryFile("wt") as fp:
        for line in relevant_processes.values():
            fp.writelines([yaml.dump(line), "\n"])

        fp.flush()
        with archive.subdir("meta") as sd:
            sd.add(fp.name, "process_info.txt")

    return archive
308
+
309
+
310
def get_all_local_data(archive: Archive, parameters: GetParameters):
    """Collect all requested local data into the archive.

    Gets (subject to `parameters`):
        - The Ray logs of the latest session
        - The `debug_state.txt` file
        - The currently installed pip packages
        - Information on running Ray processes

    Args:
        archive: Archive object to add meta files to.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    # Build the list of enabled collectors in the canonical order.
    collectors = []
    if parameters.logs:
        collectors.append(lambda: get_local_ray_logs(archive=archive))
    if parameters.debug_state:
        collectors.append(lambda: get_local_debug_state(archive=archive))
    if parameters.pip:
        collectors.append(lambda: get_local_pip_packages(archive=archive))
    if parameters.processes:
        collectors.append(
            lambda: get_local_ray_processes(
                archive=archive,
                processes=parameters.processes_list,
                verbose=parameters.processes_verbose,
            )
        )

    # Each collector is best-effort: a failure is logged and the remaining
    # collectors still run.
    for collect in collectors:
        try:
            collect()
        except LocalCommandFailed as exc:
            cli_logger.error(exc)

    return archive
353
+
354
+
355
+ ###
356
+ # Functions to invoke remote scripts and gather data from remote nodes
357
+ ###
358
+
359
+
360
+ def _wrap(items: List[str], quotes="'"):
361
+ return f"{quotes}{' '.join(items)}{quotes}"
362
+
363
+
364
def create_and_get_archive_from_remote_node(
    remote_node: Node, parameters: GetParameters, script_path: str = "ray"
) -> Optional[str]:
    """Create an archive containing logs on a remote node and transfer.

    This will call ``ray local-dump --stream`` on the remote
    node. The resulting file will be saved locally in a temporary file and
    returned.

    Args:
        remote_node: Remote node to gather archive from.
        script_path: Path to this script on the remote node.
        parameters: Parameters (settings) for getting data.

    Returns:
        Path to a temporary file containing the node's collected data.

    Raises:
        RemoteCommandFailed: If the remote SSH command exits non-zero.
    """
    # Base SSH invocation; host key checking is disabled since autoscaler
    # nodes are ephemeral.
    cmd = [
        "ssh",
        "-o StrictHostKeyChecking=no",
        "-o UserKnownHostsFile=/dev/null",
        "-o LogLevel=ERROR",
        "-i",
        remote_node.ssh_key,
        f"{remote_node.ssh_user}@{remote_node.host}",
    ]

    # When Ray runs inside docker, execute the dump inside the container.
    if remote_node.docker_container:
        cmd += [
            "docker",
            "exec",
            remote_node.docker_container,
        ]

    # Translate the GetParameters flags into `ray local-dump` CLI flags.
    collect_cmd = [script_path, "local-dump", "--stream"]
    collect_cmd += ["--logs"] if parameters.logs else ["--no-logs"]
    collect_cmd += ["--debug-state"] if parameters.debug_state else ["--no-debug-state"]
    collect_cmd += ["--pip"] if parameters.pip else ["--no-pip"]
    collect_cmd += ["--processes"] if parameters.processes else ["--no-processes"]
    if parameters.processes:
        # NOTE(review): "--no-proccesses-verbose" looks misspelled (double
        # "c"); verify against the `ray local-dump` CLI flag definition
        # before changing, in case the receiving side matches this spelling.
        collect_cmd += (
            ["--processes-verbose"]
            if parameters.processes_verbose
            else ["--no-proccesses-verbose"]
        )

    cmd += ["/bin/bash", "-c", _wrap(collect_cmd, quotes='"')]

    cat = "node" if not remote_node.is_head else "head"

    cli_logger.print(f"Collecting data from remote node: {remote_node.host}")
    # The streamed tarball is written to a local temp file and returned.
    tmp = tempfile.mkstemp(prefix=f"ray_{cat}_{remote_node.host}_", suffix=".tar.gz")[1]
    with open(tmp, "wb") as fp:
        try:
            subprocess.check_call(cmd, stdout=fp, stderr=sys.stderr)
        except subprocess.CalledProcessError as exc:
            raise RemoteCommandFailed(
                f"Gathering logs from remote node failed: {' '.join(cmd)}"
            ) from exc

    return tmp
426
+
427
+
428
def create_and_add_remote_data_to_local_archive(
    archive: Archive, remote_node: Node, parameters: GetParameters
):
    """Fetch a remote node's dump and embed it in the local archive.

    Args:
        archive: Archive object to add remote data to.
        remote_node: Remote node to gather archive from.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    fetched = create_and_get_archive_from_remote_node(remote_node, parameters)

    if not archive.is_open:
        archive.open()

    node_kind = "head" if remote_node.is_head else "node"

    with archive.subdir("", root=os.path.dirname(fetched)) as sd:
        sd.add(fetched, arcname=f"ray_{node_kind}_{remote_node.host}.tar.gz")

    return archive
452
+
453
+
454
def create_and_add_local_data_to_local_archive(
    archive: Archive, parameters: GetParameters
):
    """Collect this node's data into a nested archive and embed it.

    Args:
        archive: Archive object to add local data to.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    with Archive() as local_data_archive:
        get_all_local_data(local_data_archive, parameters)

    if not archive.is_open:
        archive.open()

    nested_file = local_data_archive.file
    with archive.subdir("", root=os.path.dirname(nested_file)) as sd:
        sd.add(nested_file, arcname="local_node.tar.gz")

    # The nested temporary archive is no longer needed once embedded.
    os.remove(nested_file)

    return archive
478
+
479
+
480
def create_archive_for_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the remote nodes.

    This will parallelize calls to get data from remote nodes. Per-node
    failures are logged and do not prevent other nodes from being collected.

    Args:
        archive: Archive object to add remote data to.
        remote_nodes (Sequence[Node]): Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.

    """
    if not archive.is_open:
        archive.open()

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SSH_WORKERS) as executor:
        futures = [
            executor.submit(
                create_and_add_remote_data_to_local_archive,
                archive=archive,
                remote_node=remote_node,
                parameters=parameters,
            )
            for remote_node in remote_nodes
        ]

    # Fix: surface per-node failures. The futures were previously discarded,
    # so any exception raised in a worker was silently swallowed; this logs
    # them the same way the local collection path logs CommandFailed.
    for future in futures:
        exc = future.exception()
        if exc is not None:
            cli_logger.error(exc)

    return archive
509
+
510
+
511
def create_archive_for_local_and_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the local and remote nodes.

    This will parallelize calls to get data from remote nodes.

    Args:
        archive: Archive object to add data to.
        remote_nodes: Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    # Local collection is best-effort: log the failure and continue with
    # the remote nodes.
    try:
        create_and_add_local_data_to_local_archive(archive, parameters)
    except CommandFailed as exc:
        cli_logger.error(exc)

    create_archive_for_remote_nodes(archive, remote_nodes, parameters)

    cli_logger.print(
        f"Collected data from local node and {len(remote_nodes)} " f"remote nodes."
    )
    return archive
541
+
542
+
543
+ ###
544
+ # Ray cluster info
545
+ ###
546
def get_info_from_ray_cluster_config(
    cluster_config: str,
) -> Tuple[List[str], str, str, Optional[str], Optional[str]]:
    """Get information from Ray cluster config.

    Return list of host IPs, ssh user, ssh key file, and optional docker
    container.

    Args:
        cluster_config: Path to ray cluster config.

    Returns:
        Tuple of list of host IPs, ssh user name, ssh key file path,
        optional docker container name, optional cluster name.
    """
    from ray.autoscaler._private.commands import _bootstrap_config

    cli_logger.print(
        f"Retrieving cluster information from ray cluster file: " f"{cluster_config}"
    )

    cluster_config = os.path.expanduser(cluster_config)

    # Fix: use a context manager so the config file handle is closed
    # deterministically (the previous `open(...).read()` relied on GC).
    with open(cluster_config) as config_file:
        config = yaml.safe_load(config_file)
    config = _bootstrap_config(config, no_config_cache=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    # Head nodes are listed first so the head's IP precedes the workers'.
    head_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})
    worker_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    hosts = [provider.external_ip(node) for node in head_nodes + worker_nodes]
    ssh_user = config["auth"]["ssh_user"]
    ssh_key = config["auth"]["ssh_private_key"]

    # Docker container name, when the cluster runs commands inside docker.
    docker = None
    docker_config = config.get("docker", None)
    if docker_config:
        docker = docker_config.get("container_name", None)

    cluster_name = config.get("cluster_name", None)

    return hosts, ssh_user, ssh_key, docker, cluster_name
588
+
589
+
590
def _info_from_params(
    cluster: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
):
    """Parse command line arguments.

    Note: This returns a list of hosts, not a comma separated string!
    """
    # With neither a host nor a cluster file given, fall back to the
    # bootstrap config the autoscaler drops on head nodes (if present).
    if not host and not cluster:
        bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml")
        if os.path.exists(bootstrap_config):
            cluster = bootstrap_config
            cli_logger.warning(
                f"Detected cluster config file at {cluster}. "
                f"If this is incorrect, specify with "
                f"`ray cluster-dump <config>`"
            )
    elif cluster:
        cluster = os.path.expanduser(cluster)

    cluster_name = None

    if cluster:
        # Cluster file values only fill in parameters the caller left unset.
        h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster)

        ssh_user = ssh_user or u
        ssh_key = ssh_key or k
        docker = docker or d
        # An explicit --host list overrides the hosts from the cluster file.
        hosts = host.split(",") if host else h

        if not hosts:
            raise LocalCommandFailed(
                f"Invalid cluster file or cluster has no running nodes: " f"{cluster}"
            )
    elif host:
        hosts = host.split(",")
    else:
        # No cluster file was found/given and no host was specified.
        raise LocalCommandFailed(
            "You need to either specify a `<cluster_config>` or `--host`."
        )

    # Fall back to the default SSH user when none was resolved above.
    if not ssh_user:
        ssh_user = DEFAULT_SSH_USER
        cli_logger.warning(
            f"Using default SSH user `{ssh_user}`. "
            f"If this is incorrect, specify with `--ssh-user <user>`"
        )

    # Probe the well-known key locations; the first existing file wins.
    if not ssh_key:
        for cand_key in DEFAULT_SSH_KEYS:
            cand_key_file = os.path.expanduser(cand_key)
            if os.path.exists(cand_key_file):
                ssh_key = cand_key_file
                cli_logger.warning(
                    f"Auto detected SSH key file: {ssh_key}. "
                    f"If this is incorrect, specify with `--ssh-key <key>`"
                )
                break

    return cluster, hosts, ssh_user, ssh_key, docker, cluster_name
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py ADDED
@@ -0,0 +1,921 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from getpass import getuser
9
+ from shlex import quote
10
+ from typing import Dict, List
11
+
12
+ import click
13
+
14
+ from ray._private.ray_constants import DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES
15
+ from ray.autoscaler._private.cli_logger import cf, cli_logger
16
+ from ray.autoscaler._private.constants import (
17
+ AUTOSCALER_NODE_SSH_INTERVAL_S,
18
+ AUTOSCALER_NODE_START_WAIT_S,
19
+ DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
20
+ )
21
+ from ray.autoscaler._private.docker import (
22
+ check_bind_mounts_cmd,
23
+ check_docker_image,
24
+ check_docker_running_cmd,
25
+ docker_start_cmds,
26
+ with_docker_exec,
27
+ )
28
+ from ray.autoscaler._private.log_timer import LogTimer
29
+ from ray.autoscaler._private.subprocess_output_util import (
30
+ ProcessRunnerError,
31
+ is_output_redirected,
32
+ run_cmd_redirected,
33
+ )
34
+ from ray.autoscaler.command_runner import CommandRunnerInterface
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ # How long to wait for a node to start, in seconds
39
+ HASH_MAX_LENGTH = 10
40
+ KUBECTL_RSYNC = os.path.join(
41
+ os.path.dirname(os.path.abspath(__file__)), "_kubernetes/kubectl-rsync.sh"
42
+ )
43
+ MAX_HOME_RETRIES = 3
44
+ HOME_RETRY_DELAY_S = 5
45
+
46
+ _config = {"use_login_shells": True, "silent_rsync": True}
47
+
48
+
49
+ def is_rsync_silent():
50
+ return _config["silent_rsync"]
51
+
52
+
53
+ def set_rsync_silent(val):
54
+ """Choose whether to silence rsync output.
55
+
56
+ Most commands will want to list rsync'd files themselves rather than
57
+ print the default rsync spew.
58
+ """
59
+ _config["silent_rsync"] = val
60
+
61
+
62
+ def is_using_login_shells():
63
+ return _config["use_login_shells"]
64
+
65
+
66
+ def set_using_login_shells(val: bool):
67
+ """Choose between login and non-interactive shells.
68
+
69
+ Non-interactive shells have the benefit of receiving less output from
70
+ subcommands (since progress bars and TTY control codes are not printed).
71
+ Sometimes this can be significant since e.g. `pip install` prints
72
+ hundreds of progress bar lines when downloading.
73
+
74
+ Login shells have the benefit of working very close to how a proper bash
75
+ session does, regarding how scripts execute and how the environment is
76
+ setup. This is also how all commands were ran in the past. The only reason
77
+ to use login shells over non-interactive shells is if you need some weird
78
+ and non-robust tool to work.
79
+
80
+ Args:
81
+ val: If true, login shells will be used to run all commands.
82
+ """
83
+ _config["use_login_shells"] = val
84
+
85
+
86
+ def _with_environment_variables(cmd: str, environment_variables: Dict[str, object]):
87
+ """Prepend environment variables to a shell command.
88
+
89
+ Args:
90
+ cmd: The base command.
91
+ environment_variables (Dict[str, object]): The set of environment
92
+ variables. If an environment variable value is a dict, it will
93
+ automatically be converted to a one line yaml string.
94
+ """
95
+
96
+ as_strings = []
97
+ for key, val in environment_variables.items():
98
+ val = json.dumps(val, separators=(",", ":"))
99
+ s = "export {}={};".format(key, quote(val))
100
+ as_strings.append(s)
101
+ all_vars = "".join(as_strings)
102
+ return all_vars + cmd
103
+
104
+
105
+ def _with_interactive(cmd):
106
+ force_interactive = (
107
+ f"source ~/.bashrc; "
108
+ f"export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({cmd})"
109
+ )
110
+ return ["bash", "--login", "-c", "-i", quote(force_interactive)]
111
+
112
+
113
+ class SSHOptions:
114
+ def __init__(self, ssh_key, control_path=None, **kwargs):
115
+ self.ssh_key = ssh_key
116
+ self.arg_dict = {
117
+ # Supresses initial fingerprint verification.
118
+ "StrictHostKeyChecking": "no",
119
+ # SSH IP and fingerprint pairs no longer added to known_hosts.
120
+ # This is to remove a "REMOTE HOST IDENTIFICATION HAS CHANGED"
121
+ # warning if a new node has the same IP as a previously
122
+ # deleted node, because the fingerprints will not match in
123
+ # that case.
124
+ "UserKnownHostsFile": os.devnull,
125
+ # Try fewer extraneous key pairs.
126
+ "IdentitiesOnly": "yes",
127
+ # Abort if port forwarding fails (instead of just printing to
128
+ # stderr).
129
+ "ExitOnForwardFailure": "yes",
130
+ # Quickly kill the connection if network connection breaks (as
131
+ # opposed to hanging/blocking).
132
+ "ServerAliveInterval": 5,
133
+ "ServerAliveCountMax": 3,
134
+ }
135
+ if control_path:
136
+ self.arg_dict.update(
137
+ {
138
+ "ControlMaster": "auto",
139
+ "ControlPath": "{}/%C".format(control_path),
140
+ "ControlPersist": "10s",
141
+ }
142
+ )
143
+ self.arg_dict.update(kwargs)
144
+
145
+ def to_ssh_options_list(self, *, timeout=60):
146
+ self.arg_dict["ConnectTimeout"] = "{}s".format(timeout)
147
+ ssh_key_option = ["-i", self.ssh_key] if self.ssh_key else []
148
+ return ssh_key_option + [
149
+ x
150
+ for y in (
151
+ ["-o", "{}={}".format(k, v)]
152
+ for k, v in self.arg_dict.items()
153
+ if v is not None
154
+ )
155
+ for x in y
156
+ ]
157
+
158
+
159
+ class SSHCommandRunner(CommandRunnerInterface):
160
+ def __init__(
161
+ self,
162
+ log_prefix,
163
+ node_id,
164
+ provider,
165
+ auth_config,
166
+ cluster_name,
167
+ process_runner,
168
+ use_internal_ip,
169
+ ):
170
+
171
+ ssh_control_hash = hashlib.sha1(cluster_name.encode()).hexdigest()
172
+ ssh_user_hash = hashlib.sha1(getuser().encode()).hexdigest()
173
+ ssh_control_path = "/tmp/ray_ssh_{}/{}".format(
174
+ ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH]
175
+ )
176
+
177
+ self.cluster_name = cluster_name
178
+ self.log_prefix = log_prefix
179
+ self.process_runner = process_runner
180
+ self.node_id = node_id
181
+ self.use_internal_ip = use_internal_ip
182
+ self.provider = provider
183
+ self.ssh_private_key = auth_config.get("ssh_private_key")
184
+ self.ssh_user = auth_config["ssh_user"]
185
+ self.ssh_control_path = ssh_control_path
186
+ self.ssh_ip = None
187
+ self.ssh_proxy_command = auth_config.get("ssh_proxy_command", None)
188
+ self.ssh_options = SSHOptions(
189
+ self.ssh_private_key,
190
+ self.ssh_control_path,
191
+ ProxyCommand=self.ssh_proxy_command,
192
+ )
193
+
194
+ def _get_node_ip(self):
195
+ if self.use_internal_ip:
196
+ return self.provider.internal_ip(self.node_id)
197
+ else:
198
+ return self.provider.external_ip(self.node_id)
199
+
200
+ def _wait_for_ip(self, deadline):
201
+ # if we have IP do not print waiting info
202
+ ip = self._get_node_ip()
203
+ if ip is not None:
204
+ cli_logger.labeled_value("Fetched IP", ip)
205
+ return ip
206
+
207
+ interval = AUTOSCALER_NODE_SSH_INTERVAL_S
208
+ with cli_logger.group("Waiting for IP"):
209
+ while time.time() < deadline and not self.provider.is_terminated(
210
+ self.node_id
211
+ ):
212
+ ip = self._get_node_ip()
213
+ if ip is not None:
214
+ cli_logger.labeled_value("Received", ip)
215
+ return ip
216
+ cli_logger.print(
217
+ "Not yet available, retrying in {} seconds", cf.bold(str(interval))
218
+ )
219
+ time.sleep(interval)
220
+
221
+ return None
222
+
223
+ def _set_ssh_ip_if_required(self):
224
+ if self.ssh_ip is not None:
225
+ return
226
+
227
+ # We assume that this never changes.
228
+ # I think that's reasonable.
229
+ deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
230
+ with LogTimer(self.log_prefix + "Got IP"):
231
+ ip = self._wait_for_ip(deadline)
232
+
233
+ cli_logger.doassert(ip is not None, "Could not get node IP.") # todo: msg
234
+ assert ip is not None, "Unable to find IP of node"
235
+
236
+ self.ssh_ip = ip
237
+
238
+ # This should run before any SSH commands and therefore ensure that
239
+ # the ControlPath directory exists, allowing SSH to maintain
240
+ # persistent sessions later on.
241
+ try:
242
+ os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
243
+ except OSError as e:
244
+ cli_logger.warning("{}", str(e)) # todo: msg
245
+
246
+ def _run_helper(
247
+ self, final_cmd, with_output=False, exit_on_fail=False, silent=False
248
+ ):
249
+ """Run a command that was already setup with SSH and `bash` settings.
250
+
251
+ Args:
252
+ cmd (List[str]):
253
+ Full command to run. Should include SSH options and other
254
+ processing that we do.
255
+ with_output (bool):
256
+ If `with_output` is `True`, command stdout will be captured and
257
+ returned.
258
+ exit_on_fail (bool):
259
+ If `exit_on_fail` is `True`, the process will exit
260
+ if the command fails (exits with a code other than 0).
261
+
262
+ Raises:
263
+ ProcessRunnerError if using new log style and disabled
264
+ login shells.
265
+ click.ClickException if using login shells.
266
+ """
267
+ try:
268
+ # For now, if the output is needed we just skip the new logic.
269
+ # In the future we could update the new logic to support
270
+ # capturing output, but it is probably not needed.
271
+ if not with_output:
272
+ return run_cmd_redirected(
273
+ final_cmd,
274
+ process_runner=self.process_runner,
275
+ silent=silent,
276
+ use_login_shells=is_using_login_shells(),
277
+ )
278
+ else:
279
+ return self.process_runner.check_output(final_cmd)
280
+ except subprocess.CalledProcessError as e:
281
+ joined_cmd = " ".join(final_cmd)
282
+ if not is_using_login_shells():
283
+ raise ProcessRunnerError(
284
+ "Command failed",
285
+ "ssh_command_failed",
286
+ code=e.returncode,
287
+ command=joined_cmd,
288
+ )
289
+
290
+ if exit_on_fail:
291
+ raise click.ClickException(
292
+ "Command failed:\n\n {}\n".format(joined_cmd)
293
+ ) from None
294
+ else:
295
+ fail_msg = "SSH command failed."
296
+ if is_output_redirected():
297
+ fail_msg += " See above for the output from the failure."
298
+ raise click.ClickException(fail_msg) from None
299
+ finally:
300
+ # Do our best to flush output to terminal.
301
+ # See https://github.com/ray-project/ray/pull/19473.
302
+ sys.stdout.flush()
303
+ sys.stderr.flush()
304
+
305
+ def run(
306
+ self,
307
+ cmd,
308
+ timeout=120,
309
+ exit_on_fail=False,
310
+ port_forward=None,
311
+ with_output=False,
312
+ environment_variables: Dict[str, object] = None,
313
+ run_env="auto", # Unused argument.
314
+ ssh_options_override_ssh_key="",
315
+ shutdown_after_run=False,
316
+ silent=False,
317
+ ):
318
+ if shutdown_after_run:
319
+ cmd += "; sudo shutdown -h now"
320
+
321
+ if ssh_options_override_ssh_key:
322
+ if self.ssh_proxy_command:
323
+ ssh_options = SSHOptions(
324
+ ssh_options_override_ssh_key, ProxyCommand=self.ssh_proxy_command
325
+ )
326
+ else:
327
+ ssh_options = SSHOptions(ssh_options_override_ssh_key)
328
+ else:
329
+ ssh_options = self.ssh_options
330
+
331
+ assert isinstance(
332
+ ssh_options, SSHOptions
333
+ ), "ssh_options must be of type SSHOptions, got {}".format(type(ssh_options))
334
+
335
+ self._set_ssh_ip_if_required()
336
+
337
+ if is_using_login_shells():
338
+ ssh = ["ssh", "-tt"]
339
+ else:
340
+ ssh = ["ssh"]
341
+
342
+ if port_forward:
343
+ with cli_logger.group("Forwarding ports"):
344
+ if not isinstance(port_forward, list):
345
+ port_forward = [port_forward]
346
+ for local, remote in port_forward:
347
+ cli_logger.verbose(
348
+ "Forwarding port {} to port {} on localhost.",
349
+ cf.bold(local),
350
+ cf.bold(remote),
351
+ ) # todo: msg
352
+ ssh += ["-L", "{}:localhost:{}".format(local, remote)]
353
+
354
+ final_cmd = (
355
+ ssh
356
+ + ssh_options.to_ssh_options_list(timeout=timeout)
357
+ + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
358
+ )
359
+ if cmd:
360
+ if environment_variables:
361
+ cmd = _with_environment_variables(cmd, environment_variables)
362
+ if is_using_login_shells():
363
+ final_cmd += _with_interactive(cmd)
364
+ else:
365
+ final_cmd += [cmd]
366
+ else:
367
+ # We do this because `-o ControlMaster` causes the `-N` flag to
368
+ # still create an interactive shell in some ssh versions.
369
+ final_cmd.append("while true; do sleep 86400; done")
370
+
371
+ cli_logger.verbose("Running `{}`", cf.bold(cmd))
372
+ with cli_logger.indented():
373
+ cli_logger.very_verbose(
374
+ "Full command is `{}`", cf.bold(" ".join(final_cmd))
375
+ )
376
+
377
+ if cli_logger.verbosity > 0:
378
+ with cli_logger.indented():
379
+ return self._run_helper(
380
+ final_cmd, with_output, exit_on_fail, silent=silent
381
+ )
382
+ else:
383
+ return self._run_helper(final_cmd, with_output, exit_on_fail, silent=silent)
384
+
385
+ def _create_rsync_filter_args(self, options):
386
+ rsync_excludes = options.get("rsync_exclude") or []
387
+ rsync_filters = options.get("rsync_filter") or []
388
+
389
+ exclude_args = [
390
+ ["--exclude", rsync_exclude] for rsync_exclude in rsync_excludes
391
+ ]
392
+ filter_args = [
393
+ ["--filter", "dir-merge,- {}".format(rsync_filter)]
394
+ for rsync_filter in rsync_filters
395
+ ]
396
+
397
+ # Combine and flatten the two lists
398
+ return [arg for args_list in exclude_args + filter_args for arg in args_list]
399
+
400
+ def run_rsync_up(self, source, target, options=None):
401
+ self._set_ssh_ip_if_required()
402
+ options = options or {}
403
+
404
+ command = ["rsync"]
405
+ command += [
406
+ "--rsh",
407
+ subprocess.list2cmdline(
408
+ ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
409
+ ),
410
+ ]
411
+ command += ["-avz"]
412
+ command += self._create_rsync_filter_args(options=options)
413
+ command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)]
414
+ cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
415
+ self._run_helper(command, silent=is_rsync_silent())
416
+
417
+ def run_rsync_down(self, source, target, options=None):
418
+ self._set_ssh_ip_if_required()
419
+
420
+ command = ["rsync"]
421
+ command += [
422
+ "--rsh",
423
+ subprocess.list2cmdline(
424
+ ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
425
+ ),
426
+ ]
427
+ command += ["-avz"]
428
+ command += self._create_rsync_filter_args(options=options)
429
+ command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target]
430
+ cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
431
+ self._run_helper(command, silent=is_rsync_silent())
432
+
433
+ def remote_shell_command_str(self):
434
+ if self.ssh_private_key:
435
+ return "ssh -o IdentitiesOnly=yes -i {} {}@{}\n".format(
436
+ self.ssh_private_key, self.ssh_user, self.ssh_ip
437
+ )
438
+ else:
439
+ return "ssh -o IdentitiesOnly=yes {}@{}\n".format(
440
+ self.ssh_user, self.ssh_ip
441
+ )
442
+
443
+
444
+ class DockerCommandRunner(CommandRunnerInterface):
445
+ def __init__(self, docker_config, **common_args):
446
+ self.ssh_command_runner = SSHCommandRunner(**common_args)
447
+ self.container_name = docker_config["container_name"]
448
+ self.docker_config = docker_config
449
+ self.home_dir = None
450
+ self.initialized = False
451
+ # Optionally use 'podman' instead of 'docker'
452
+ use_podman = docker_config.get("use_podman", False)
453
+ self.docker_cmd = "podman" if use_podman else "docker"
454
+
455
+ def run(
456
+ self,
457
+ cmd,
458
+ timeout=120,
459
+ exit_on_fail=False,
460
+ port_forward=None,
461
+ with_output=False,
462
+ environment_variables: Dict[str, object] = None,
463
+ run_env="auto",
464
+ ssh_options_override_ssh_key="",
465
+ shutdown_after_run=False,
466
+ ):
467
+ if run_env == "auto":
468
+ run_env = (
469
+ "host"
470
+ if (not bool(cmd) or cmd.find(self.docker_cmd) == 0)
471
+ else self.docker_cmd
472
+ )
473
+
474
+ if environment_variables:
475
+ cmd = _with_environment_variables(cmd, environment_variables)
476
+
477
+ if run_env == "docker":
478
+ cmd = self._docker_expand_user(cmd, any_char=True)
479
+ if is_using_login_shells():
480
+ cmd = " ".join(_with_interactive(cmd))
481
+ cmd = with_docker_exec(
482
+ [cmd],
483
+ container_name=self.container_name,
484
+ with_interactive=is_using_login_shells(),
485
+ docker_cmd=self.docker_cmd,
486
+ )[0]
487
+
488
+ if shutdown_after_run:
489
+ # sudo shutdown should run after `with_docker_exec` command above
490
+ cmd += "; sudo shutdown -h now"
491
+ # Do not pass shutdown_after_run argument to ssh_command_runner.run()
492
+ # since it is handled above.
493
+ return self.ssh_command_runner.run(
494
+ cmd,
495
+ timeout=timeout,
496
+ exit_on_fail=exit_on_fail,
497
+ port_forward=port_forward,
498
+ with_output=with_output,
499
+ ssh_options_override_ssh_key=ssh_options_override_ssh_key,
500
+ )
501
+
502
+ def run_rsync_up(self, source, target, options=None):
503
+ options = options or {}
504
+ host_destination = os.path.join(
505
+ self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
506
+ target.lstrip("/"),
507
+ )
508
+
509
+ host_mount_location = os.path.dirname(host_destination.rstrip("/"))
510
+ self.ssh_command_runner.run(
511
+ f"mkdir -p {host_mount_location} && chown -R "
512
+ f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
513
+ silent=is_rsync_silent(),
514
+ )
515
+
516
+ self.ssh_command_runner.run_rsync_up(source, host_destination, options=options)
517
+ if self._check_container_status() and not options.get(
518
+ "docker_mount_if_possible", False
519
+ ):
520
+ if os.path.isdir(source):
521
+ # Adding a "." means that docker copies the *contents*
522
+ # Without it, docker copies the source *into* the target
523
+ host_destination += "/."
524
+
525
+ # This path may not exist inside the container. This ensures
526
+ # that the path is created!
527
+ prefix = with_docker_exec(
528
+ [
529
+ "mkdir -p {}".format(
530
+ os.path.dirname(self._docker_expand_user(target))
531
+ )
532
+ ],
533
+ container_name=self.container_name,
534
+ with_interactive=is_using_login_shells(),
535
+ docker_cmd=self.docker_cmd,
536
+ )[0]
537
+
538
+ self.ssh_command_runner.run(
539
+ "{} && rsync -e '{} exec -i' -avz {} {}:{}".format(
540
+ prefix,
541
+ self.docker_cmd,
542
+ host_destination,
543
+ self.container_name,
544
+ self._docker_expand_user(target),
545
+ ),
546
+ silent=is_rsync_silent(),
547
+ )
548
+
549
+ def run_rsync_down(self, source, target, options=None):
550
+ options = options or {}
551
+ host_source = os.path.join(
552
+ self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
553
+ source.lstrip("/"),
554
+ )
555
+ host_mount_location = os.path.dirname(host_source.rstrip("/"))
556
+ self.ssh_command_runner.run(
557
+ f"mkdir -p {host_mount_location} && chown -R "
558
+ f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
559
+ silent=is_rsync_silent(),
560
+ )
561
+ if source[-1] == "/":
562
+ source += "."
563
+ # Adding a "." means that docker copies the *contents*
564
+ # Without it, docker copies the source *into* the target
565
+ if not options.get("docker_mount_if_possible", False):
566
+ # NOTE: `--delete` is okay here because the container is the source
567
+ # of truth.
568
+ self.ssh_command_runner.run(
569
+ "rsync -e '{} exec -i' -avz --delete {}:{} {}".format(
570
+ self.docker_cmd,
571
+ self.container_name,
572
+ self._docker_expand_user(source),
573
+ host_source,
574
+ ),
575
+ silent=is_rsync_silent(),
576
+ )
577
+ self.ssh_command_runner.run_rsync_down(host_source, target, options=options)
578
+
579
+ def remote_shell_command_str(self):
580
+ inner_str = (
581
+ self.ssh_command_runner.remote_shell_command_str()
582
+ .replace("ssh", "ssh -tt", 1)
583
+ .strip("\n")
584
+ )
585
+ return inner_str + " {} exec -it {} /bin/bash\n".format(
586
+ self.docker_cmd, self.container_name
587
+ )
588
+
589
+ def _check_docker_installed(self):
590
+ no_exist = "NoExist"
591
+ output = self.ssh_command_runner.run(
592
+ f"command -v {self.docker_cmd} || echo '{no_exist}'", with_output=True
593
+ )
594
+ cleaned_output = output.decode().strip()
595
+ if no_exist in cleaned_output or "docker" not in cleaned_output:
596
+ if self.docker_cmd == "docker":
597
+ install_commands = [
598
+ "curl -fsSL https://get.docker.com -o get-docker.sh",
599
+ "sudo sh get-docker.sh",
600
+ "sudo usermod -aG docker $USER",
601
+ "sudo systemctl restart docker -f",
602
+ ]
603
+ else:
604
+ install_commands = [
605
+ "sudo apt-get update",
606
+ "sudo apt-get -y install podman",
607
+ ]
608
+
609
+ logger.error(
610
+ f"{self.docker_cmd.capitalize()} not installed. You can "
611
+ f"install {self.docker_cmd.capitalize()} by adding the "
612
+ "following commands to 'initialization_commands':\n"
613
+ + "\n".join(install_commands)
614
+ )
615
+
616
+ def _check_container_status(self):
617
+ if self.initialized:
618
+ return True
619
+ output = (
620
+ self.ssh_command_runner.run(
621
+ check_docker_running_cmd(self.container_name, self.docker_cmd),
622
+ with_output=True,
623
+ )
624
+ .decode("utf-8")
625
+ .strip()
626
+ )
627
+ # Checks for the false positive where "true" is in the container name
628
+ return "true" in output.lower() and "no such object" not in output.lower()
629
+
630
+ def _docker_expand_user(self, string, any_char=False):
631
+ user_pos = string.find("~")
632
+ if user_pos > -1:
633
+ if self.home_dir is None:
634
+ self.home_dir = (
635
+ self.ssh_command_runner.run(
636
+ f"{self.docker_cmd} exec {self.container_name} "
637
+ "printenv HOME",
638
+ with_output=True,
639
+ )
640
+ .decode("utf-8")
641
+ .strip()
642
+ )
643
+
644
+ if any_char:
645
+ return string.replace("~/", self.home_dir + "/")
646
+
647
+ elif not any_char and user_pos == 0:
648
+ return string.replace("~", self.home_dir, 1)
649
+
650
+ return string
651
+
652
+ def _check_if_container_restart_is_needed(
653
+ self, image: str, cleaned_bind_mounts: Dict[str, str]
654
+ ) -> bool:
655
+ re_init_required = False
656
+ running_image = (
657
+ self.run(
658
+ check_docker_image(self.container_name, self.docker_cmd),
659
+ with_output=True,
660
+ run_env="host",
661
+ )
662
+ .decode("utf-8")
663
+ .strip()
664
+ )
665
+ if running_image != image:
666
+ cli_logger.error(
667
+ "A container with name {} is running image {} instead "
668
+ + "of {} (which was provided in the YAML)",
669
+ self.container_name,
670
+ running_image,
671
+ image,
672
+ )
673
+ mounts = (
674
+ self.run(
675
+ check_bind_mounts_cmd(self.container_name, self.docker_cmd),
676
+ with_output=True,
677
+ run_env="host",
678
+ )
679
+ .decode("utf-8")
680
+ .strip()
681
+ )
682
+ try:
683
+ active_mounts = json.loads(mounts)
684
+ active_remote_mounts = {
685
+ mnt["Destination"].strip("/") for mnt in active_mounts
686
+ }
687
+ # Ignore ray bootstrap files.
688
+ requested_remote_mounts = {
689
+ self._docker_expand_user(remote).strip("/")
690
+ for remote in cleaned_bind_mounts.keys()
691
+ }
692
+ unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
693
+ if unfulfilled_mounts:
694
+ re_init_required = True
695
+ cli_logger.warning(
696
+ "This Docker Container is already running. "
697
+ "Restarting the Docker container on "
698
+ "this node to pick up the following file_mounts {}",
699
+ unfulfilled_mounts,
700
+ )
701
+ except json.JSONDecodeError:
702
+ cli_logger.verbose(
703
+ "Unable to check if file_mounts specified in the YAML "
704
+ "differ from those on the running container."
705
+ )
706
+ return re_init_required
707
+
708
+ def run_init(
709
+ self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
710
+ ):
711
+ BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]
712
+
713
+ specific_image = self.docker_config.get(
714
+ f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image")
715
+ )
716
+
717
+ self._check_docker_installed()
718
+ if self.docker_config.get("pull_before_run", True):
719
+ assert specific_image, (
720
+ "Image must be included in config if " + "pull_before_run is specified"
721
+ )
722
+ self.run(
723
+ "{} pull {}".format(self.docker_cmd, specific_image), run_env="host"
724
+ )
725
+ else:
726
+
727
+ self.run(
728
+ f"{self.docker_cmd} image inspect {specific_image} "
729
+ "1> /dev/null 2>&1 || "
730
+ f"{self.docker_cmd} pull {specific_image}"
731
+ )
732
+
733
+ # Bootstrap files cannot be bind mounted because docker opens the
734
+ # underlying inode. When the file is switched, docker becomes outdated.
735
+ cleaned_bind_mounts = file_mounts.copy()
736
+ for mnt in BOOTSTRAP_MOUNTS:
737
+ cleaned_bind_mounts.pop(mnt, None)
738
+
739
+ docker_run_executed = False
740
+
741
+ container_running = self._check_container_status()
742
+ requires_re_init = False
743
+ if container_running:
744
+ requires_re_init = self._check_if_container_restart_is_needed(
745
+ specific_image, cleaned_bind_mounts
746
+ )
747
+ if requires_re_init:
748
+ self.run(
749
+ f"{self.docker_cmd} stop {self.container_name}", run_env="host"
750
+ )
751
+
752
+ if (not container_running) or requires_re_init:
753
+ if not sync_run_yet:
754
+ # Do not start the actual image as we need to run file_sync
755
+ # first to ensure that all folders are created with the
756
+ # correct ownership. Docker will create the folders with
757
+ # `root` as the owner.
758
+ return True
759
+ # Get home directory
760
+ image_env = (
761
+ self.ssh_command_runner.run(
762
+ f"{self.docker_cmd} "
763
+ + "inspect -f '{{json .Config.Env}}' "
764
+ + specific_image,
765
+ with_output=True,
766
+ )
767
+ .decode()
768
+ .strip()
769
+ )
770
+ home_directory = "/root"
771
+ try:
772
+ for env_var in json.loads(image_env):
773
+ if env_var.startswith("HOME="):
774
+ home_directory = env_var.split("HOME=")[1]
775
+ break
776
+ except json.JSONDecodeError as e:
777
+ cli_logger.error(
778
+ "Unable to deserialize `image_env` to Python object. "
779
+ f"The `image_env` is:\n{image_env}"
780
+ )
781
+ raise e
782
+
783
+ user_docker_run_options = self.docker_config.get(
784
+ "run_options", []
785
+ ) + self.docker_config.get(
786
+ f"{'head' if as_head else 'worker'}_run_options", []
787
+ )
788
+ start_command = docker_start_cmds(
789
+ self.ssh_command_runner.ssh_user,
790
+ specific_image,
791
+ cleaned_bind_mounts,
792
+ self.container_name,
793
+ self._configure_runtime(
794
+ self._auto_configure_shm(user_docker_run_options)
795
+ ),
796
+ self.ssh_command_runner.cluster_name,
797
+ home_directory,
798
+ self.docker_cmd,
799
+ )
800
+ self.run(start_command, run_env="host")
801
+ docker_run_executed = True
802
+
803
+ # Explicitly copy in ray bootstrap files.
804
+ for mount in BOOTSTRAP_MOUNTS:
805
+ if mount in file_mounts:
806
+ if not sync_run_yet:
807
+ # NOTE(ilr) This rsync is needed because when starting from
808
+ # a stopped instance, /tmp may be deleted and `run_init`
809
+ # is called before the first `file_sync` happens
810
+ self.run_rsync_up(file_mounts[mount], mount)
811
+ self.ssh_command_runner.run(
812
+ "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format(
813
+ cmd=self.docker_cmd,
814
+ src=os.path.join(
815
+ self._get_docker_host_mount_location(
816
+ self.ssh_command_runner.cluster_name
817
+ ),
818
+ mount,
819
+ ),
820
+ container=self.container_name,
821
+ dst=self._docker_expand_user(mount),
822
+ )
823
+ )
824
+ try:
825
+ # Check if the current user has read permission.
826
+ # If they do not, try to change ownership!
827
+ self.run(
828
+ f"cat {mount} >/dev/null 2>&1 || "
829
+ f"sudo chown $(id -u):$(id -g) {mount}"
830
+ )
831
+ except Exception:
832
+ lsl_string = (
833
+ self.run(f"ls -l {mount}", with_output=True)
834
+ .decode("utf-8")
835
+ .strip()
836
+ )
837
+ # The string is of format <Permission> <Links>
838
+ # <Owner> <Group> <Size> <Date> <Name>
839
+ permissions = lsl_string.split(" ")[0]
840
+ owner = lsl_string.split(" ")[2]
841
+ group = lsl_string.split(" ")[3]
842
+ current_user = (
843
+ self.run("whoami", with_output=True).decode("utf-8").strip()
844
+ )
845
+ cli_logger.warning(
846
+ f"File ({mount}) is owned by user:{owner} and group:"
847
+ f"{group} with permissions ({permissions}). The "
848
+ f"current user ({current_user}) does not have "
849
+ "permission to read these files, and Ray may not be "
850
+ "able to autoscale. This can be resolved by "
851
+ "installing `sudo` in your container, or adding a "
852
+ f"command like 'chown {current_user} {mount}' to "
853
+ "your `setup_commands`."
854
+ )
855
+ self.initialized = True
856
+ return docker_run_executed
857
+
858
+ def _configure_runtime(self, run_options: List[str]) -> List[str]:
859
+ if self.docker_config.get("disable_automatic_runtime_detection"):
860
+ return run_options
861
+
862
+ runtime_output = (
863
+ self.ssh_command_runner.run(
864
+ f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", with_output=True
865
+ )
866
+ .decode()
867
+ .strip()
868
+ )
869
+ if "nvidia-container-runtime" in runtime_output:
870
+ try:
871
+ self.ssh_command_runner.run("nvidia-smi", with_output=False)
872
+ return run_options + ["--runtime=nvidia"]
873
+ except Exception as e:
874
+ logger.warning(
875
+ "Nvidia Container Runtime is present, but no GPUs found."
876
+ )
877
+ logger.debug(f"nvidia-smi error: {e}")
878
+ return run_options
879
+
880
+ return run_options
881
+
882
+ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
883
+ if self.docker_config.get("disable_shm_size_detection"):
884
+ return run_options
885
+ for run_opt in run_options:
886
+ if "--shm-size" in run_opt:
887
+ logger.info(
888
+ "Bypassing automatic SHM-Detection because of "
889
+ f"`run_option`: {run_opt}"
890
+ )
891
+ return run_options
892
+ try:
893
+ shm_output = (
894
+ self.ssh_command_runner.run(
895
+ "cat /proc/meminfo || true", with_output=True
896
+ )
897
+ .decode()
898
+ .strip()
899
+ )
900
+ available_memory = int(
901
+ [ln for ln in shm_output.split("\n") if "MemAvailable" in ln][
902
+ 0
903
+ ].split()[1]
904
+ )
905
+ available_memory_bytes = available_memory * 1024
906
+ # Overestimate SHM size by 10%
907
+ shm_size = min(
908
+ (available_memory_bytes * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
909
+ DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,
910
+ )
911
+ return run_options + [f"--shm-size='{shm_size}b'"]
912
+ except Exception as e:
913
+ logger.warning(f"Received error while trying to auto-compute SHM size {e}")
914
+ return run_options
915
+
916
+ def _get_docker_host_mount_location(self, cluster_name: str) -> str:
917
+ """Return the docker host mount directory location."""
918
+ # Imported here due to circular dependency in imports.
919
+ from ray.autoscaler.sdk import get_docker_host_mount_location
920
+
921
+ return get_docker_host_mount_location(cluster_name)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py ADDED
@@ -0,0 +1,1631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import datetime
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ import os
7
+ import random
8
+ import shutil
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ from types import ModuleType
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import click
18
+ import yaml
19
+
20
+ import ray
21
+ from ray._private.usage import usage_lib
22
+ from ray.autoscaler._private import subprocess_output_util as cmd_output_util
23
+ from ray.autoscaler._private.autoscaler import AutoscalerSummary
24
+ from ray.autoscaler._private.cli_logger import cf, cli_logger
25
+ from ray.autoscaler._private.cluster_dump import (
26
+ Archive,
27
+ GetParameters,
28
+ Node,
29
+ _info_from_params,
30
+ create_archive_for_local_and_remote_nodes,
31
+ create_archive_for_remote_nodes,
32
+ get_all_local_data,
33
+ )
34
+ from ray.autoscaler._private.command_runner import (
35
+ set_rsync_silent,
36
+ set_using_login_shells,
37
+ )
38
+ from ray.autoscaler._private.constants import (
39
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
40
+ MAX_PARALLEL_SHUTDOWN_WORKERS,
41
+ )
42
+ from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system
43
+ from ray.autoscaler._private.log_timer import LogTimer
44
+ from ray.autoscaler._private.node_provider_availability_tracker import (
45
+ NodeAvailabilitySummary,
46
+ )
47
+ from ray.autoscaler._private.providers import (
48
+ _NODE_PROVIDERS,
49
+ _PROVIDER_PRETTY_NAMES,
50
+ _get_node_provider,
51
+ )
52
+ from ray.autoscaler._private.updater import NodeUpdaterThread
53
+ from ray.autoscaler._private.util import (
54
+ LoadMetricsSummary,
55
+ format_info_string,
56
+ hash_launch_conf,
57
+ hash_runtime_conf,
58
+ prepare_config,
59
+ validate_config,
60
+ )
61
+ from ray.autoscaler.node_provider import NodeProvider
62
+ from ray.autoscaler.tags import (
63
+ NODE_KIND_HEAD,
64
+ NODE_KIND_WORKER,
65
+ STATUS_UNINITIALIZED,
66
+ STATUS_UP_TO_DATE,
67
+ TAG_RAY_LAUNCH_CONFIG,
68
+ TAG_RAY_NODE_KIND,
69
+ TAG_RAY_NODE_NAME,
70
+ TAG_RAY_NODE_STATUS,
71
+ TAG_RAY_USER_NODE_TYPE,
72
+ )
73
+ from ray.experimental.internal_kv import _internal_kv_put, internal_kv_get_gcs_client
74
+ from ray.util.debug import log_once
75
+
76
+ try: # py3
77
+ from shlex import quote
78
+ except ImportError: # py2
79
+ from pipes import quote
80
+
81
+
82
+ logger = logging.getLogger(__name__)
83
+
84
+ RUN_ENV_TYPES = ["auto", "host", "docker"]
85
+
86
+ POLL_INTERVAL = 5
87
+
88
+ Port_forward = Union[Tuple[int, int], List[Tuple[int, int]]]
89
+
90
+
91
+ def try_logging_config(config: Dict[str, Any]) -> None:
92
+ if config["provider"]["type"] == "aws":
93
+ from ray.autoscaler._private.aws.config import log_to_cli
94
+
95
+ log_to_cli(config)
96
+
97
+
98
+ def try_get_log_state(provider_config: Dict[str, Any]) -> Optional[dict]:
99
+ if provider_config["type"] == "aws":
100
+ from ray.autoscaler._private.aws.config import get_log_state
101
+
102
+ return get_log_state()
103
+ return None
104
+
105
+
106
+ def try_reload_log_state(provider_config: Dict[str, Any], log_state: dict) -> None:
107
+ if not log_state:
108
+ return
109
+ if provider_config["type"] == "aws":
110
+ from ray.autoscaler._private.aws.config import reload_log_state
111
+
112
+ return reload_log_state(log_state)
113
+
114
+
115
+ def debug_status(
116
+ status, error, verbose: bool = False, address: Optional[str] = None
117
+ ) -> str:
118
+ """
119
+ Return a debug string for the autoscaler.
120
+
121
+ Args:
122
+ status: The autoscaler status string for v1
123
+ error: The autoscaler error string for v1
124
+ verbose: Whether to print verbose information.
125
+ address: The address of the cluster (gcs address).
126
+
127
+ Returns:
128
+ str: A debug string for the cluster's status.
129
+ """
130
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
131
+
132
+ if is_autoscaler_v2():
133
+ from ray.autoscaler.v2.sdk import get_cluster_status
134
+ from ray.autoscaler.v2.utils import ClusterStatusFormatter
135
+
136
+ cluster_status = get_cluster_status(address)
137
+ status = ClusterStatusFormatter.format(cluster_status, verbose=verbose)
138
+ elif status:
139
+ status = status.decode("utf-8")
140
+ status_dict = json.loads(status)
141
+ lm_summary_dict = status_dict.get("load_metrics_report")
142
+ autoscaler_summary_dict = status_dict.get("autoscaler_report")
143
+ timestamp = status_dict.get("time")
144
+ gcs_request_time = status_dict.get("gcs_request_time")
145
+ non_terminated_nodes_time = status_dict.get("non_terminated_nodes_time")
146
+ if lm_summary_dict and autoscaler_summary_dict and timestamp:
147
+ lm_summary = LoadMetricsSummary(**lm_summary_dict)
148
+ node_availability_summary_dict = autoscaler_summary_dict.pop(
149
+ "node_availability_summary", {}
150
+ )
151
+ node_availability_summary = NodeAvailabilitySummary.from_fields(
152
+ **node_availability_summary_dict
153
+ )
154
+ autoscaler_summary = AutoscalerSummary(
155
+ node_availability_summary=node_availability_summary,
156
+ **autoscaler_summary_dict,
157
+ )
158
+ report_time = datetime.datetime.fromtimestamp(timestamp)
159
+ status = format_info_string(
160
+ lm_summary,
161
+ autoscaler_summary,
162
+ time=report_time,
163
+ gcs_request_time=gcs_request_time,
164
+ non_terminated_nodes_time=non_terminated_nodes_time,
165
+ verbose=verbose,
166
+ )
167
+ else:
168
+ status = (
169
+ "No cluster status. It may take a few seconds "
170
+ "for the Ray internal services to start up."
171
+ )
172
+ else:
173
+ status = (
174
+ "No cluster status. It may take a few seconds "
175
+ "for the Ray internal services to start up."
176
+ )
177
+
178
+ if error:
179
+ status += "\n"
180
+ status += error.decode("utf-8")
181
+
182
+ return status
183
+
184
+
185
+ def request_resources(
186
+ num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
187
+ ) -> None:
188
+ """Remotely request some CPU or GPU resources from the autoscaler.
189
+
190
+ This function is to be called e.g. on a node before submitting a bunch of
191
+ ray.remote calls to ensure that resources rapidly become available.
192
+
193
+ Args:
194
+ num_cpus: Scale the cluster to ensure this number of CPUs are
195
+ available. This request is persistent until another call to
196
+ request_resources() is made.
197
+ bundles (List[ResourceDict]): Scale the cluster to ensure this set of
198
+ resource shapes can fit. This request is persistent until another
199
+ call to request_resources() is made.
200
+ """
201
+ if not ray.is_initialized():
202
+ raise RuntimeError("Ray is not initialized yet")
203
+ to_request = []
204
+ if num_cpus:
205
+ to_request += [{"CPU": 1}] * num_cpus
206
+ if bundles:
207
+ to_request += bundles
208
+ _internal_kv_put(
209
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL, json.dumps(to_request), overwrite=True
210
+ )
211
+
212
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
213
+
214
+ if is_autoscaler_v2():
215
+ from ray.autoscaler.v2.sdk import request_cluster_resources
216
+
217
+ gcs_address = internal_kv_get_gcs_client().address
218
+ request_cluster_resources(gcs_address, to_request)
219
+
220
+
221
+ def create_or_update_cluster(
222
+ config_file: str,
223
+ override_min_workers: Optional[int],
224
+ override_max_workers: Optional[int],
225
+ no_restart: bool,
226
+ restart_only: bool,
227
+ yes: bool,
228
+ override_cluster_name: Optional[str] = None,
229
+ no_config_cache: bool = False,
230
+ redirect_command_output: Optional[bool] = False,
231
+ use_login_shells: bool = True,
232
+ no_monitor_on_head: bool = False,
233
+ ) -> Dict[str, Any]:
234
+ """Creates or updates an autoscaling Ray cluster from a config json."""
235
+ # no_monitor_on_head is an internal flag used by the Ray K8s operator.
236
+ # If True, prevents autoscaling config sync to the Ray head during cluster
237
+ # creation. See https://github.com/ray-project/ray/pull/13720.
238
+ set_using_login_shells(use_login_shells)
239
+ if not use_login_shells:
240
+ cmd_output_util.set_allow_interactive(False)
241
+ if redirect_command_output is None:
242
+ # Do not redirect by default.
243
+ cmd_output_util.set_output_redirected(False)
244
+ else:
245
+ cmd_output_util.set_output_redirected(redirect_command_output)
246
+
247
+ def handle_yaml_error(e):
248
+ cli_logger.error("Cluster config invalid")
249
+ cli_logger.newline()
250
+ cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file)
251
+ cli_logger.newline()
252
+ with cli_logger.verbatim_error_ctx("PyYAML error:"):
253
+ cli_logger.error(e)
254
+ cli_logger.abort()
255
+
256
+ try:
257
+ config = yaml.safe_load(open(config_file).read())
258
+ except FileNotFoundError:
259
+ cli_logger.abort(
260
+ "Provided cluster configuration file ({}) does not exist",
261
+ cf.bold(config_file),
262
+ )
263
+ except yaml.parser.ParserError as e:
264
+ handle_yaml_error(e)
265
+ raise
266
+ except yaml.scanner.ScannerError as e:
267
+ handle_yaml_error(e)
268
+ raise
269
+ global_event_system.execute_callback(
270
+ CreateClusterEvent.up_started, {"cluster_config": config}
271
+ )
272
+
273
+ # todo: validate file_mounts, ssh keys, etc.
274
+
275
+ importer = _NODE_PROVIDERS.get(config["provider"]["type"])
276
+ if not importer:
277
+ cli_logger.abort(
278
+ "Unknown provider type " + cf.bold("{}") + "\n"
279
+ "Available providers are: {}",
280
+ config["provider"]["type"],
281
+ cli_logger.render_list(
282
+ [k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None]
283
+ ),
284
+ )
285
+
286
+ printed_overrides = False
287
+
288
+ def handle_cli_override(key, override):
289
+ if override is not None:
290
+ if key in config:
291
+ nonlocal printed_overrides
292
+ printed_overrides = True
293
+ cli_logger.warning(
294
+ "`{}` override provided on the command line.\n"
295
+ " Using "
296
+ + cf.bold("{}")
297
+ + cf.dimmed(" [configuration file has " + cf.bold("{}") + "]"),
298
+ key,
299
+ override,
300
+ config[key],
301
+ )
302
+ config[key] = override
303
+
304
+ handle_cli_override("min_workers", override_min_workers)
305
+ handle_cli_override("max_workers", override_max_workers)
306
+ handle_cli_override("cluster_name", override_cluster_name)
307
+
308
+ if printed_overrides:
309
+ cli_logger.newline()
310
+
311
+ cli_logger.labeled_value("Cluster", config["cluster_name"])
312
+
313
+ cli_logger.newline()
314
+ config = _bootstrap_config(config, no_config_cache=no_config_cache)
315
+
316
+ try_logging_config(config)
317
+ get_or_create_head_node(
318
+ config,
319
+ config_file,
320
+ no_restart,
321
+ restart_only,
322
+ yes,
323
+ override_cluster_name,
324
+ no_monitor_on_head,
325
+ )
326
+ return config
327
+
328
+
329
+ CONFIG_CACHE_VERSION = 1
330
+
331
+
332
+ def _bootstrap_config(
333
+ config: Dict[str, Any], no_config_cache: bool = False
334
+ ) -> Dict[str, Any]:
335
+ config = prepare_config(config)
336
+ # NOTE: multi-node-type autoscaler is guaranteed to be in use after this.
337
+
338
+ hasher = hashlib.sha1()
339
+ hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
340
+ cache_key = os.path.join(
341
+ tempfile.gettempdir(), "ray-config-{}".format(hasher.hexdigest())
342
+ )
343
+
344
+ if os.path.exists(cache_key) and not no_config_cache:
345
+ config_cache = json.loads(open(cache_key).read())
346
+ if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
347
+ # todo: is it fine to re-resolve? afaik it should be.
348
+ # we can have migrations otherwise or something
349
+ # but this seems overcomplicated given that resolving is
350
+ # relatively cheap
351
+ try_reload_log_state(
352
+ config_cache["config"]["provider"],
353
+ config_cache.get("provider_log_info"),
354
+ )
355
+
356
+ if log_once("_printed_cached_config_warning"):
357
+ cli_logger.verbose_warning(
358
+ "Loaded cached provider configuration from " + cf.bold("{}"),
359
+ cache_key,
360
+ )
361
+ if cli_logger.verbosity == 0:
362
+ cli_logger.warning("Loaded cached provider configuration")
363
+ cli_logger.warning(
364
+ "If you experience issues with "
365
+ "the cloud provider, try re-running "
366
+ "the command with {}.",
367
+ cf.bold("--no-config-cache"),
368
+ )
369
+
370
+ return config_cache["config"]
371
+ else:
372
+ cli_logger.warning(
373
+ "Found cached cluster config "
374
+ "but the version " + cf.bold("{}") + " "
375
+ "(expected " + cf.bold("{}") + ") does not match.\n"
376
+ "This is normal if cluster launcher was updated.\n"
377
+ "Config will be re-resolved.",
378
+ config_cache.get("_version", "none"),
379
+ CONFIG_CACHE_VERSION,
380
+ )
381
+
382
+ importer = _NODE_PROVIDERS.get(config["provider"]["type"])
383
+ if not importer:
384
+ raise NotImplementedError("Unsupported provider {}".format(config["provider"]))
385
+
386
+ provider_cls = importer(config["provider"])
387
+
388
+ cli_logger.print(
389
+ "Checking {} environment settings",
390
+ _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]),
391
+ )
392
+ try:
393
+ config = provider_cls.fillout_available_node_types_resources(config)
394
+ except Exception as exc:
395
+ if cli_logger.verbosity > 2:
396
+ logger.exception("Failed to autodetect node resources.")
397
+ else:
398
+ cli_logger.warning(
399
+ f"Failed to autodetect node resources: {str(exc)}. "
400
+ "You can see full stack trace with higher verbosity."
401
+ )
402
+
403
+ try:
404
+ # NOTE: if `resources` field is missing, validate_config for providers
405
+ # other than AWS and Kubernetes will fail (the schema error will ask
406
+ # the user to manually fill the resources) as we currently support
407
+ # autofilling resources for AWS and Kubernetes only.
408
+ validate_config(config)
409
+ except (ModuleNotFoundError, ImportError):
410
+ cli_logger.abort(
411
+ "Not all Ray autoscaler dependencies were found. "
412
+ "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
413
+ 'only be usable via `pip install "ray[default]"`. Please '
414
+ "update your install command."
415
+ )
416
+ resolved_config = provider_cls.bootstrap_config(config)
417
+
418
+ if not no_config_cache:
419
+ with open(cache_key, "w") as f:
420
+ config_cache = {
421
+ "_version": CONFIG_CACHE_VERSION,
422
+ "provider_log_info": try_get_log_state(resolved_config["provider"]),
423
+ "config": resolved_config,
424
+ }
425
+ f.write(json.dumps(config_cache))
426
+ return resolved_config
427
+
428
+
429
+ def teardown_cluster(
430
+ config_file: str,
431
+ yes: bool,
432
+ workers_only: bool,
433
+ override_cluster_name: Optional[str],
434
+ keep_min_workers: bool,
435
+ ) -> None:
436
+ """Destroys all nodes of a Ray cluster described by a config json."""
437
+ config = yaml.safe_load(open(config_file).read())
438
+ if override_cluster_name is not None:
439
+ config["cluster_name"] = override_cluster_name
440
+
441
+ config = _bootstrap_config(config)
442
+
443
+ cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
444
+
445
+ if not workers_only:
446
+ try:
447
+ exec_cluster(
448
+ config_file,
449
+ cmd="ray stop",
450
+ run_env="auto",
451
+ screen=False,
452
+ tmux=False,
453
+ stop=False,
454
+ start=False,
455
+ override_cluster_name=override_cluster_name,
456
+ port_forward=None,
457
+ with_output=False,
458
+ )
459
+ except Exception as e:
460
+ # todo: add better exception info
461
+ cli_logger.verbose_error("{}", str(e))
462
+ cli_logger.warning(
463
+ "Exception occurred when stopping the cluster Ray runtime "
464
+ "(use -v to dump teardown exceptions)."
465
+ )
466
+ cli_logger.warning(
467
+ "Ignoring the exception and "
468
+ "attempting to shut down the cluster nodes anyway."
469
+ )
470
+
471
+ provider = _get_node_provider(config["provider"], config["cluster_name"])
472
+
473
+ def remaining_nodes():
474
+ workers = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
475
+
476
+ if keep_min_workers:
477
+ min_workers = config.get("min_workers", 0)
478
+ cli_logger.print(
479
+ "{} random worker nodes will not be shut down. "
480
+ + cf.dimmed("(due to {})"),
481
+ cf.bold(min_workers),
482
+ cf.bold("--keep-min-workers"),
483
+ )
484
+
485
+ workers = random.sample(workers, len(workers) - min_workers)
486
+
487
+ # todo: it's weird to kill the head node but not all workers
488
+ if workers_only:
489
+ cli_logger.print(
490
+ "The head node will not be shut down. " + cf.dimmed("(due to {})"),
491
+ cf.bold("--workers-only"),
492
+ )
493
+
494
+ return workers
495
+
496
+ head = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})
497
+
498
+ return head + workers
499
+
500
+ def run_docker_stop(node, container_name):
501
+ try:
502
+ updater = NodeUpdaterThread(
503
+ node_id=node,
504
+ provider_config=config["provider"],
505
+ provider=provider,
506
+ auth_config=config["auth"],
507
+ cluster_name=config["cluster_name"],
508
+ file_mounts=config["file_mounts"],
509
+ initialization_commands=[],
510
+ setup_commands=[],
511
+ ray_start_commands=[],
512
+ runtime_hash="",
513
+ file_mounts_contents_hash="",
514
+ is_head_node=False,
515
+ docker_config=config.get("docker"),
516
+ )
517
+
518
+ _exec(
519
+ updater,
520
+ f"docker stop {container_name}",
521
+ with_output=False,
522
+ run_env="host",
523
+ )
524
+ except Exception:
525
+ cli_logger.warning(f"Docker stop failed on {node}")
526
+
527
+ # Loop here to check that both the head and worker nodes are actually
528
+ # really gone
529
+ A = remaining_nodes()
530
+
531
+ container_name = config.get("docker", {}).get("container_name")
532
+ if container_name:
533
+ # This is to ensure that the parallel SSH calls below do not mess with
534
+ # the users terminal.
535
+ output_redir = cmd_output_util.is_output_redirected()
536
+ cmd_output_util.set_output_redirected(True)
537
+ allow_interactive = cmd_output_util.does_allow_interactive()
538
+ cmd_output_util.set_allow_interactive(False)
539
+
540
+ with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
541
+ for node in A:
542
+ executor.submit(
543
+ run_docker_stop, node=node, container_name=container_name
544
+ )
545
+ cmd_output_util.set_output_redirected(output_redir)
546
+ cmd_output_util.set_allow_interactive(allow_interactive)
547
+ with LogTimer("teardown_cluster: done."):
548
+ while A:
549
+ provider.terminate_nodes(A)
550
+
551
+ cli_logger.print(
552
+ "Requested {} nodes to shut down.",
553
+ cf.bold(len(A)),
554
+ _tags=dict(interval="1s"),
555
+ )
556
+
557
+ time.sleep(POLL_INTERVAL) # todo: interval should be a variable
558
+ A = remaining_nodes()
559
+ cli_logger.print(
560
+ "{} nodes remaining after {} second(s).", cf.bold(len(A)), POLL_INTERVAL
561
+ )
562
+ cli_logger.success("No nodes remaining.")
563
+
564
+
565
+ def kill_node(
566
+ config_file: str, yes: bool, hard: bool, override_cluster_name: Optional[str]
567
+ ) -> Optional[str]:
568
+ """Kills a random Raylet worker."""
569
+
570
+ config = yaml.safe_load(open(config_file).read())
571
+ if override_cluster_name is not None:
572
+ config["cluster_name"] = override_cluster_name
573
+ config = _bootstrap_config(config)
574
+
575
+ cli_logger.confirm(yes, "A random node will be killed.")
576
+
577
+ provider = _get_node_provider(config["provider"], config["cluster_name"])
578
+ nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
579
+ if not nodes:
580
+ cli_logger.print("No worker nodes detected.")
581
+ return None
582
+ node = random.choice(nodes)
583
+ cli_logger.print("Shutdown " + cf.bold("{}"), node)
584
+ if hard:
585
+ provider.terminate_node(node)
586
+ else:
587
+ updater = NodeUpdaterThread(
588
+ node_id=node,
589
+ provider_config=config["provider"],
590
+ provider=provider,
591
+ auth_config=config["auth"],
592
+ cluster_name=config["cluster_name"],
593
+ file_mounts=config["file_mounts"],
594
+ initialization_commands=[],
595
+ setup_commands=[],
596
+ ray_start_commands=[],
597
+ runtime_hash="",
598
+ file_mounts_contents_hash="",
599
+ is_head_node=False,
600
+ docker_config=config.get("docker"),
601
+ )
602
+
603
+ _exec(updater, "ray stop", False, False)
604
+
605
+ time.sleep(POLL_INTERVAL)
606
+
607
+ if config.get("provider", {}).get("use_internal_ips", False):
608
+ node_ip = provider.internal_ip(node)
609
+ else:
610
+ node_ip = provider.external_ip(node)
611
+
612
+ return node_ip
613
+
614
+
615
+ def monitor_cluster(
616
+ cluster_config_file: str, num_lines: int, override_cluster_name: Optional[str]
617
+ ) -> None:
618
+ """Tails the autoscaler logs of a Ray cluster."""
619
+ cmd = f"tail -n {num_lines} -f /tmp/ray/session_latest/logs/monitor*"
620
+ exec_cluster(
621
+ cluster_config_file,
622
+ cmd=cmd,
623
+ run_env="auto",
624
+ screen=False,
625
+ tmux=False,
626
+ stop=False,
627
+ start=False,
628
+ override_cluster_name=override_cluster_name,
629
+ port_forward=None,
630
+ )
631
+
632
+
633
+ def warn_about_bad_start_command(
634
+ start_commands: List[str], no_monitor_on_head: bool = False
635
+ ) -> None:
636
+ ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
637
+ if len(ray_start_cmd) == 0:
638
+ cli_logger.warning(
639
+ "Ray runtime will not be started because `{}` is not in `{}`.",
640
+ cf.bold("ray start"),
641
+ cf.bold("head_start_ray_commands"),
642
+ )
643
+
644
+ autoscaling_config_in_ray_start_cmd = any(
645
+ "autoscaling-config" in x for x in ray_start_cmd
646
+ )
647
+ if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
648
+ cli_logger.warning(
649
+ "The head node will not launch any workers because "
650
+ "`{}` does not have `{}` set.\n"
651
+ "Potential fix: add `{}` to the `{}` command under `{}`.",
652
+ cf.bold("ray start"),
653
+ cf.bold("--autoscaling-config"),
654
+ cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
655
+ cf.bold("ray start"),
656
+ cf.bold("head_start_ray_commands"),
657
+ )
658
+
659
+
660
+ def get_or_create_head_node(
661
+ config: Dict[str, Any],
662
+ printable_config_file: str,
663
+ no_restart: bool,
664
+ restart_only: bool,
665
+ yes: bool,
666
+ override_cluster_name: Optional[str],
667
+ no_monitor_on_head: bool = False,
668
+ _provider: Optional[NodeProvider] = None,
669
+ _runner: ModuleType = subprocess,
670
+ ) -> None:
671
+ """Create the cluster head node, which in turn creates the workers."""
672
+ global_event_system.execute_callback(CreateClusterEvent.cluster_booting_started)
673
+ provider = _provider or _get_node_provider(
674
+ config["provider"], config["cluster_name"]
675
+ )
676
+
677
+ config = copy.deepcopy(config)
678
+ head_node_tags = {
679
+ TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
680
+ }
681
+ nodes = provider.non_terminated_nodes(head_node_tags)
682
+ if len(nodes) > 0:
683
+ head_node = nodes[0]
684
+ else:
685
+ head_node = None
686
+
687
+ if not head_node:
688
+ cli_logger.confirm(
689
+ yes, "No head node found. Launching a new cluster.", _abort=True
690
+ )
691
+ cli_logger.newline()
692
+ usage_lib.show_usage_stats_prompt(cli=True)
693
+
694
+ if head_node:
695
+ if restart_only:
696
+ cli_logger.confirm(
697
+ yes,
698
+ "Updating cluster configuration and "
699
+ "restarting the cluster Ray runtime. "
700
+ "Setup commands will not be run due to `{}`.\n",
701
+ cf.bold("--restart-only"),
702
+ _abort=True,
703
+ )
704
+ cli_logger.newline()
705
+ usage_lib.show_usage_stats_prompt(cli=True)
706
+ elif no_restart:
707
+ cli_logger.print(
708
+ "Cluster Ray runtime will not be restarted due to `{}`.",
709
+ cf.bold("--no-restart"),
710
+ )
711
+ cli_logger.confirm(
712
+ yes,
713
+ "Updating cluster configuration and running setup commands.",
714
+ _abort=True,
715
+ )
716
+ else:
717
+ cli_logger.print("Updating cluster configuration and running full setup.")
718
+ cli_logger.confirm(
719
+ yes, cf.bold("Cluster Ray runtime will be restarted."), _abort=True
720
+ )
721
+ cli_logger.newline()
722
+ usage_lib.show_usage_stats_prompt(cli=True)
723
+
724
+ cli_logger.newline()
725
+ # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
726
+ head_node_config = copy.deepcopy(config.get("head_node", {}))
727
+ # The above `head_node` field is deprecated in favor of per-node-type
728
+ # node_configs. We allow it for backwards-compatibility.
729
+ head_node_resources = None
730
+ head_node_labels = None
731
+ head_node_type = config.get("head_node_type")
732
+ if head_node_type:
733
+ head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type
734
+ head_config = config["available_node_types"][head_node_type]
735
+ head_node_config.update(head_config["node_config"])
736
+
737
+ # Not necessary to keep in sync with node_launcher.py
738
+ # Keep in sync with autoscaler.py _node_resources
739
+ head_node_resources = head_config.get("resources")
740
+ head_node_labels = head_config.get("labels")
741
+
742
+ launch_hash = hash_launch_conf(head_node_config, config["auth"])
743
+ creating_new_head = _should_create_new_head(
744
+ head_node, launch_hash, head_node_type, provider
745
+ )
746
+ if creating_new_head:
747
+ with cli_logger.group("Acquiring an up-to-date head node"):
748
+ global_event_system.execute_callback(
749
+ CreateClusterEvent.acquiring_new_head_node
750
+ )
751
+ if head_node is not None:
752
+ cli_logger.confirm(yes, "Relaunching the head node.", _abort=True)
753
+
754
+ provider.terminate_node(head_node)
755
+ cli_logger.print("Terminated head node {}", head_node)
756
+
757
+ head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
758
+ head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
759
+ config["cluster_name"]
760
+ )
761
+ head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED
762
+ provider.create_node(head_node_config, head_node_tags, 1)
763
+ cli_logger.print("Launched a new head node")
764
+
765
+ start = time.time()
766
+ head_node = None
767
+ with cli_logger.group("Fetching the new head node"):
768
+ while True:
769
+ if time.time() - start > 50:
770
+ cli_logger.abort(
771
+ "Head node fetch timed out. Failed to create head node."
772
+ )
773
+ nodes = provider.non_terminated_nodes(head_node_tags)
774
+ if len(nodes) == 1:
775
+ head_node = nodes[0]
776
+ break
777
+ time.sleep(POLL_INTERVAL)
778
+ cli_logger.newline()
779
+
780
+ global_event_system.execute_callback(CreateClusterEvent.head_node_acquired)
781
+
782
+ with cli_logger.group(
783
+ "Setting up head node",
784
+ _numbered=("<>", 1, 1),
785
+ # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
786
+ _tags=dict(),
787
+ ): # add id, ARN to tags?
788
+ # TODO(ekl) right now we always update the head node even if the
789
+ # hash matches.
790
+ # We could prompt the user for what they want to do here.
791
+ # No need to pass in cluster_sync_files because we use this
792
+ # hash to set up the head node
793
+ (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
794
+ config["file_mounts"], None, config
795
+ )
796
+
797
+ if not no_monitor_on_head:
798
+ # Return remote_config_file to avoid prematurely closing it.
799
+ config, remote_config_file = _set_up_config_for_head_node(
800
+ config, provider, no_restart
801
+ )
802
+ cli_logger.print("Prepared bootstrap config")
803
+
804
+ if restart_only:
805
+ # Docker may re-launch nodes, requiring setup
806
+ # commands to be rerun.
807
+ if config.get("docker", {}).get("container_name"):
808
+ setup_commands = config["head_setup_commands"]
809
+ else:
810
+ setup_commands = []
811
+ ray_start_commands = config["head_start_ray_commands"]
812
+ # If user passed in --no-restart and we're not creating a new head,
813
+ # omit start commands.
814
+ elif no_restart and not creating_new_head:
815
+ setup_commands = config["head_setup_commands"]
816
+ ray_start_commands = []
817
+ else:
818
+ setup_commands = config["head_setup_commands"]
819
+ ray_start_commands = config["head_start_ray_commands"]
820
+
821
+ if not no_restart:
822
+ warn_about_bad_start_command(ray_start_commands, no_monitor_on_head)
823
+
824
+ updater = NodeUpdaterThread(
825
+ node_id=head_node,
826
+ provider_config=config["provider"],
827
+ provider=provider,
828
+ auth_config=config["auth"],
829
+ cluster_name=config["cluster_name"],
830
+ file_mounts=config["file_mounts"],
831
+ initialization_commands=config["initialization_commands"],
832
+ setup_commands=setup_commands,
833
+ ray_start_commands=ray_start_commands,
834
+ process_runner=_runner,
835
+ runtime_hash=runtime_hash,
836
+ file_mounts_contents_hash=file_mounts_contents_hash,
837
+ is_head_node=True,
838
+ node_resources=head_node_resources,
839
+ node_labels=head_node_labels,
840
+ rsync_options={
841
+ "rsync_exclude": config.get("rsync_exclude"),
842
+ "rsync_filter": config.get("rsync_filter"),
843
+ },
844
+ docker_config=config.get("docker"),
845
+ restart_only=restart_only,
846
+ )
847
+ updater.start()
848
+ updater.join()
849
+
850
+ # Refresh the node cache so we see the external ip if available
851
+ provider.non_terminated_nodes(head_node_tags)
852
+
853
+ if updater.exitcode != 0:
854
+ # todo: this does not follow the mockup and is not good enough
855
+ cli_logger.abort("Failed to setup head node.")
856
+ sys.exit(1)
857
+
858
+ global_event_system.execute_callback(
859
+ CreateClusterEvent.cluster_booting_completed,
860
+ {
861
+ "head_node_id": head_node,
862
+ },
863
+ )
864
+
865
+ monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
866
+ if override_cluster_name:
867
+ modifiers = " --cluster-name={}".format(quote(override_cluster_name))
868
+ else:
869
+ modifiers = ""
870
+
871
+ cli_logger.newline()
872
+ with cli_logger.group("Useful commands:"):
873
+ printable_config_file = os.path.abspath(printable_config_file)
874
+
875
+ cli_logger.print("To terminate the cluster:")
876
+ cli_logger.print(cf.bold(f" ray down {printable_config_file}{modifiers}"))
877
+ cli_logger.newline()
878
+
879
+ cli_logger.print("To retrieve the IP address of the cluster head:")
880
+ cli_logger.print(
881
+ cf.bold(f" ray get-head-ip {printable_config_file}{modifiers}")
882
+ )
883
+ cli_logger.newline()
884
+
885
+ cli_logger.print(
886
+ "To port-forward the cluster's Ray Dashboard to the local machine:"
887
+ )
888
+ cli_logger.print(cf.bold(f" ray dashboard {printable_config_file}{modifiers}"))
889
+ cli_logger.newline()
890
+
891
+ cli_logger.print(
892
+ "To submit a job to the cluster, port-forward the "
893
+ "Ray Dashboard in another terminal and run:"
894
+ )
895
+ cli_logger.print(
896
+ cf.bold(
897
+ " ray job submit --address http://localhost:<dashboard-port> "
898
+ "--working-dir . -- python my_script.py"
899
+ )
900
+ )
901
+ cli_logger.newline()
902
+
903
+ cli_logger.print("To connect to a terminal on the cluster head for debugging:")
904
+ cli_logger.print(cf.bold(f" ray attach {printable_config_file}{modifiers}"))
905
+ cli_logger.newline()
906
+
907
+ cli_logger.print("To monitor autoscaling:")
908
+ cli_logger.print(
909
+ cf.bold(
910
+ f" ray exec {printable_config_file}{modifiers} {quote(monitor_str)}"
911
+ )
912
+ )
913
+ cli_logger.newline()
914
+
915
+
916
+ def _should_create_new_head(
917
+ head_node_id: Optional[str],
918
+ new_launch_hash: str,
919
+ new_head_node_type: str,
920
+ provider: NodeProvider,
921
+ ) -> bool:
922
+ """Decides whether a new head node needs to be created.
923
+
924
+ We need a new head if at least one of the following holds:
925
+ (a) There isn't an existing head node
926
+ (b) The user-submitted head node_config differs from the existing head
927
+ node's node_config.
928
+ (c) The user-submitted head node_type key differs from the existing head
929
+ node's node_type.
930
+
931
+ Args:
932
+ head_node_id (Optional[str]): head node id if a head exists, else None
933
+ new_launch_hash: hash of current user-submitted head config
934
+ new_head_node_type: current user-submitted head node-type key
935
+
936
+ Returns:
937
+ bool: True if a new Ray head node should be launched, False otherwise
938
+ """
939
+ if not head_node_id:
940
+ # No head node exists, need to create it.
941
+ return True
942
+
943
+ # Pull existing head's data.
944
+ head_tags = provider.node_tags(head_node_id)
945
+ current_launch_hash = head_tags.get(TAG_RAY_LAUNCH_CONFIG)
946
+ current_head_type = head_tags.get(TAG_RAY_USER_NODE_TYPE)
947
+
948
+ # Compare to current head
949
+ hashes_mismatch = new_launch_hash != current_launch_hash
950
+ types_mismatch = new_head_node_type != current_head_type
951
+
952
+ new_head_required = hashes_mismatch or types_mismatch
953
+
954
+ # Warn user
955
+ if new_head_required:
956
+ with cli_logger.group(
957
+ "Currently running head node is out-of-date with cluster configuration"
958
+ ):
959
+ if hashes_mismatch:
960
+ cli_logger.print(
961
+ "Current hash is {}, expected {}",
962
+ cf.bold(current_launch_hash),
963
+ cf.bold(new_launch_hash),
964
+ )
965
+
966
+ if types_mismatch:
967
+ cli_logger.print(
968
+ "Current head node type is {}, expected {}",
969
+ cf.bold(current_head_type),
970
+ cf.bold(new_head_node_type),
971
+ )
972
+
973
+ return new_head_required
974
+
975
+
976
+ def _set_up_config_for_head_node(
977
+ config: Dict[str, Any], provider: NodeProvider, no_restart: bool
978
+ ) -> Tuple[Dict[str, Any], Any]:
979
+ """Prepares autoscaling config and, if needed, ssh key, to be mounted onto
980
+ the Ray head node for use by the autoscaler.
981
+
982
+ Returns the modified config and the temporary config file that will be
983
+ mounted onto the head node.
984
+ """
985
+ # Rewrite the auth config so that the head
986
+ # node can update the workers
987
+ remote_config = copy.deepcopy(config)
988
+
989
+ # drop proxy options if they exist, otherwise
990
+ # head node won't be able to connect to workers
991
+ remote_config["auth"].pop("ssh_proxy_command", None)
992
+
993
+ # Drop the head_node field if it was introduced. It is technically not a
994
+ # valid field in the config, but it may have been introduced after
995
+ # validation (see _bootstrap_config() call to
996
+ # provider_cls.bootstrap_config(config)). The head node will never try to
997
+ # launch a head node so it doesn't need these defaults.
998
+ remote_config.pop("head_node", None)
999
+
1000
+ if "ssh_private_key" in config["auth"]:
1001
+ remote_key_path = "~/ray_bootstrap_key.pem"
1002
+ remote_config["auth"]["ssh_private_key"] = remote_key_path
1003
+
1004
+ # Adjust for new file locations
1005
+ new_mounts = {}
1006
+ for remote_path in config["file_mounts"]:
1007
+ new_mounts[remote_path] = remote_path
1008
+ remote_config["file_mounts"] = new_mounts
1009
+ remote_config["no_restart"] = no_restart
1010
+
1011
+ remote_config = provider.prepare_for_head_node(remote_config)
1012
+
1013
+ # Now inject the rewritten config and SSH key into the head node
1014
+ remote_config_file = tempfile.NamedTemporaryFile("w", prefix="ray-bootstrap-")
1015
+ remote_config_file.write(json.dumps(remote_config))
1016
+ remote_config_file.flush()
1017
+ config["file_mounts"].update(
1018
+ {"~/ray_bootstrap_config.yaml": remote_config_file.name}
1019
+ )
1020
+
1021
+ if "ssh_private_key" in config["auth"]:
1022
+ config["file_mounts"].update(
1023
+ {
1024
+ remote_key_path: config["auth"]["ssh_private_key"],
1025
+ }
1026
+ )
1027
+
1028
+ return config, remote_config_file
1029
+
1030
+
1031
def attach_cluster(
    config_file: str,
    start: bool,
    use_screen: bool,
    use_tmux: bool,
    override_cluster_name: Optional[str],
    no_config_cache: bool = False,
    new: bool = False,
    port_forward: Optional[Port_forward] = None,
) -> None:
    """Attaches to a screen for the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        start: whether to start the cluster if it isn't up
        use_screen: whether to use screen as multiplexer
        use_tmux: whether to use tmux as multiplexer
        override_cluster_name: set the name of the cluster
        new: whether to force a new screen
        port_forward ( (int,int) or list[(int,int)] ): port(s) to forward
    """
    # Pick the shell command that realizes the requested multiplexer mode.
    if use_tmux:
        cmd = "tmux new" if new else "tmux attach || tmux new"
    elif use_screen:
        cmd = "screen -L" if new else "screen -L -xRR"
    elif new:
        raise ValueError("--new only makes sense if passing --screen or --tmux")
    else:
        cmd = "$SHELL"

    # Run the chosen command interactively on the head node. An
    # uninitialized head is permitted so a broken cluster can be debugged.
    exec_cluster(
        config_file,
        cmd=cmd,
        run_env="auto",
        screen=False,
        tmux=False,
        stop=False,
        start=start,
        override_cluster_name=override_cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward,
        _allow_uninitialized_state=True,
    )
1081
+
1082
+
1083
def exec_cluster(
    config_file: str,
    *,
    cmd: Optional[str] = None,
    run_env: str = "auto",
    screen: bool = False,
    tmux: bool = False,
    stop: bool = False,
    start: bool = False,
    override_cluster_name: Optional[str] = None,
    no_config_cache: bool = False,
    port_forward: Optional[Port_forward] = None,
    with_output: bool = False,
    _allow_uninitialized_state: bool = False,
    extra_screen_args: Optional[str] = None,
) -> str:
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        extra_screen_args: optional custom additional args to screen command
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward ( (int, int) or list[(int, int)] ): port(s) to forward
        _allow_uninitialized_state: whether to execute on an uninitialized head
            node.

    Returns:
        The result of the command execution (output when ``with_output``
        is requested).
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    # NOTE(review): open(...).read() leaves the file handle to be closed by
    # garbage collection rather than a context manager.
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # Find (or, when `start` is set, create) the head node to execute on.
    head_node = _get_running_head_node(
        config,
        config_file,
        override_cluster_name,
        create_if_needed=start,
        _allow_uninitialized_state=_allow_uninitialized_state,
    )

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    # The updater is used only as a handle to the node's command runner;
    # no setup/start commands are executed here (all command lists empty).
    updater = NodeUpdaterThread(
        node_id=head_node,
        provider_config=config["provider"],
        provider=provider,
        auth_config=config["auth"],
        cluster_name=config["cluster_name"],
        file_mounts=config["file_mounts"],
        initialization_commands=[],
        setup_commands=[],
        ray_start_commands=[],
        runtime_hash="",
        file_mounts_contents_hash="",
        is_head_node=True,
        rsync_options={
            "rsync_exclude": config.get("rsync_exclude"),
            "rsync_filter": config.get("rsync_filter"),
        },
        docker_config=config.get("docker"),
    )
    if cmd and stop:
        # Append teardown steps: stop Ray, tear down the workers, then shut
        # the head machine itself down.
        cmd = "; ".join(
            [
                cmd,
                "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only",
                "sudo shutdown -h now",
            ]
        )

    result = _exec(
        updater,
        cmd,
        screen,
        tmux,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        shutdown_after_run=False,
        extra_screen_args=extra_screen_args,
    )
    if tmux or screen:
        # The command was detached into a multiplexer; print a hint for
        # reattaching to check on it.
        attach_command_parts = ["ray attach", config_file]
        if override_cluster_name is not None:
            attach_command_parts.append(
                "--cluster-name={}".format(override_cluster_name)
            )
        if tmux:
            attach_command_parts.append("--tmux")
        elif screen:
            attach_command_parts.append("--screen")

        attach_command = " ".join(attach_command_parts)
        cli_logger.print("Run `{}` to check command status.", cf.bold(attach_command))
    return result
1191
+
1192
+
1193
+ def _exec(
1194
+ updater: NodeUpdaterThread,
1195
+ cmd: Optional[str] = None,
1196
+ screen: bool = False,
1197
+ tmux: bool = False,
1198
+ port_forward: Optional[Port_forward] = None,
1199
+ with_output: bool = False,
1200
+ run_env: str = "auto",
1201
+ shutdown_after_run: bool = False,
1202
+ extra_screen_args: Optional[str] = None,
1203
+ ) -> str:
1204
+ if cmd:
1205
+ if screen:
1206
+ wrapped_cmd = [
1207
+ "screen",
1208
+ "-L",
1209
+ "-dm",
1210
+ ]
1211
+
1212
+ if extra_screen_args is not None and len(extra_screen_args) > 0:
1213
+ wrapped_cmd += [extra_screen_args]
1214
+
1215
+ wrapped_cmd += [
1216
+ "bash",
1217
+ "-c",
1218
+ quote(cmd + "; exec bash"),
1219
+ ]
1220
+ cmd = " ".join(wrapped_cmd)
1221
+ elif tmux:
1222
+ # TODO: Consider providing named session functionality
1223
+ wrapped_cmd = [
1224
+ "tmux",
1225
+ "new",
1226
+ "-d",
1227
+ "bash",
1228
+ "-c",
1229
+ quote(cmd + "; exec bash"),
1230
+ ]
1231
+ cmd = " ".join(wrapped_cmd)
1232
+ return updater.cmd_runner.run(
1233
+ cmd,
1234
+ exit_on_fail=True,
1235
+ port_forward=port_forward,
1236
+ with_output=with_output,
1237
+ run_env=run_env,
1238
+ shutdown_after_run=shutdown_after_run,
1239
+ )
1240
+
1241
+
1242
def rsync(
    config_file: str,
    source: Optional[str],
    target: Optional[str],
    override_cluster_name: Optional[str],
    down: bool,
    ip_address: Optional[str] = None,
    use_internal_ip: bool = False,
    no_config_cache: bool = False,
    all_nodes: bool = False,
    should_bootstrap: bool = True,
    _runner: ModuleType = subprocess,
) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address: Address of node. Raise Exception
            if both ip_address and 'all_nodes' are provided.
        use_internal_ip: Whether the provided ip_address is
            public or private.
        all_nodes: whether to sync worker nodes in addition to the head node
        should_bootstrap: whether to bootstrap cluster config before syncing
        _runner: process-runner module used by the updater (for testing).
    """
    # source and target must be given together or not at all; when neither
    # is given, the configured file_mounts are synced instead.
    if bool(source) != bool(target):
        cli_logger.abort("Expected either both a source and a target, or neither.")

    assert bool(source) == bool(
        target
    ), "Must either provide both or neither source and target."

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    # NOTE(review): open(...).read() relies on GC to close the handle.
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    if should_bootstrap:
        config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # A sync whose remote-side path lies under a configured file mount is
    # treated specially by the updater (docker bind-mount handling).
    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        # Build an updater purely as a handle to rsync_up/rsync_down;
        # no setup or start commands run here (all command lists empty).
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            use_internal_ip=use_internal_ip,
            process_runner=_runner,
            file_mounts_contents_hash="",
            is_head_node=is_head_node,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter"),
            },
            docker_config=config.get("docker"),
        )
        # Direction selects which updater method performs the transfer.
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            if cli_logger.verbosity > 0:
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)
            rsync(source, target, is_file_mount)
        else:
            # No explicit paths: sync all configured file mounts.
            updater.sync_file_mounts(rsync)

    nodes = []
    head_node = _get_running_head_node(
        config, config_file, override_cluster_name, create_if_needed=False
    )
    if ip_address:
        # Explicit node given: sync only that node.
        nodes = [provider.get_node_id(ip_address, use_internal_ip=use_internal_ip)]
    else:
        nodes = [head_node]
        if all_nodes:
            nodes.extend(_get_worker_nodes(config, override_cluster_name))

    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=(node_id == head_node))
1344
+
1345
+
1346
def get_head_node_ip(
    config_file: str, override_cluster_name: Optional[str] = None
) -> str:
    """Returns head node IP for given configuration file if exists.

    Args:
        config_file: Path to the cluster yaml.
        override_cluster_name: If given, overrides the cluster name from
            the config file.

    Returns:
        The head node's internal IP when the provider uses internal IPs
        (and ``use_external_head_ip`` is not set), otherwise its external IP.
    """
    # Fix: close the config file deterministically instead of relying on
    # garbage collection (previously `open(config_file).read()` leaked the
    # handle until GC).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_node = _get_running_head_node(config, config_file, override_cluster_name)
    provider_cfg = config.get("provider", {})
    # Get internal IP if using internal IPs and
    # use_external_head_ip is not specified
    if provider_cfg.get("use_internal_ips", False) and not provider_cfg.get(
        "use_external_head_ip", False
    ):
        head_node_ip = provider.internal_ip(head_node)
    else:
        head_node_ip = provider.external_ip(head_node)

    return head_node_ip
1368
+
1369
+
1370
def get_worker_node_ips(
    config_file: str, override_cluster_name: Optional[str] = None
) -> List[str]:
    """Returns worker node IPs for given configuration file.

    Args:
        config_file: Path to the cluster yaml.
        override_cluster_name: If given, overrides the cluster name from
            the config file.

    Returns:
        Internal IPs when the provider is configured with
        ``use_internal_ips``, external IPs otherwise.
    """
    # Fix: close the config file deterministically instead of relying on
    # garbage collection (previously `open(config_file).read()` leaked the
    # handle until GC).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    if config.get("provider", {}).get("use_internal_ips", False):
        return [provider.internal_ip(node) for node in nodes]
    else:
        return [provider.external_ip(node) for node in nodes]
1386
+
1387
+
1388
def _get_worker_nodes(
    config: Dict[str, Any], override_cluster_name: Optional[str]
) -> List[str]:
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    node_provider = _get_node_provider(config["provider"], config["cluster_name"])
    worker_filter = {TAG_RAY_NODE_KIND: NODE_KIND_WORKER}
    return node_provider.non_terminated_nodes(worker_filter)
1398
+
1399
+
1400
def _get_running_head_node(
    config: Dict[str, Any],
    printable_config_file: str,
    override_cluster_name: Optional[str],
    create_if_needed: bool = False,
    _provider: Optional[NodeProvider] = None,
    _allow_uninitialized_state: bool = False,
) -> str:
    """Get a valid, running head node.
    Args:
        config (Dict[str, Any]): Cluster Config dictionary
        printable_config_file: Used for printing formatted CLI commands.
        override_cluster_name: Passed to `get_or_create_head_node` to
            override the cluster name present in `config`.
        create_if_needed: Create a head node if one is not present.
        _provider: [For testing], a Node Provider to use.
        _allow_uninitialized_state: Whether to return a head node that
            is not 'UP TO DATE'. This is used to allow `ray attach` and
            `ray exec` to debug a cluster in a bad state.

    Returns:
        The node id of a running head node.

    Raises:
        RuntimeError: If no head node is found, creation was not requested,
            and no not-up-to-date fallback head is usable.
    """
    provider = _provider or _get_node_provider(
        config["provider"], config["cluster_name"]
    )
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    head_node = None
    # A head that exists but is not up-to-date; kept as a fallback that may
    # be returned when _allow_uninitialized_state is set.
    _backup_head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            _backup_head_node = node
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name,
        )
        # NOTE: `_allow_uninitialized_state` is forced to False if
        # `create_if_needed` is set to True. This is to ensure that the
        # commands executed after creation occur on an actually running
        # cluster.
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False,
            _allow_uninitialized_state=False,
        )
    else:
        if _allow_uninitialized_state and _backup_head_node is not None:
            cli_logger.warning(
                f"The head node being returned: {_backup_head_node} is not "
                "`up-to-date`. If you are not debugging a startup issue "
                "it is recommended to restart this head node with: {}",
                cf.bold(f" ray down {printable_config_file}"),
            )

            return _backup_head_node
        raise RuntimeError(
            "Head node of cluster ({}) not found!".format(config["cluster_name"])
        )
1473
+
1474
+
1475
def get_local_dump_archive(
    stream: bool = False,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Create a debug archive from data collected on the local node.

    Args:
        stream: Write the archive bytes to stdout (fd 1) and remove the
            temporary file. Mutually exclusive with ``output``.
        output: Target path for the archive. Defaults to the archive's
            basename inside the current working directory.
        logs: Collect Ray session log files.
        debug_state: Collect cluster debug state information.
        pip: Collect the list of installed Python packages.
        processes: Collect information on running Ray processes.
        processes_verbose: Collect verbose process information.
        tempfile: Optional path for the intermediate archive file.
            NOTE(review): this parameter shadows the stdlib ``tempfile``
            module inside this function body.

    Returns:
        The path of the created archive, or None when streaming to stdout.

    Raises:
        ValueError: If both ``stream`` and ``output`` are given.
    """
    if stream and output:
        raise ValueError(
            "You can only use either `--output` or `--stream`, but not both."
        )

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        get_all_local_data(archive, parameters)

    tmp = archive.file

    if stream:
        # Dump the raw archive bytes to stdout (fd 1), then clean up.
        with open(tmp, "rb") as fp:
            os.write(1, fp.read())
        os.remove(tmp)
        return None

    target = output or os.path.join(os.getcwd(), os.path.basename(tmp))
    shutil.move(tmp, target)
    cli_logger.print(f"Created local data archive at {target}")

    return target
1514
+
1515
+
1516
def get_cluster_dump_archive(
    cluster_config_file: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
    local: Optional[bool] = None,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Create a debug archive by collecting data from cluster nodes.

    Args:
        cluster_config_file: Cluster yaml used to derive hosts/credentials.
        host: Explicit host(s) to collect from, as an alternative to the
            cluster config.
        ssh_user: SSH user for connecting to the nodes.
        ssh_key: SSH key for connecting to the nodes.
        docker: Docker container name on the nodes.
        local: Whether to also collect data from the local machine. If None,
            inferred: local collection is enabled when no cluster config was
            given (see inline comment below).
        output: Target path for the archive; if None, a timestamped filename
            is created in the current working directory.
        logs: Collect Ray session log files.
        debug_state: Collect cluster debug state information.
        pip: Collect the list of installed Python packages.
        processes: Collect information on running Ray processes.
        processes_verbose: Collect verbose process information.
        tempfile: Optional path for the intermediate archive file.
            NOTE(review): this parameter shadows the stdlib ``tempfile``
            module inside this function body.

    Returns:
        The path of the created archive, or None if no nodes were found.
    """
    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += (
            " - The logfiles of your Ray session\n"
            "   This usually includes Python outputs (stdout/stderr)\n"
        )

    if debug_state:
        content_str += (
            " - Debug state information on your Ray cluster \n"
            "   e.g. number of workers, drivers, objects, etc.\n"
        )

    if pip:
        content_str += " - Your installed Python packages (`pip freeze`)\n"

    if processes:
        content_str += (
            " - Information on your running Ray processes\n"
            "   This includes command line arguments\n"
        )

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone."
    )

    # Parse arguments (e.g. fetch info from cluster config)
    (
        cluster_config_file,
        hosts,
        ssh_user,
        ssh_key,
        docker,
        cluster_name,
    ) = _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker)

    nodes = [
        Node(host=h, ssh_user=ssh_user, ssh_key=ssh_key, docker_container=docker)
        for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            "No nodes found. Specify with `--host` or by passing a ray "
            "cluster config to `--cluster`."
        )
        return None

    # With a cluster config, the first listed host is the head node.
    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )

    # Default output name: "<cluster>_<timestamp>.tar.gz" (or
    # "collected_logs_<timestamp>.tar.gz" when no cluster name is known).
    if not output:
        if cluster_name:
            filename = (
                f"{cluster_name}_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        else:
            filename = (
                f"collected_logs_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    shutil.move(archive.file, output)
    return output
1628
+
1629
+
1630
def confirm(msg: str, yes: bool) -> Optional[bool]:
    """Prompt the user for confirmation unless ``yes`` was given.

    Args:
        msg: Message shown in the interactive confirmation prompt.
        yes: When True, skip the prompt entirely.

    Returns:
        None when ``yes`` is set; otherwise the result of the click
        confirmation prompt (which aborts on a negative answer).
    """
    if yes:
        return None
    return click.confirm(msg, abort=True)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from ray._private.ray_constants import ( # noqa F401
5
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
6
+ DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
7
+ LABELS_ENVIRONMENT_VARIABLE,
8
+ LOGGER_FORMAT,
9
+ RESOURCES_ENVIRONMENT_VARIABLE,
10
+ )
11
+
12
+
13
def env_integer(key, default):
    """Read an integer configuration value from the environment.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset.

    Returns:
        ``sys.maxsize`` when the variable is set to the literal "inf",
        otherwise ``int`` of the variable's value, or ``default`` if unset.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    # "inf" is a sentinel meaning "effectively unlimited".
    return sys.maxsize if raw == "inf" else int(raw)
21
+
22
+
23
# Whether autoscaler cluster status logging is enabled. Set to 0 disable.
AUTOSCALER_STATUS_LOG = env_integer("RAY_ENABLE_CLUSTER_STATUS_LOG", 1)

# The name of the environment variable for plugging in a utilization scorer.
AUTOSCALER_UTILIZATION_SCORER_KEY = "RAY_AUTOSCALER_UTILIZATION_SCORER"

# Whether to avoid launching GPU nodes for CPU only tasks.
AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)

# How long to wait for a node to start and terminate, in seconds.
AUTOSCALER_NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
AUTOSCALER_NODE_TERMINATE_WAIT_S = env_integer("AUTOSCALER_NODE_TERMINATE_WAIT_S", 900)

# Interval at which to check if node SSH became available.
AUTOSCALER_NODE_SSH_INTERVAL_S = env_integer("AUTOSCALER_NODE_SSH_INTERVAL_S", 5)

# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)

# The maximum number of nodes to launch in a single request.
# Multiple requests may be made for this batch size, up to
# the limit of AUTOSCALER_MAX_CONCURRENT_LAUNCHES.
AUTOSCALER_MAX_LAUNCH_BATCH = env_integer("AUTOSCALER_MAX_LAUNCH_BATCH", 5)

# Max number of nodes to launch at a time.
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
    "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10
)

# Default upscaling speed for the autoscaler. This specifies how many nodes
# to request at a time, where the desired number to upscale is
# min(1, upscaling_speed * current_num_nodes)
# e.g. 1.0 means to request enough nodes to double
# the cluster size in each round of requests.
# When the upscaling speed is 0.0, the autoscaler will request 1 node.
DEFAULT_UPSCALING_SPEED = 0.0

# Interval at which to perform autoscaling updates.
AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)

# The autoscaler will attempt to restart Ray on nodes it hasn't heard from
# in more than this interval.
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30)
# The maximum number of nodes (including failed nodes) that the autoscaler will
# track for logging purposes.
AUTOSCALER_MAX_NODES_TRACKED = 1500

# Cap on how many individual failures are shown in status output.
AUTOSCALER_MAX_FAILURES_DISPLAYED = 20

# Maximum age (seconds) of a node-availability record before it is
# considered stale.
AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S = env_integer(
    "AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S", 30 * 60
)

# Whether per-node status is included in autoscaler status reports.
AUTOSCALER_REPORT_PER_NODE_STATUS = (
    env_integer("AUTOSCALER_REPORT_PER_NODE_STATUS", 1) == 1
)

# The maximum allowed resource demand vector size to guarantee the resource
# demand scheduler bin packing algorithm takes a reasonable amount of time
# to run.
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000

# Port that autoscaler prometheus metrics will be exported to
AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217)

# Max number of retries to AWS (default is 5, time increases exponentially)
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
# Max number of retries to create an EC2 node (retry different subnet)
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)

# ray home path in the container image
RAY_HOME = "/home/ray"

# The order of this list matters! `scripts.py` kills the ray processes in order of this
# list. Think twice when you add to this list.
# Invariants:
# RAYLET must be the first in the list.
# GCS SERVER must be the last in the list.
RAY_PROCESSES = [
    # The first element is the substring to filter.
    # The second element, if True, is to filter ps results by command name
    # (only the first 15 charactors of the executable name on Linux);
    # if False, is to filter ps results by command with all its arguments.
    # See STANDARD FORMAT SPECIFIERS section of
    # http://man7.org/linux/man-pages/man1/ps.1.html
    # about comm and args. This can help avoid killing non-ray processes.
    # Format:
    # Keyword to filter, filter by command (True)/filter by args (False)
    ["raylet", True],
    ["plasma_store", True],
    ["monitor.py", False],
    ["ray.util.client.server", False],
    ["default_worker.py", False],  # Python worker.
    ["setup_worker.py", False],  # Python environment setup worker.
    # For mac osx, setproctitle doesn't change the process name returned
    # by psutil but only cmdline.
    [
        "ray::",
        sys.platform != "darwin",
    ],  # Python worker. TODO(mehrdadn): Fix for Windows
    ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
    ["log_monitor.py", False],
    ["reporter.py", False],
    [os.path.join("dashboard", "agent.py"), False],
    [os.path.join("dashboard", "dashboard.py"), False],
    [os.path.join("runtime_env", "agent", "main.py"), False],
    ["ray_process_reaper.py", False],
    ["gcs_server", True],
]

# Max Concurrent SSH Calls to stop Docker
MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS", 50)

# Keys used in the provider config to toggle autoscaler behaviors.
DISABLE_NODE_UPDATERS_KEY = "disable_node_updaters"
DISABLE_LAUNCH_CONFIG_CHECK_KEY = "disable_launch_config_check"
FOREGROUND_NODE_LAUNCH_KEY = "foreground_node_launch"
WORKER_LIVENESS_CHECK_KEY = "worker_liveness_check"
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Dict
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+ try: # py3
7
+ from shlex import quote
8
+ except ImportError: # py2
9
+ from pipes import quote
10
+
11
+
12
+ def _check_docker_file_mounts(file_mounts: Dict[str, str]) -> None:
13
+ """Checks if files are passed as file_mounts. This is a problem for Docker
14
+ based clusters because when a file is bind-mounted in Docker, updates to
15
+ the file on the host do not always propagate to the container. Using
16
+ directories is recommended.
17
+ """
18
+ for remote, local in file_mounts.items():
19
+ if Path(local).is_file():
20
+ cli_logger.warning(
21
+ f"File Mount: ({remote}:{local}) refers to a file.\n To ensure"
22
+ " this mount updates properly, please use a directory."
23
+ )
24
+
25
+
26
def validate_docker_config(config: Dict[str, Any]) -> None:
    """Checks whether the Docker configuration is valid.

    A config without a "docker" section is trivially valid. Otherwise,
    whenever either a container name or an image is given, both must be
    given (the image may be a shared one, or separate head/worker images).
    """
    if "docker" not in config:
        return

    _check_docker_file_mounts(config.get("file_mounts", {}))

    docker_section = config["docker"]
    shared_image = docker_section.get("image")
    container_name = docker_section.get("container_name")

    # Head/worker images default to the shared image when unset.
    head_image = docker_section.get("head_image", shared_image)
    worker_image = docker_section.get("worker_image", shared_image)

    have_image = shared_image or (head_image and worker_image)
    if container_name or have_image:
        # Partial configuration is an error: both pieces must be present.
        assert container_name and have_image, "Must provide a container & image name"

    return None
47
+
48
+
49
def with_docker_exec(
    cmds, container_name, docker_cmd, env_vars=None, with_interactive=False
):
    """Wrap each command in ``cmds`` in a ``docker exec`` invocation.

    Args:
        cmds: List of shell command strings to run inside the container.
        container_name: Name of the target container.
        docker_cmd: Container binary to invoke ("docker" or "podman").
        env_vars: Optional iterable of environment variable names to
            forward from the host into the container (``-e NAME=$NAME``).
        with_interactive: If True, pass ``-it`` to allocate a TTY.

    Returns:
        List of fully-formed shell command strings, one per input command.
    """
    assert docker_cmd, "Must provide docker command"
    env_str = ""
    if env_vars:
        env_str = " ".join(["-e {env}=${env}".format(env=env) for env in env_vars])
    return [
        # Use the provided docker_cmd (e.g. "podman") rather than a
        # hard-coded "docker"; previously the parameter was asserted but
        # then ignored in the generated command line.
        "{docker_cmd} exec {interactive} {env} {container} /bin/bash -c {cmd} ".format(
            docker_cmd=docker_cmd,
            interactive="-it" if with_interactive else "",
            env=env_str,
            container=container_name,
            cmd=quote(cmd),
        )
        for cmd in cmds
    ]
65
+
66
+
67
+ def _check_helper(cname, template, docker_cmd):
68
+ return " ".join(
69
+ [docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true"]
70
+ )
71
+
72
+
73
def check_docker_running_cmd(cname, docker_cmd):
    """Shell command printing whether container ``cname`` is running."""
    return _check_helper(cname, ".State.Running", docker_cmd)
75
+
76
+
77
def check_bind_mounts_cmd(cname, docker_cmd):
    """Shell command printing the bind mounts of container ``cname`` as JSON."""
    return _check_helper(cname, "json .Mounts", docker_cmd)
79
+
80
+
81
def check_docker_image(cname, docker_cmd):
    """Shell command printing the image that container ``cname`` runs."""
    return _check_helper(cname, ".Config.Image", docker_cmd)
83
+
84
+
85
def docker_start_cmds(
    user,
    image,
    mount_dict,
    container_name,
    user_options,
    cluster_name,
    home_directory,
    docker_cmd,
):
    """Build the shell command that launches the Ray docker container.

    Mounts every destination in ``mount_dict`` from the cluster's host
    mount location, forwards locale env vars needed by the ray CLI, and
    starts a detached interactive ``bash`` in host networking mode.
    """
    # Imported here due to circular dependency.
    from ray.autoscaler.sdk import get_docker_host_mount_location

    host_prefix = get_docker_host_mount_location(cluster_name)
    # Map host-side mount path -> in-container destination.
    mounts = {f"{host_prefix}/{dst}": dst for dst in mount_dict}

    volume_args = []
    for src, dst in mounts.items():
        # "~/" in the destination is expanded to the container user's home.
        expanded = dst.replace("~/", home_directory + "/")
        volume_args.append("-v {src}:{dest}".format(src=src, dest=expanded))
    mount_flags = " ".join(volume_args)

    # for click, used in ray cli
    env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
    env_flags = " ".join(
        "-e {name}={val}".format(name=name, val=val) for name, val in env_vars.items()
    )

    docker_run = [
        docker_cmd,
        "run",
        "--rm",
        "--name {}".format(container_name),
        "-d",
        "-it",
        mount_flags,
        env_flags,
        " ".join(user_options),
        "--net=host",
        image,
        "bash",
    ]
    return " ".join(docker_run)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from threading import RLock
3
+ from typing import Any, Callable, Dict, List
4
+
5
+
6
class EventSummarizer:
    """Utility that aggregates related log messages to reduce log spam."""

    def __init__(self):
        # Aggregated quantity keyed by message template.
        self.events_by_key: Dict[str, int] = {}
        # Messages to send in next summary batch.
        self.messages_to_send: List[str] = []
        # Expiry timestamp per throttle key. While a key is present its
        # message is suppressed; it may be re-sent after the TTL passes.
        self.throttled_messages: Dict[str, float] = {}

        # Shared between the main thread and node launcher child threads.
        self.lock = RLock()

    def add(
        self, template: str, *, quantity: Any, aggregate: Callable[[Any, Any], Any]
    ) -> None:
        """Add a log message, which will be combined by template.

        Args:
            template: Format string with one placeholder for quantity.
            quantity: Quantity to aggregate.
            aggregate: Aggregation function used to combine the
                quantities. The result is inserted into the template to
                produce the final log message.
        """
        with self.lock:
            # Enforce proper sentence structure.
            if not template.endswith("."):
                template += "."
            if template in self.events_by_key:
                existing = self.events_by_key[template]
                self.events_by_key[template] = aggregate(existing, quantity)
            else:
                self.events_by_key[template] = quantity

    def add_once_per_interval(self, message: str, key: str, interval_s: int):
        """Add a log message, which is throttled once per interval by a key.

        Args:
            message: The message to log.
            key: The key to use to deduplicate the message.
            interval_s: Throttling interval in seconds.
        """
        with self.lock:
            if key in self.throttled_messages:
                return
            self.throttled_messages[key] = time.time() + interval_s
            self.messages_to_send.append(message)

    def summary(self) -> List[str]:
        """Generate the aggregated log summary of all added events."""
        with self.lock:
            rendered = [tpl.format(qty) for tpl, qty in self.events_by_key.items()]
            rendered.extend(self.messages_to_send)
            return rendered

    def clear(self) -> None:
        """Clear the events added."""
        with self.lock:
            self.events_by_key.clear()
            self.messages_to_send.clear()
            # Expire any messages that have reached their TTL so they can
            # be sent again.
            now = time.time()
            expired = [k for k, expiry in self.throttled_messages.items() if now > expiry]
            for k in expired:
                del self.throttled_messages[k]
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum, auto
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+
7
class CreateClusterEvent(Enum):
    """Stages of ``ray.autoscaler.sdk.create_or_update_cluster`` that can
    be tracked with callbacks.

    Attributes:
        up_started : Invoked at the beginning of create_or_update_cluster.
        ssh_keypair_downloaded : Invoked when the ssh keypair is downloaded.
        cluster_booting_started : Invoked when the cluster booting starts.
        acquiring_new_head_node : Invoked before the head node is acquired.
        head_node_acquired : Invoked after the head node is acquired.
        ssh_control_acquired : Invoked when the node is being updated.
        run_initialization_cmd : Invoked before all initialization
            commands are called and again before each initialization command.
        run_setup_cmd : Invoked before all setup commands are
            called and again before each setup command.
        start_ray_runtime : Invoked before ray start commands are run.
        start_ray_runtime_completed : Invoked after ray start commands
            are run.
        cluster_booting_completed : Invoked after cluster booting
            is completed.
    """

    up_started = auto()
    ssh_keypair_downloaded = auto()
    cluster_booting_started = auto()
    acquiring_new_head_node = auto()
    head_node_acquired = auto()
    ssh_control_acquired = auto()
    run_initialization_cmd = auto()
    run_setup_cmd = auto()
    start_ray_runtime = auto()
    start_ray_runtime_completed = auto()
    cluster_booting_completed = auto()
+
40
+
41
class _EventSystem:
    """Event system that handles storing and calling callbacks for events.

    Attributes:
        callback_map (Dict[str, List[Callable]]) : Stores list of callbacks
            for events when registered.
    """

    def __init__(self):
        self.callback_map = {}

    def add_callback_handler(
        self,
        event: str,
        callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
    ):
        """Stores callback handler for event.

        Args:
            event: Event that callback should be called on. See
                CreateClusterEvent for details on the events available to be
                registered against.
            callback (Callable[[Dict], None]): Callable object (or list of
                callables) invoked when the specified event occurs.
        """
        if event not in CreateClusterEvent.__members__.values():
            # Unknown events are registered anyway, but the user is warned.
            cli_logger.warning(
                f"{event} is not currently tracked, and this"
                " callback will not be invoked."
            )

        handlers = callback if type(callback) is list else [callback]
        self.callback_map.setdefault(event, []).extend(handlers)

    def execute_callback(
        self, event: CreateClusterEvent, event_data: Optional[Dict[str, Any]] = None
    ):
        """Executes all callbacks for event.

        Args:
            event: Event that is invoked. See CreateClusterEvent
                for details on the available events.
            event_data (Dict[str, Any]): Argument that is passed to each
                callable object stored for this particular event.
        """
        if event_data is None:
            event_data = {}

        # NOTE: the caller-supplied dict is mutated to carry the event name.
        event_data["event_name"] = event
        for callback in self.callback_map.get(event, []):
            callback(event_data)

    def clear_callbacks_for_event(self, event: str):
        """Clears stored callable objects for event.

        Args:
            event: Event that has callable objects stored in map.
                See CreateClusterEvent for details on the available events.
        """
        self.callback_map.pop(event, None)


# Module-level singleton used throughout the autoscaler SDK.
global_event_system = _EventSystem()
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from typing import Dict, List, Tuple
4
+
5
+ from ray.autoscaler._private.docker import with_docker_exec
6
+ from ray.autoscaler.command_runner import CommandRunnerInterface
7
+
8
+
9
class FakeDockerCommandRunner(CommandRunnerInterface):
    """Command runner for the fake docker multinode cluster.

    This command runner uses ``docker exec`` and ``docker cp`` to
    run commands and copy files, respectively.

    The regular ``DockerCommandRunner`` is made for use in SSH settings
    where Docker runs on a remote host. In contrast, this command runner
    does not wrap the docker commands in ssh calls.
    """

    def __init__(self, docker_config, **common_args):
        self.container_name = docker_config["container_name"]
        self.docker_config = docker_config
        # Lazily-resolved home directory inside the container
        # (see ``_docker_expand_user``).
        self.home_dir = None
        self.initialized = False
        # Optionally use 'podman' instead of 'docker'
        use_podman = docker_config.get("use_podman", False)
        self.docker_cmd = "podman" if use_podman else "docker"

    def _run_shell(self, cmd: str, timeout: int = 120) -> str:
        """Run ``cmd`` through the host shell and return its stdout."""
        return subprocess.check_output(
            cmd, shell=True, timeout=timeout, encoding="utf-8"
        )

    def run(
        self,
        cmd: str = None,
        timeout: int = 120,
        exit_on_fail: bool = False,
        port_forward: List[Tuple[int, int]] = None,
        with_output: bool = False,
        environment_variables: Dict[str, object] = None,
        run_env: str = "auto",
        ssh_options_override_ssh_key: str = "",
        shutdown_after_run: bool = False,
    ) -> str:
        """Run ``cmd`` inside the container via ``docker exec``."""
        prefix = with_docker_exec(
            [cmd],
            container_name=self.container_name,
            with_interactive=False,
            docker_cmd=self.docker_cmd,
        )[0]
        return self._run_shell(prefix)

    def run_init(
        self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
    ):
        # No container initialization is needed for the fake cluster.
        pass

    def remote_shell_command_str(self):
        """Return a command that opens an interactive container shell."""
        return "{} exec -it {} bash".format(self.docker_cmd, self.container_name)

    def run_rsync_down(self, source, target, options=None):
        """Copy ``source`` from the container down to ``target`` on the host."""
        docker_dir = os.path.dirname(self._docker_expand_user(source))

        # Fix: honor self.docker_cmd (e.g. podman) instead of a
        # hard-coded "docker" binary, consistent with run()/remote_shell.
        self._run_shell(
            f"{self.docker_cmd} cp {self.container_name}:{docker_dir} {target}"
        )

    def run_rsync_up(self, source, target, options=None):
        """Copy ``source`` from the host up into ``target`` in the container."""
        docker_dir = os.path.dirname(self._docker_expand_user(target))
        self.run(cmd=f"mkdir -p {docker_dir}")

        # Fix: honor self.docker_cmd instead of a hard-coded "docker".
        self._run_shell(
            f"{self.docker_cmd} cp {source} {self.container_name}:{docker_dir}"
        )

    def _docker_expand_user(self, string, any_char=False):
        """Expand a leading "~" using the container user's home directory.

        The home directory is resolved once via ``printenv HOME`` inside
        the container and cached on the instance.
        """
        user_pos = string.find("~")
        if user_pos > -1:
            if self.home_dir is None:
                # Fix: with_docker_exec returns a *list* of command strings,
                # while _run_shell expects a single command; take the first.
                self.home_dir = self._run_shell(
                    with_docker_exec(
                        ["printenv HOME"],
                        container_name=self.container_name,
                        docker_cmd=self.docker_cmd,
                    )[0]
                ).strip()

            if any_char:
                return string.replace("~/", self.home_dir + "/")

            elif not any_char and user_pos == 0:
                return string.replace("~", self.home_dir, 1)

        return string
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fake multinode docker monitoring script.
2
+
3
+ This script is the "docker compose server" for the fake_multinode
4
+ provider using Docker compose. It should be started before running
5
+ `RAY_FAKE_CLUSTER=1 ray up <cluster_config>`.
6
+
7
+ This script reads the volume directory from a supplied fake multinode
8
+ docker cluster config file.
9
+ It then waits until a docker-compose.yaml file is created in the same
10
+ directory, which is done by the `ray up` command.
11
+
12
+ It then watches for changes in the docker-compose.yaml file and runs
13
+ `docker compose up` whenever changes are detected. This will start docker
14
+ containers as requested by the autoscaler.
15
+
16
+ Generally, the docker-compose.yaml will be mounted in the head node of the
17
+ cluster, which will then continue to change it according to the autoscaler
18
+ requirements.
19
+
20
+ Additionally, this script monitors the docker container status using
21
+ `docker status` and writes it into a `status.json`. This information is
22
+ again used by the autoscaler to determine if any nodes have died.
23
+ """
24
+ import argparse
25
+ import json
26
+ import os
27
+ import shutil
28
+ import subprocess
29
+ import time
30
+ from typing import Any, Dict, List, Optional
31
+
32
+ import yaml
33
+
34
+
35
+ def _read_yaml(path: str):
36
+ with open(path, "rt") as f:
37
+ return yaml.safe_load(f)
38
+
39
+
40
def _update_docker_compose(
    docker_compose_path: str, project_name: str, status: Optional[Dict[str, Any]]
) -> bool:
    """Apply the current compose file to the cluster via ``docker compose``.

    Runs ``docker compose up -d`` for the given compose file/project, or
    ``docker compose down`` when the file no longer lists any services.

    Args:
        docker_compose_path: Path to the docker-compose.yaml file.
        project_name: Docker compose project name (``-p``).
        status: Last known container status map. When non-empty,
            ``--no-recreate`` is passed so running containers are kept.

    Returns:
        True if the cluster was shut down (``down`` was executed),
        False otherwise (including when the compose file is empty).
    """
    docker_compose_config = _read_yaml(docker_compose_path)

    if not docker_compose_config:
        print("Docker compose currently empty")
        return False

    cmd = ["up", "-d"]
    if status and len(status) > 0:
        # Containers already exist; don't recreate them for config changes.
        cmd += ["--no-recreate"]

    shutdown = False
    if not docker_compose_config["services"]:
        # If no more nodes, run `down` instead of `up`
        print("Shutting down nodes")
        cmd = ["down"]
        shutdown = True
    try:
        subprocess.check_call(
            ["docker", "compose", "-f", docker_compose_path, "-p", project_name]
            + cmd
            + [
                "--remove-orphans",
            ]
        )
    except Exception as e:
        # Best effort: a transient compose failure is retried on the next
        # config change; the monitoring loop must keep running.
        print(f"Ran into error when updating docker compose: {e}")
        # Ignore error

    return shutdown
72
+
73
+
74
def _get_ip(
    project_name: str,
    container_name: str,
    override_network: Optional[str] = None,
    retry_times: int = 3,
) -> Optional[str]:
    """Return the IP address of ``container_name`` on the compose network.

    Uses ``docker inspect`` and retries up to ``retry_times`` times with a
    one second pause between attempts (the container may still be coming up).

    Args:
        project_name: Compose project name; the default network is
            ``<project_name>_ray_local``.
        container_name: Docker container to inspect.
        override_network: Use this network name instead of the default.
        retry_times: Number of inspect attempts before giving up.

    Returns:
        The IP address string, or None if it could not be determined.
    """
    network = override_network or f"{project_name}_ray_local"

    cmd = [
        "docker",
        "inspect",
        "-f",
        '"{{ .NetworkSettings.Networks' f".{network}.IPAddress" ' }}"',
        f"{container_name}",
    ]
    for i in range(retry_times):
        try:
            ip_address = subprocess.check_output(cmd, encoding="utf-8")
        except Exception:
            # Inspect can fail while the container is starting; retry.
            time.sleep(1)
        else:
            # Strip whitespace and the surrounding quotes emitted by the
            # inspect template.
            return ip_address.strip().strip('"').strip('\\"')
    return None
97
+
98
+
99
def _update_docker_status(
    docker_compose_path: str, project_name: str, docker_status_path: str
):
    """Fetch container states via ``docker compose ps`` and persist them.

    The per-service status (including the container IP for running
    containers) is written as JSON to ``docker_status_path``, where the
    autoscaler reads it to detect dead nodes.

    Returns:
        The status dict keyed by compose service name, or None on error.
    """
    data_str = ""
    try:
        # `ps --format json` emits one JSON object per line.
        data_str = (
            subprocess.check_output(
                [
                    "docker",
                    "compose",
                    "-f",
                    docker_compose_path,
                    "-p",
                    project_name,
                    "ps",
                    "--format",
                    "json",
                ]
            )
            .decode("utf-8")
            .strip()
            .split("\n")
        )
        data: List[Dict[str, str]] = []
        for line in data_str:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    except Exception as e:
        print(f"Ran into error when fetching status: {e}")
        print(f"docker compose ps output: {data_str}")
        return None

    status = {}
    for container in data:
        node_id = container["Service"]
        container_name = container["Name"]
        if container["State"] == "running":
            ip = _get_ip(project_name, container_name)
        else:
            # Stopped containers report an empty IP.
            ip = ""
        container["IP"] = ip
        status[node_id] = container

    with open(docker_status_path, "wt") as f:
        json.dump(status, f)

    return status
147
+
148
+
149
def monitor_docker(
    docker_compose_path: str,
    status_path: str,
    project_name: str,
    update_interval: float = 1.0,
):
    """Watch the compose file and keep the cluster and status file in sync.

    Blocks until ``docker_compose_path`` exists, then loops: re-applies the
    compose file whenever its contents change, and refreshes the container
    status file every ``update_interval`` seconds. Returns once the cluster
    has been shut down (compose file lists no services).
    """
    while not os.path.exists(docker_compose_path):
        # Wait until cluster is created
        time.sleep(0.5)

    print("Docker compose config detected, starting status monitoring")

    # Make sure this is always writeable from inside the containers
    os.chmod(docker_compose_path, 0o777)

    # Sentinel that never equals a real compose config, forcing an initial
    # `docker compose up` on the first loop iteration.
    docker_config = {"force_update": True}

    # Force update
    next_update = time.monotonic() - 1.0
    shutdown = False
    status = None

    # Loop:
    # If the config changed, update cluster.
    # Every `update_interval` seconds, update docker status.
    while not shutdown:
        new_docker_config = _read_yaml(docker_compose_path)
        if new_docker_config != docker_config:
            # Update cluster
            shutdown = _update_docker_compose(docker_compose_path, project_name, status)

            # Force status update
            next_update = time.monotonic() - 1.0

        if time.monotonic() > next_update:
            # Update docker status
            status = _update_docker_status(
                docker_compose_path, project_name, status_path
            )
            next_update = time.monotonic() + update_interval

        docker_config = new_docker_config
        time.sleep(0.1)

    print("Cluster shut down, terminating monitoring script.")
194
+
195
+
196
def start_monitor(config_file: str):
    """Prepare the shared volume directory and run the monitoring loop.

    Reads the fake multinode docker cluster config, copies it into the
    shared volume directory as the bootstrap config, removes any stale
    docker-compose/status files left over from previous runs, and then
    blocks inside ``monitor_docker``.

    Args:
        config_file: Path to a cluster config whose provider type is
            ``fake_multinode_docker``.
    """
    cluster_config = _read_yaml(config_file)

    provider_config = cluster_config["provider"]
    assert provider_config["type"] == "fake_multinode_docker", (
        f"The docker monitor only works with providers of type "
        f"`fake_multinode_docker`, got `{provider_config['type']}`"
    )

    project_name = provider_config["project_name"]

    volume_dir = provider_config["shared_volume_dir"]
    os.makedirs(volume_dir, mode=0o755, exist_ok=True)

    # Create bootstrap config
    bootstrap_config_path = os.path.join(volume_dir, "bootstrap_config.yaml")
    shutil.copy(config_file, bootstrap_config_path)

    # These two files usually don't exist, yet
    docker_compose_config_path = os.path.join(volume_dir, "docker-compose.yaml")

    docker_status_path = os.path.join(volume_dir, "status.json")

    if os.path.exists(docker_compose_config_path):
        # We wait until this file exists, so remove it if it exists
        # from a previous run.
        os.remove(docker_compose_config_path)

    if os.path.exists(docker_status_path):
        os.remove(docker_status_path)
    # Create empty file so it can be mounted
    with open(docker_status_path, "wt") as f:
        f.write("{}")

    print(
        f"Starting monitor process. Please start Ray cluster with:\n"
        f" RAY_FAKE_CLUSTER=1 ray up {config_file}"
    )
    monitor_docker(docker_compose_config_path, docker_status_path, project_name)
235
+
236
+
237
if __name__ == "__main__":
    # CLI entry point: takes the cluster config path as the sole argument
    # and blocks in the monitoring loop until the cluster shuts down.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "config_file",
        help="Path to cluster config file containing a fake docker "
        "cluster configuration.",
    )
    args = parser.parse_args()

    start_monitor(args.config_file)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ import threading
10
+ import time
11
+ from typing import Any, Dict, Optional
12
+
13
+ import yaml
14
+
15
+ import ray
16
+ from ray._private.dict import deep_update
17
+ from ray.autoscaler._private.fake_multi_node.node_provider import (
18
+ FAKE_DOCKER_DEFAULT_CLIENT_PORT,
19
+ FAKE_DOCKER_DEFAULT_GCS_PORT,
20
+ )
21
+ from ray.util.queue import Empty, Queue
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ DEFAULT_DOCKER_IMAGE = "rayproject/ray:nightly-py{major}{minor}-cpu"
26
+
27
+
28
+ class ResourcesNotReadyError(RuntimeError):
29
+ pass
30
+
31
+
32
class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self._base_config_file = os.path.join(
            os.path.dirname(__file__), "example_docker.yaml"
        )
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(
            os.path.dirname(__file__), "docker_monitor.py"
        )
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        """Path to the written cluster.yaml config file."""
        return self._config_file

    @property
    def cluster_config(self):
        """The fully merged cluster config dict."""
        return self._cluster_config

    @property
    def cluster_dir(self):
        """Temporary directory shared with the cluster containers."""
        return self._tempdir

    @property
    def gcs_port(self):
        """Host port forwarded to the head node GCS."""
        return self._cluster_config.get("provider", {}).get(
            "host_gcs_port", FAKE_DOCKER_DEFAULT_GCS_PORT
        )

    @property
    def client_port(self):
        """Host port forwarded to the head node Ray client server."""
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT
        )

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.

        Raises:
            RuntimeError: If the cluster could not be reached in time.
        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                # Cluster still booting; retry until the deadline.
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            # Host-side worker loop: consume commands from the shared queue
            # until the cluster is stopped (see ``stop()``).
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue

                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()

        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        Raises:
            ResourcesNotReadyError: If the resources did not become
                available within the timeout.
        """
        # Use a separate name for the deadline instead of shadowing the
        # `timeout` parameter.
        deadline = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > deadline:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}"
                )
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current
                config with.
        """
        assert self._tempdir, "Call setup() first"

        config = config or {}

        if config:
            self._partial_config = config

        if not config.get("provider", {}).get("image"):
            # No image specified, trying to parse from buildkite
            docker_image = os.environ.get("RAY_DOCKER_IMAGE", None)

            if not docker_image:
                # If still no docker image, use one according to Python version
                mj = sys.version_info.major
                mi = sys.version_info.minor

                docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi)

            self._docker_image = docker_image

        with open(self._base_config_file, "rt") as f:
            cluster_config = yaml.safe_load(f)

        if self._partial_config:
            deep_update(cluster_config, self._partial_config, new_keys_allowed=True)

        if self._docker_image:
            cluster_config["provider"]["image"] = self._docker_image

        cluster_config["provider"]["shared_volume_dir"] = self._tempdir

        self._cluster_config = cluster_config

        with open(self._config_file, "wt") as f:
            yaml.safe_dump(self._cluster_config, f)

        logging.info(f"Updated cluster config to: {self._cluster_config}")

    def maybe_pull_image(self):
        """Pull the configured docker image if it is not present locally."""
        if self._docker_image:
            try:
                images_str = subprocess.check_output(
                    f"docker image inspect {self._docker_image}", shell=True
                )
                images = json.loads(images_str)
            except Exception as e:
                logger.error(f"Error inspecting image {self._docker_image}: {e}")
                return

            if not images:
                try:
                    subprocess.check_call(
                        f"docker pull {self._docker_image}", shell=True
                    )
                except Exception as e:
                    logger.error(f"Error pulling image {self._docker_image}: {e}")

    def setup(self):
        """Setup docker compose cluster environment.

        Creates the temporary directory, writes the initial config file,
        and pulls the docker image, if required.
        """
        self._tempdir = tempfile.mkdtemp(dir=os.environ.get("RAY_TEMPDIR", None))
        # Must be writeable from inside the containers.
        os.chmod(self._tempdir, 0o777)
        self._config_file = os.path.join(self._tempdir, "cluster.yaml")
        self._nodes_file = os.path.join(self._tempdir, "nodes.json")
        self._status_file = os.path.join(self._tempdir, "status.json")
        self.update_config()
        self.maybe_pull_image()

    def teardown(self, keep_dir: bool = False):
        """Tear down docker compose cluster environment.

        Args:
            keep_dir: If True, cluster directory
                will not be removed after termination.
        """
        if not keep_dir:
            shutil.rmtree(self._tempdir)
        self._tempdir = None
        self._config_file = None

    def _start_monitor(self):
        # Launch the docker compose monitor script in the background; it
        # reacts to docker-compose.yaml changes written by `ray up`.
        self._monitor_process = subprocess.Popen(
            [sys.executable, self._monitor_script, self.config_file]
        )
        time.sleep(2)

    def _stop_monitor(self):
        if self._monitor_process:
            try:
                self._monitor_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                # Fix: Popen.wait() raises on timeout instead of returning,
                # so the previous poll()-then-terminate() path was
                # unreachable; terminate explicitly when the monitor did not
                # exit on its own.
                self._monitor_process.terminate()
            self._monitor_process = None

    def start(self):
        """Start docker compose cluster.

        Starts the monitor process and runs ``ray up``.
        """
        self._start_monitor()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True
        )

    def stop(self):
        """Stop docker compose cluster.

        Runs ``ray down`` and stops the monitor process.
        """
        # Fix: ``ray.is_initialized`` is a function; referencing it without
        # calling made this condition always truthy.
        if ray.is_initialized():
            ray.shutdown()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True
        )

        self._stop_monitor()
        self._execution_event.set()

    def _update_nodes(self):
        # Refresh the node map written by the fake multinode provider.
        with open(self._nodes_file, "rt") as f:
            self._nodes = json.load(f)

    def _update_status(self):
        # Refresh the container status map written by the docker monitor.
        with open(self._status_file, "rt") as f:
            self._status = json.load(f)

    def _get_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ) -> str:
        """Resolve exactly one of ``node_id``/``num``/``rand`` to a node ID."""
        self._update_nodes()
        if node_id:
            assert (
                not num and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
        elif num:
            assert (
                not node_id and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
            # Fake node IDs are a fixed prefix plus a zero-padded counter.
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (
                not node_id and not num
            ), "Only provide either `node_id`, `num`, or `random`."
            assert rand in [
                "worker",
                "any",
            ], "`random` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                # Exclude the head node (counter 00000).
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000"
                )
            # Else: any
            node_id = random.choice(choices)

        assert node_id in self._nodes, f"Node with ID {node_id} is not in active nodes."
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        """Return the docker container name for ``node_id``, if known."""
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None

        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct node_id from this number, and kill
        that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a random
        node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_call(f"docker kill {container}", shell=True)
+
372
+
373
class RemoteAPI:
    """Remote API to control cluster state from within cluster tasks.

    This API uses a Ray queue to interact with an execution thread on the
    host machine that will execute commands passed to the queue.

    Instances of this class can be serialized and passed to Ray remote actors
    to interact with cluster state (but they can also be used outside actors).

    The API subset is limited to specific commands.

    Args:
        queue: Ray queue to push command instructions to.
    """

    def __init__(self, queue: Queue):
        self._queue = queue

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Request that the host kill a node (see DockerCluster.kill_node)."""
        payload = {"node_id": node_id, "num": num, "rand": rand}
        self._queue.put(("kill_node", payload))
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS_LEGACY
4
+ from ray.experimental.internal_kv import _internal_kv_initialized, _internal_kv_put
5
+
6
+ """This file provides legacy support for the old info string in order to
7
+ ensure the dashboard's `api/cluster_status` does not break backwards
8
+ compatibility.
9
+ """
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def legacy_log_info_string(autoscaler, nodes):
    """Assemble the legacy cluster-status string, persist it, and log it.

    The string is written to the internal KV store under
    ``DEBUG_AUTOSCALING_STATUS_LEGACY`` (consumed by the dashboard's
    ``api/cluster_status``) and emitted at DEBUG level.
    """
    parts = [
        "Cluster status: ",
        info_string(autoscaler, nodes),
        "\n",
        autoscaler.load_metrics.info_string(),
        "\n",
        autoscaler.resource_demand_scheduler.debug_string(
            nodes,
            autoscaler.pending_launches.breakdown(),
            autoscaler.load_metrics.get_resource_utilization(),
        ),
    ]
    status = "".join(parts)
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS_LEGACY, status, overwrite=True)
    logger.debug(status)
28
+
29
+
30
def info_string(autoscaler, nodes):
    """Return a short node-count summary, e.g. ``"3 nodes (1 updating)"``."""
    annotations = []
    if autoscaler.updaters:
        annotations.append(" ({} updating)".format(len(autoscaler.updaters)))
    if autoscaler.num_failed_updates:
        annotations.append(
            " ({} failed to update)".format(len(autoscaler.num_failed_updates))
        )
    return "{} nodes{}".format(len(nodes), "".join(annotations))
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from collections import Counter
4
+ from functools import reduce
5
+ from typing import Dict, List
6
+
7
+ from ray._private.gcs_utils import PlacementGroupTableData
8
+ from ray.autoscaler._private.constants import (
9
+ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
10
+ AUTOSCALER_REPORT_PER_NODE_STATUS,
11
+ )
12
+ from ray.autoscaler._private.util import (
13
+ DictCount,
14
+ LoadMetricsSummary,
15
+ NodeIP,
16
+ ResourceDict,
17
+ )
18
+ from ray.core.generated.common_pb2 import PlacementStrategy
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def add_resources(dict1: Dict[str, float], dict2: Dict[str, float]) -> Dict[str, float]:
    """Sum two resource dictionaries key-wise.

    Keys present in only one input keep that input's value.

    Returns:
        dict: A new dictionary (both inputs remain unmodified).
    """
    total = dict(dict1)
    for key, amount in dict2.items():
        total[key] = total.get(key, 0) + amount
    return total
33
+
34
+
35
def freq_of_dicts(dicts: List[Dict], serializer=None, deserializer=dict) -> DictCount:
    """Count occurrences in a list of dictionaries (or other unhashables).

    Mutable structures cannot be Counter keys directly, so each item is
    first mapped through ``serializer`` to something hashable, counted,
    then mapped back through ``deserializer``.

    Args:
        dicts (List[D]): Items to count.
        serializer (D -> S): Produces a hashable key; defaults to a
            frozenset of the dict's KV pairs.
        deserializer (S -> U): Inverse mapping for the output; for the
            default serializer, ``dict`` restores the original mapping.

    Returns:
        List[Tuple[U, int]]: One (unique item, frequency) pair per
        distinct entry of ``dicts``.
    """
    if serializer is None:

        def serializer(d):  # noqa: E731 equivalent, as a nested def
            return frozenset(d.items())

    counts = Counter(serializer(d) for d in dicts)
    return [(deserializer(key), n) for key, n in counts.items()]
62
+
63
+
64
class LoadMetrics:
    """Container for cluster load metrics.

    Metrics here are updated from raylet heartbeats. The autoscaler
    queries these metrics to determine when to scale up, and which nodes
    can be removed.
    """

    def __init__(self):
        # ip -> time.time() of the most recent heartbeat (or setup).
        self.last_heartbeat_time_by_ip = {}
        # ip -> configured/total resources, e.g. {"CPU": 4, "GPU": 1}.
        self.static_resources_by_ip = {}
        # ip -> currently available resources (filled in update()).
        self.dynamic_resources_by_ip = {}
        # ip -> raylet id (bytes) as reported to update().
        self.raylet_id_by_ip = {}
        # Resource bundles queued but schedulable on some node type.
        self.waiting_bundles = []
        # Resource bundles that fit no node type.
        self.infeasible_bundles = []
        # Pending placement group table entries.
        self.pending_placement_groups = []
        # Explicit demands set via set_resource_requests().
        self.resource_requests = []
        # Set from the most recent update() call.
        self.cluster_full_of_actors_detected = False
        # ip -> last time.time() the node was observed non-idle.
        self.ray_nodes_last_used_time_by_ip = {}

    def __bool__(self):
        """A load metrics instance is Falsey iff the autoscaler process
        has not received a resource message from the GCS.
        """
        return bool(self.raylet_id_by_ip)

    def update(
        self,
        ip: str,
        raylet_id: bytes,
        static_resources: Dict[str, Dict],
        dynamic_resources: Dict[str, Dict],
        node_idle_duration_s: float,
        waiting_bundles: List[Dict[str, float]] = None,
        infeasible_bundles: List[Dict[str, float]] = None,
        pending_placement_groups: List[PlacementGroupTableData] = None,
        cluster_full_of_actors_detected: bool = False,
    ):
        """Record one resource report for the node at ``ip``.

        Per-node maps are updated for ``ip``; the bundle/placement-group
        lists are cluster-wide and replaced wholesale on every call.
        """
        self.static_resources_by_ip[ip] = static_resources
        self.raylet_id_by_ip[ip] = raylet_id
        self.cluster_full_of_actors_detected = cluster_full_of_actors_detected

        # Normalize None (and other falsey) inputs to fresh empty lists.
        if not waiting_bundles:
            waiting_bundles = []
        if not infeasible_bundles:
            infeasible_bundles = []
        if not pending_placement_groups:
            pending_placement_groups = []

        # We are not guaranteed to have a corresponding dynamic resource
        # for every static resource because dynamic resources are based on
        # the available resources in the heartbeat, which does not exist
        # if it is zero. Thus, we have to update dynamic resources here.
        dynamic_resources_update = dynamic_resources.copy()
        for resource_name, capacity in self.static_resources_by_ip[ip].items():
            if resource_name not in dynamic_resources_update:
                dynamic_resources_update[resource_name] = 0.0
        self.dynamic_resources_by_ip[ip] = dynamic_resources_update

        now = time.time()
        # Back-date "last used" by the reported idle duration.
        self.ray_nodes_last_used_time_by_ip[ip] = now - node_idle_duration_s
        self.last_heartbeat_time_by_ip[ip] = now
        self.waiting_bundles = waiting_bundles
        self.infeasible_bundles = infeasible_bundles
        self.pending_placement_groups = pending_placement_groups

    def mark_active(self, ip):
        """Treat a freshly-set-up node as having just heartbeated."""
        assert ip is not None, "IP should be known at this time"
        logger.debug("Node {} is newly setup, treating as active".format(ip))
        self.last_heartbeat_time_by_ip[ip] = time.time()

    def is_active(self, ip):
        """True iff a heartbeat has been recorded for ``ip``."""
        return ip in self.last_heartbeat_time_by_ip

    def prune_active_ips(self, active_ips: List[str]):
        """The Raylet ips stored by LoadMetrics are obtained by polling
        the GCS in Monitor.update_load_metrics().

        On the other hand, the autoscaler gets a list of node ips from
        its NodeProvider.

        This method removes from LoadMetrics the ips unknown to the autoscaler.

        Args:
            active_ips (List[str]): The node ips known to the autoscaler.
        """
        active_ips = set(active_ips)

        def prune(mapping, should_log):
            # Drop every entry whose ip is not in active_ips.
            unwanted_ips = set(mapping) - active_ips
            for unwanted_ip in unwanted_ips:
                if should_log:
                    logger.info("LoadMetrics: " f"Removed ip: {unwanted_ip}.")
                del mapping[unwanted_ip]
            if unwanted_ips and should_log:
                logger.info(
                    "LoadMetrics: "
                    "Removed {} stale ip mappings: {} not in {}".format(
                        len(unwanted_ips), unwanted_ips, active_ips
                    )
                )
            # Sanity check: everything flagged was actually removed.
            assert not (unwanted_ips & set(mapping))

        # Only the first map logs, to avoid repeating the same message
        # for each of the parallel per-ip maps.
        prune(self.ray_nodes_last_used_time_by_ip, should_log=True)
        prune(self.static_resources_by_ip, should_log=False)
        prune(self.raylet_id_by_ip, should_log=False)
        prune(self.dynamic_resources_by_ip, should_log=False)
        prune(self.last_heartbeat_time_by_ip, should_log=False)

    def get_node_resources(self):
        """Return a list of node resources (static resource sizes).

        Example:
            >>> from ray.autoscaler._private.load_metrics import LoadMetrics
            >>> metrics = LoadMetrics(...) # doctest: +SKIP
            >>> metrics.get_node_resources() # doctest: +SKIP
            [{"CPU": 1}, {"CPU": 4, "GPU": 8}] # for two different nodes
        """
        return self.static_resources_by_ip.values()

    def get_static_node_resources_by_ip(self) -> Dict[NodeIP, ResourceDict]:
        """Return a dict of node resources for every node ip.

        Example:
            >>> from ray.autoscaler._private.load_metrics import LoadMetrics
            >>> metrics = LoadMetrics(...) # doctest: +SKIP
            >>> metrics.get_static_node_resources_by_ip() # doctest: +SKIP
            {127.0.0.1: {"CPU": 1}, 127.0.0.2: {"CPU": 4, "GPU": 8}}
        """
        return self.static_resources_by_ip

    def get_resource_utilization(self):
        """Return the ip -> available-resources map."""
        return self.dynamic_resources_by_ip

    def _get_resource_usage(self):
        """Aggregate (used, total) per resource across all nodes.

        Returns:
            Tuple of two dicts: resource name -> used amount, and
            resource name -> total amount.
        """
        resources_used = {}
        resources_total = {}
        for ip, max_resources in self.static_resources_by_ip.items():
            avail_resources = self.dynamic_resources_by_ip[ip]
            for resource_id, amount in max_resources.items():
                used = amount - avail_resources[resource_id]
                if resource_id not in resources_used:
                    resources_used[resource_id] = 0.0
                    resources_total[resource_id] = 0.0
                resources_used[resource_id] += used
                resources_total[resource_id] += amount
                # NOTE(review): this reassignment has no effect here — `used`
                # is not read again after the accumulation above. Possibly it
                # was meant to clamp `used` *before* adding; confirm intent.
                used = max(0, used)

        return resources_used, resources_total

    def get_resource_demand_vector(self, clip=True):
        """Return the queued + infeasible resource bundles.

        Args:
            clip: If True, bound each list to
                AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE entries.
        """
        if clip:
            # Bound the total number of bundles to
            # 2xMAX_RESOURCE_DEMAND_VECTOR_SIZE. This guarantees the resource
            # demand scheduler bin packing algorithm takes a reasonable amount
            # of time to run.
            return (
                self.waiting_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
                + self.infeasible_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
            )
        else:
            return self.waiting_bundles + self.infeasible_bundles

    def get_resource_requests(self):
        """Return demands set via set_resource_requests()."""
        return self.resource_requests

    def get_pending_placement_groups(self):
        """Return the pending placement group table entries."""
        return self.pending_placement_groups

    def resources_avail_summary(self) -> str:
        """Return a concise string of cluster size to report to event logs.

        For example, "3 CPUs, 4 GPUs".
        """
        total_resources = (
            reduce(add_resources, self.static_resources_by_ip.values())
            if self.static_resources_by_ip
            else {}
        )
        out = "{} CPUs".format(int(total_resources.get("CPU", 0)))
        if "GPU" in total_resources:
            out += ", {} GPUs".format(int(total_resources["GPU"]))
        if "TPU" in total_resources:
            out += ", {} TPUs".format(int(total_resources["TPU"]))
        return out

    def summary(self):
        """Build a LoadMetricsSummary snapshot of usage and demand."""
        available_resources = (
            reduce(add_resources, self.dynamic_resources_by_ip.values())
            if self.dynamic_resources_by_ip
            else {}
        )
        total_resources = (
            reduce(add_resources, self.static_resources_by_ip.values())
            if self.static_resources_by_ip
            else {}
        )
        usage_dict = {}
        for key in total_resources:
            if key in ["memory", "object_store_memory"]:
                total = total_resources[key]
                # NOTE(review): direct indexing assumes memory keys always
                # appear in available_resources when present in totals —
                # confirm against the heartbeat producer.
                available = available_resources[key]
                usage_dict[key] = (total - available, total)
            else:
                total = total_resources[key]
                usage_dict[key] = (total - available_resources[key], total)

        summarized_demand_vector = freq_of_dicts(
            self.get_resource_demand_vector(clip=False)
        )
        summarized_resource_requests = freq_of_dicts(self.get_resource_requests())

        def placement_group_serializer(pg):
            # Hashable key: per-bundle resource sets plus the PG strategy.
            bundles = tuple(
                frozenset(bundle.unit_resources.items()) for bundle in pg.bundles
            )
            return (bundles, pg.strategy)

        def placement_group_deserializer(pg_tuple):
            # We marshal this as a dictionary so that we can easily json.dumps
            # it later.
            # TODO (Alex): Would there be a benefit to properly
            # marshalling this (into a protobuf)?
            bundles = list(map(dict, pg_tuple[0]))
            return {
                "bundles": freq_of_dicts(bundles),
                "strategy": PlacementStrategy.Name(pg_tuple[1]),
            }

        summarized_placement_groups = freq_of_dicts(
            self.get_pending_placement_groups(),
            serializer=placement_group_serializer,
            deserializer=placement_group_deserializer,
        )
        nodes_summary = freq_of_dicts(self.static_resources_by_ip.values())

        usage_by_node = None
        if AUTOSCALER_REPORT_PER_NODE_STATUS:
            # Optional per-node (used, total) breakdown.
            usage_by_node = {}
            for ip, totals in self.static_resources_by_ip.items():
                available = self.dynamic_resources_by_ip.get(ip, {})
                usage_by_node[ip] = {}
                for resource, total in totals.items():
                    usage_by_node[ip][resource] = (
                        total - available.get(resource, 0),
                        total,
                    )

        return LoadMetricsSummary(
            usage=usage_dict,
            resource_demand=summarized_demand_vector,
            pg_demand=summarized_placement_groups,
            request_demand=summarized_resource_requests,
            node_types=nodes_summary,
            usage_by_node=usage_by_node,
        )

    def set_resource_requests(self, requested_resources):
        """Replace the explicit resource requests, dropping empty dicts."""
        if requested_resources is not None:
            assert isinstance(requested_resources, list), requested_resources
            self.resource_requests = [
                request for request in requested_resources if len(request) > 0
            ]

    def info_string(self):
        """Return a multi-line, human-readable dump of _info()."""
        return " - " + "\n - ".join(
            ["{}: {}".format(k, v) for k, v in sorted(self._info().items())]
        )

    def _info(self):
        """Collect usage, idleness, and heartbeat-lag stats as a dict."""
        resources_used, resources_total = self._get_resource_usage()

        now = time.time()
        idle_times = [now - t for t in self.ray_nodes_last_used_time_by_ip.values()]
        heartbeat_times = [now - t for t in self.last_heartbeat_time_by_ip.values()]
        # Up to five nodes with the oldest heartbeats, as ip -> lag seconds.
        most_delayed_heartbeats = sorted(
            self.last_heartbeat_time_by_ip.items(), key=lambda pair: pair[1]
        )[:5]
        most_delayed_heartbeats = {ip: (now - t) for ip, t in most_delayed_heartbeats}

        def format_resource(key, value):
            # Memory-like resources are reported in GiB; others as raw counts.
            if key in ["object_store_memory", "memory"]:
                return "{} GiB".format(round(value / (1024 * 1024 * 1024), 2))
            else:
                return round(value, 2)

        return {
            "ResourceUsage": ", ".join(
                [
                    "{}/{} {}".format(
                        format_resource(rid, resources_used[rid]),
                        format_resource(rid, resources_total[rid]),
                        rid,
                    )
                    for rid in sorted(resources_used)
                    if not rid.startswith("node:")
                ]
            ),
            "NodeIdleSeconds": "Min={} Mean={} Max={}".format(
                int(min(idle_times)) if idle_times else -1,
                int(float(sum(idle_times)) / len(idle_times)) if idle_times else -1,
                int(max(idle_times)) if idle_times else -1,
            ),
            "TimeSinceLastHeartbeat": "Min={} Mean={} Max={}".format(
                int(min(heartbeat_times)) if heartbeat_times else -1,
                int(float(sum(heartbeat_times)) / len(heartbeat_times))
                if heartbeat_times
                else -1,
                int(max(heartbeat_times)) if heartbeat_times else -1,
            ),
            "MostDelayedHeartbeats": most_delayed_heartbeats,
        }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+
3
+
4
def load_function_or_class(path):
    """Load a function or class at runtime given a full path.

    Example of the path: mypkg.mysubpkg.myclass
    """
    parts = path.split(".")
    if len(parts) < 2:
        raise ValueError("You need to pass a valid path like mymodule.provider_class")
    module = importlib.import_module(".".join(parts[:-1]))
    return getattr(module, parts[-1])
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc ADDED
Binary file (5.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc ADDED
Binary file (5.85 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (17.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ from ray._private.utils import get_ray_temp_dir
6
+ from ray.autoscaler._private.cli_logger import cli_logger
7
+
8
+ unsupported_field_message = "The field {} is not supported for on-premise clusters."
9
+
10
+ LOCAL_CLUSTER_NODE_TYPE = "local.cluster.node"
11
+
12
+
13
def prepare_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """Prepare a local (on-premise) cluster config for the cluster
    launcher and autoscaler.

    Rejects fields that have no meaning on-prem, installs the single
    local node type, then delegates to the coordinator or manual path.
    """
    config = copy.deepcopy(config)
    # These fields are meaningless for on-premise clusters; abort if set.
    for unsupported in ("head_node", "worker_nodes", "available_node_types"):
        if config.get(unsupported):
            cli_logger.abort(unsupported_field_message.format(unsupported))
    # We use a config with a single node type for on-prem clusters.
    # Resources internally detected by Ray are not overridden by the
    # autoscaler (see NodeProvider.do_update).
    config["available_node_types"] = {
        LOCAL_CLUSTER_NODE_TYPE: {"node_config": {}, "resources": {}}
    }
    config["head_node_type"] = LOCAL_CLUSTER_NODE_TYPE
    # Coordinator-managed vs. manually managed clusters diverge here.
    preparer = (
        prepare_coordinator
        if "coordinator_address" in config["provider"]
        else prepare_manual
    )
    return preparer(config)
35
+
36
+
37
def prepare_coordinator(config: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a coordinator-managed on-prem cluster config.

    Requires an explicit `max_workers` and moves the (now node-type
    scoped) `min_workers` into the single local node type.
    """
    config = copy.deepcopy(config)
    # The coordinator allocates nodes on demand, so the user must cap it.
    if "max_workers" not in config:
        cli_logger.abort(
            "The field `max_workers` is required when using an "
            "automatically managed on-premise cluster."
        )
    local_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # The autoscaler no longer uses a global `min_workers`; relocate it
    # into the node type (defaulting to 0).
    local_type["min_workers"] = config.pop("min_workers", 0)
    local_type["max_workers"] = config["max_workers"]
    return config
52
+
53
+
54
def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize configs of manually managed on-prem clusters.

    - Requires `head_ip` and `worker_ips` in the provider section.
    - Defaults min and max workers to the number of `worker_ips`.
    - Caps min and max workers at the number of `worker_ips`, warning
      when a user-supplied value is adjusted.
    - Records the final min/max in the single local worker node type.
    """
    config = copy.deepcopy(config)
    if ("worker_ips" not in config["provider"]) or (
        "head_ip" not in config["provider"]
    ):
        cli_logger.abort(
            "Please supply a `head_ip` and list of `worker_ips`. "
            "Alternatively, supply a `coordinator_address`."
        )
    num_ips = len(config["provider"]["worker_ips"])
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # Default to keeping all provided ips in the cluster.
    config.setdefault("max_workers", num_ips)

    # The autoscaler no longer uses a global `min_workers`; it lives on
    # the node type from here on.
    min_workers = config.pop("min_workers", num_ips)
    max_workers = config["max_workers"]

    if min_workers > num_ips:
        cli_logger.warning(
            f"The value of `min_workers` supplied ({min_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `min_workers={num_ips}`."
        )
    node_type["min_workers"] = min(min_workers, num_ips)

    if max_workers > num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `max_workers={num_ips}`."
        )
        config["max_workers"] = num_ips
    node_type["max_workers"] = min(max_workers, num_ips)

    if max_workers < num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is less"
            f" than the number of available worker ips ({num_ips})."
            f" At most {max_workers} Ray worker nodes will connect to the cluster."
        )

    return config
110
+
111
+
112
def get_lock_path(cluster_name: str) -> str:
    """Path of the inter-process lock file for this cluster's state."""
    return os.path.join(get_ray_temp_dir(), f"cluster-{cluster_name}.lock")
114
+
115
+
116
def get_state_path(cluster_name: str) -> str:
    """Path of the JSON state file recording this cluster's nodes."""
    return os.path.join(get_ray_temp_dir(), f"cluster-{cluster_name}.state")
118
+
119
+
120
def bootstrap_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """No-op bootstrap hook: local clusters need no extra config filling."""
    return config
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from http.client import RemoteDisconnected
4
+
5
+ from ray.autoscaler.node_provider import NodeProvider
6
+ from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class CoordinatorSenderNodeProvider(NodeProvider):
    """NodeProvider for automatically managed private/local clusters.

    The cluster management is handled by a remote coordinating server.
    The server listens on <coordinator_address>, therefore, the address
    should be provided in the provider section in the cluster config.
    The server receives HTTP requests from this class and uses
    LocalNodeProvider to get their responses.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # "host:port" of the coordinator server that executes node ops.
        self.coordinator_address = provider_config["coordinator_address"]

    def _get_http_response(self, request):
        """Send one JSON RPC to the coordinator and return the decoded JSON.

        ``request`` is a dict of the form {"type": <method name>,
        "args": <positional args tuple>}.
        """
        headers = {
            "Content-Type": "application/json",
        }
        request_message = json.dumps(request).encode()
        http_coordinator_address = "http://" + self.coordinator_address

        try:
            import requests  # `requests` is not part of stdlib.
            from requests.exceptions import ConnectionError

            # NOTE: a GET carrying a body — the coordinator server reads
            # the JSON payload regardless of HTTP verb.
            r = requests.get(
                http_coordinator_address,
                data=request_message,
                headers=headers,
                timeout=None,
            )
        except (RemoteDisconnected, ConnectionError):
            logger.exception(
                "Could not connect to: "
                + http_coordinator_address
                + ". Did you run python coordinator_server.py"
                + " --ips <list_of_node_ips> --port <PORT>?"
            )
            raise
        except ImportError:
            logger.exception(
                "Not all Ray Autoscaler dependencies were found. "
                "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
                'only be usable via `pip install "ray[default]"`. Please '
                "update your install command."
            )
            raise

        response = r.json()
        return response

    def non_terminated_nodes(self, tag_filters):
        """Return ids of live nodes matching ``tag_filters`` in this cluster."""
        # Only get the non terminated nodes associated with this cluster name.
        tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {"type": "non_terminated_nodes", "args": (tag_filters,)}
        return self._get_http_response(request)

    def is_running(self, node_id):
        """Return whether ``node_id`` is running, per the coordinator."""
        request = {"type": "is_running", "args": (node_id,)}
        return self._get_http_response(request)

    def is_terminated(self, node_id):
        """Return whether ``node_id`` is terminated, per the coordinator."""
        request = {"type": "is_terminated", "args": (node_id,)}
        return self._get_http_response(request)

    def node_tags(self, node_id):
        """Return the tag dict of ``node_id``."""
        request = {"type": "node_tags", "args": (node_id,)}
        return self._get_http_response(request)

    def external_ip(self, node_id):
        """Return the external ip of ``node_id``."""
        request = {"type": "external_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def internal_ip(self, node_id):
        """Return the internal ip of ``node_id``."""
        request = {"type": "internal_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def create_node(self, node_config, tags, count):
        """Ask the coordinator to allocate ``count`` nodes (returns None)."""
        # Tag the newly created node with this cluster name. Helps to get
        # the right nodes when calling non_terminated_nodes.
        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {
            "type": "create_node",
            "args": (node_config, tags, count),
        }
        self._get_http_response(request)

    def set_node_tags(self, node_id, tags):
        """Update the tags of ``node_id`` on the coordinator."""
        request = {"type": "set_node_tags", "args": (node_id, tags)}
        self._get_http_response(request)

    def terminate_node(self, node_id):
        """Release ``node_id`` back to the coordinator's pool."""
        request = {"type": "terminate_node", "args": (node_id,)}
        self._get_http_response(request)

    def terminate_nodes(self, node_ids):
        """Release multiple nodes back to the coordinator's pool."""
        request = {"type": "terminate_nodes", "args": (node_ids,)}
        self._get_http_response(request)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import socket
5
+ from threading import RLock
6
+
7
+ from filelock import FileLock
8
+
9
+ from ray.autoscaler._private.local.config import (
10
+ LOCAL_CLUSTER_NODE_TYPE,
11
+ bootstrap_local,
12
+ get_lock_path,
13
+ get_state_path,
14
+ )
15
+ from ray.autoscaler.node_provider import NodeProvider
16
+ from ray.autoscaler.tags import (
17
+ NODE_KIND_HEAD,
18
+ NODE_KIND_WORKER,
19
+ STATUS_UP_TO_DATE,
20
+ TAG_RAY_NODE_KIND,
21
+ TAG_RAY_NODE_NAME,
22
+ TAG_RAY_NODE_STATUS,
23
+ TAG_RAY_USER_NODE_TYPE,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ filelock_logger = logging.getLogger("filelock")
29
+ filelock_logger.setLevel(logging.WARNING)
30
+
31
+
32
class ClusterState:
    """Cluster-specific node state persisted as JSON on disk.

    The state maps node ip -> {"tags": ..., "state": ...} and is guarded
    by an in-process RLock plus a cross-process FileLock, since multiple
    autoscaler/CLI processes may touch the same cluster.
    """

    def __init__(self, lock_path, save_path, provider_config):
        # Thread-level guard; the FileLock below guards across processes.
        self.lock = RLock()
        os.makedirs(os.path.dirname(lock_path), exist_ok=True)
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                if os.path.exists(self.save_path):
                    # NOTE(review): file handle from open() is not explicitly
                    # closed here (relies on GC); same in get() below.
                    workers = json.loads(open(self.save_path).read())
                    head_config = workers.get(provider_config["head_ip"])
                    # If the configured head ip no longer matches the
                    # recorded head node, the saved state is stale.
                    if (
                        not head_config
                        or head_config.get("tags", {}).get(TAG_RAY_NODE_KIND)
                        != NODE_KIND_HEAD
                    ):
                        workers = {}
                        logger.info("Head IP changed - recreating cluster.")
                else:
                    workers = {}
                logger.info(
                    "ClusterState: Loaded cluster state: {}".format(list(workers))
                )
                # Ensure every configured worker ip has an entry; new ips
                # start out "terminated".
                for worker_ip in provider_config["worker_ips"]:
                    if worker_ip not in workers:
                        workers[worker_ip] = {
                            "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
                            "state": "terminated",
                        }
                    else:
                        assert (
                            workers[worker_ip]["tags"][TAG_RAY_NODE_KIND]
                            == NODE_KIND_WORKER
                        )
                if provider_config["head_ip"] not in workers:
                    workers[provider_config["head_ip"]] = {
                        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD},
                        "state": "terminated",
                    }
                else:
                    assert (
                        workers[provider_config["head_ip"]]["tags"][TAG_RAY_NODE_KIND]
                        == NODE_KIND_HEAD
                    )
                # Relevant when a user reduces the number of workers
                # without changing the headnode.
                list_of_node_ips = list(provider_config["worker_ips"])
                list_of_node_ips.append(provider_config["head_ip"])
                for worker_ip in list(workers):
                    if worker_ip not in list_of_node_ips:
                        del workers[worker_ip]

                # Set external head ip, if provided by user.
                # Necessary if calling `ray up` from outside the network.
                # Refer to LocalNodeProvider.external_ip function.
                external_head_ip = provider_config.get("external_head_ip")
                if external_head_ip:
                    head = workers[provider_config["head_ip"]]
                    head["external_ip"] = external_head_ip

                # Exactly the configured workers plus the head remain.
                assert len(workers) == len(provider_config["worker_ips"]) + 1
                with open(self.save_path, "w") as f:
                    logger.debug(
                        "ClusterState: Writing cluster state: {}".format(workers)
                    )
                    f.write(json.dumps(workers))

    def get(self):
        """Return the current {ip: info} mapping, re-read from disk."""
        with self.lock:
            with self.file_lock:
                workers = json.loads(open(self.save_path).read())
                return workers

    def put(self, worker_id, info):
        """Persist ``info`` (must contain "tags" and "state") for one node."""
        assert "tags" in info
        assert "state" in info
        with self.lock:
            with self.file_lock:
                workers = self.get()
                workers[worker_id] = info
                with open(self.save_path, "w") as f:
                    logger.info(
                        "ClusterState: "
                        "Writing cluster state: {}".format(list(workers))
                    )
                    f.write(json.dumps(workers))
119
+
120
+
121
class OnPremCoordinatorState(ClusterState):
    """Generates & updates the state file of CoordinatorSenderNodeProvider.

    Unlike ClusterState, which generates a cluster specific file with
    predefined head and worker ips, OnPremCoordinatorState overwrites
    ClusterState's __init__ function to generate and manage a unified
    file of the status of all the nodes for multiple clusters.
    """

    def __init__(self, lock_path, save_path, list_of_node_ips):
        # Same locking scheme as ClusterState: thread RLock + FileLock.
        self.lock = RLock()
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                if os.path.exists(self.save_path):
                    # NOTE(review): handle from open() is not explicitly
                    # closed (relies on GC), mirroring ClusterState.
                    nodes = json.loads(open(self.save_path).read())
                else:
                    nodes = {}
                logger.info(
                    "OnPremCoordinatorState: "
                    "Loaded on prem coordinator state: {}".format(nodes)
                )

                # Filter removed node ips.
                for node_ip in list(nodes):
                    if node_ip not in list_of_node_ips:
                        del nodes[node_ip]

                # New ips enter the pool untagged and "terminated"
                # (i.e. free for any cluster to claim).
                for node_ip in list_of_node_ips:
                    if node_ip not in nodes:
                        nodes[node_ip] = {
                            "tags": {},
                            "state": "terminated",
                        }
                assert len(nodes) == len(list_of_node_ips)
                with open(self.save_path, "w") as f:
                    logger.info(
                        "OnPremCoordinatorState: "
                        "Writing on prem coordinator state: {}".format(nodes)
                    )
                    f.write(json.dumps(nodes))
164
+
165
+
166
+ class LocalNodeProvider(NodeProvider):
167
+ """NodeProvider for private/local clusters.
168
+
169
+ `node_id` is overloaded to also be `node_ip` in this class.
170
+
171
+ When `cluster_name` is provided, it manages a single cluster in a cluster
172
+ specific state file. But when `cluster_name` is None, it manages multiple
173
+ clusters in a unified state file that requires each node to be tagged with
174
+ TAG_RAY_CLUSTER_NAME in create and non_terminated_nodes function calls to
175
+ associate each node with the right cluster.
176
+
177
+ The current use case of managing multiple clusters is by
178
+ OnPremCoordinatorServer which receives node provider HTTP requests
179
+ from CoordinatorSenderNodeProvider and uses LocalNodeProvider to get
180
+ the responses.
181
+ """
182
+
183
+ def __init__(self, provider_config, cluster_name):
184
+ NodeProvider.__init__(self, provider_config, cluster_name)
185
+
186
+ if cluster_name:
187
+ lock_path = get_lock_path(cluster_name)
188
+ state_path = get_state_path(cluster_name)
189
+ self.state = ClusterState(
190
+ lock_path,
191
+ state_path,
192
+ provider_config,
193
+ )
194
+ self.use_coordinator = False
195
+ else:
196
+ # LocalNodeProvider with a coordinator server.
197
+ self.state = OnPremCoordinatorState(
198
+ "/tmp/coordinator.lock",
199
+ "/tmp/coordinator.state",
200
+ provider_config["list_of_node_ips"],
201
+ )
202
+ self.use_coordinator = True
203
+
204
+ def non_terminated_nodes(self, tag_filters):
205
+ workers = self.state.get()
206
+ matching_ips = []
207
+ for worker_ip, info in workers.items():
208
+ if info["state"] == "terminated":
209
+ continue
210
+ ok = True
211
+ for k, v in tag_filters.items():
212
+ if info["tags"].get(k) != v:
213
+ ok = False
214
+ break
215
+ if ok:
216
+ matching_ips.append(worker_ip)
217
+ return matching_ips
218
+
219
+ def is_running(self, node_id):
220
+ return self.state.get()[node_id]["state"] == "running"
221
+
222
+ def is_terminated(self, node_id):
223
+ return not self.is_running(node_id)
224
+
225
+ def node_tags(self, node_id):
226
+ return self.state.get()[node_id]["tags"]
227
+
228
+ def external_ip(self, node_id):
229
+ """Returns an external ip if the user has supplied one.
230
+ Otherwise, use the same logic as internal_ip below.
231
+
232
+ This can be used to call ray up from outside the network, for example
233
+ if the Ray cluster exists in an AWS VPC and we're interacting with
234
+ the cluster from a laptop (where using an internal_ip will not work).
235
+
236
+ Useful for debugging the local node provider with cloud VMs."""
237
+
238
+ node_state = self.state.get()[node_id]
239
+ ext_ip = node_state.get("external_ip")
240
+ if ext_ip:
241
+ return ext_ip
242
+ else:
243
+ return socket.gethostbyname(node_id)
244
+
245
+ def internal_ip(self, node_id):
246
+ return socket.gethostbyname(node_id)
247
+
248
+ def set_node_tags(self, node_id, tags):
249
+ with self.state.file_lock:
250
+ info = self.state.get()[node_id]
251
+ info["tags"].update(tags)
252
+ self.state.put(node_id, info)
253
+
254
+ def create_node(self, node_config, tags, count):
255
+ """Creates min(count, currently available) nodes."""
256
+ node_type = tags[TAG_RAY_NODE_KIND]
257
+ with self.state.file_lock:
258
+ workers = self.state.get()
259
+ for node_id, info in workers.items():
260
+ if info["state"] == "terminated" and (
261
+ self.use_coordinator or info["tags"][TAG_RAY_NODE_KIND] == node_type
262
+ ):
263
+ info["tags"] = tags
264
+ info["state"] = "running"
265
+ self.state.put(node_id, info)
266
+ count = count - 1
267
+ if count == 0:
268
+ return
269
+
270
+ def terminate_node(self, node_id):
271
+ workers = self.state.get()
272
+ info = workers[node_id]
273
+ info["state"] = "terminated"
274
+ self.state.put(node_id, info)
275
+
276
+ @staticmethod
277
+ def bootstrap_config(cluster_config):
278
+ return bootstrap_local(cluster_config)
279
+
280
+
281
+ def record_local_head_state_if_needed(local_provider: LocalNodeProvider) -> None:
282
+ """This function is called on the Ray head from StandardAutoscaler.reset
283
+ to record the head node's own existence in the cluster state file.
284
+
285
+ This is necessary because `provider.create_node` in
286
+ `commands.get_or_create_head_node` records the head state on the
287
+ cluster-launching machine but not on the head.
288
+ """
289
+ head_ip = local_provider.provider_config["head_ip"]
290
+ cluster_name = local_provider.cluster_name
291
+ # If the head node is not marked as created in the cluster state file,
292
+ if head_ip not in local_provider.non_terminated_nodes({}):
293
+ # These tags are based on the ones in commands.get_or_create_head_node;
294
+ # keep in sync.
295
+ head_tags = {
296
+ TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
297
+ TAG_RAY_USER_NODE_TYPE: LOCAL_CLUSTER_NODE_TYPE,
298
+ TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name),
299
+ TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
300
+ }
301
+ # Mark the head node as created in the cluster state file.
302
+ local_provider.create_node(node_config={}, tags=head_tags, count=1)
303
+
304
+ assert head_ip in local_provider.non_terminated_nodes({})
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class LogTimer:
10
+ def __init__(self, message, show_status=False):
11
+ self._message = message
12
+ self._show_status = show_status
13
+
14
+ def __enter__(self):
15
+ self._start_time = datetime.datetime.utcnow()
16
+
17
+ def __exit__(self, *error_vals):
18
+ if cli_logger.log_style != "record":
19
+ return
20
+
21
+ td = datetime.datetime.utcnow() - self._start_time
22
+ status = ""
23
+ if self._show_status:
24
+ status = "failed" if any(error_vals) else "succeeded"
25
+ cli_logger.print(
26
+ " ".join(
27
+ [
28
+ self._message,
29
+ status,
30
+ "[LogTimer={:.0f}ms]".format(td.total_seconds() * 1000),
31
+ ]
32
+ )
33
+ )
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py ADDED
@@ -0,0 +1,719 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Autoscaler monitoring loop daemon."""
2
+
3
+ import argparse
4
+ import json
5
+ import logging
6
+ import os
7
+ import signal
8
+ import sys
9
+ import time
10
+ import traceback
11
+ from collections import Counter
12
+ from dataclasses import asdict
13
+ from typing import Any, Callable, Dict, Optional, Union
14
+
15
+ import ray
16
+ import ray._private.ray_constants as ray_constants
17
+ import ray._private.utils
18
+ from ray._private.event.event_logger import get_event_logger
19
+ from ray._private.ray_logging import setup_component_logger
20
+ from ray._raylet import GcsClient
21
+ from ray.autoscaler._private.autoscaler import StandardAutoscaler
22
+ from ray.autoscaler._private.commands import teardown_cluster
23
+ from ray.autoscaler._private.constants import (
24
+ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
25
+ AUTOSCALER_METRIC_PORT,
26
+ AUTOSCALER_UPDATE_INTERVAL_S,
27
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
28
+ )
29
+ from ray.autoscaler._private.event_summarizer import EventSummarizer
30
+ from ray.autoscaler._private.load_metrics import LoadMetrics
31
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
32
+ from ray.autoscaler._private.util import format_readonly_node_type
33
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
34
+ from ray.core.generated import gcs_pb2
35
+ from ray.core.generated.event_pb2 import Event as RayEvent
36
+ from ray.experimental.internal_kv import (
37
+ _initialize_internal_kv,
38
+ _internal_kv_del,
39
+ _internal_kv_get,
40
+ _internal_kv_initialized,
41
+ _internal_kv_put,
42
+ )
43
+
44
+ try:
45
+ import prometheus_client
46
+ except ImportError:
47
+ prometheus_client = None
48
+
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ def parse_resource_demands(resource_load_by_shape):
54
+ """Handle the message.resource_load_by_shape protobuf for the demand
55
+ based autoscaling. Catch and log all exceptions so this doesn't
56
+ interfere with the utilization based autoscaler until we're confident
57
+ this is stable. Worker queue backlogs are added to the appropriate
58
+ resource demand vector.
59
+
60
+ Args:
61
+ resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
62
+ in protobuf form or None.
63
+
64
+ Returns:
65
+ List[ResourceDict]: Waiting bundles (ready and feasible).
66
+ List[ResourceDict]: Infeasible bundles.
67
+ """
68
+ waiting_bundles, infeasible_bundles = [], []
69
+ try:
70
+ for resource_demand_pb in list(resource_load_by_shape.resource_demands):
71
+ request_shape = dict(resource_demand_pb.shape)
72
+ for _ in range(resource_demand_pb.num_ready_requests_queued):
73
+ waiting_bundles.append(request_shape)
74
+ for _ in range(resource_demand_pb.num_infeasible_requests_queued):
75
+ infeasible_bundles.append(request_shape)
76
+
77
+ # Infeasible and ready states for tasks are (logically)
78
+ # mutually exclusive.
79
+ if resource_demand_pb.num_infeasible_requests_queued > 0:
80
+ backlog_queue = infeasible_bundles
81
+ else:
82
+ backlog_queue = waiting_bundles
83
+ for _ in range(resource_demand_pb.backlog_size):
84
+ backlog_queue.append(request_shape)
85
+ if (
86
+ len(waiting_bundles + infeasible_bundles)
87
+ > AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
88
+ ):
89
+ break
90
+ except Exception:
91
+ logger.exception("Failed to parse resource demands.")
92
+
93
+ return waiting_bundles, infeasible_bundles
94
+
95
+
96
+ # Readonly provider config (e.g., for laptop mode, manually setup clusters).
97
+ BASE_READONLY_CONFIG = {
98
+ "cluster_name": "default",
99
+ "max_workers": 0,
100
+ "upscaling_speed": 1.0,
101
+ "docker": {},
102
+ "idle_timeout_minutes": 0,
103
+ "provider": {
104
+ "type": "readonly",
105
+ "use_node_id_as_ip": True, # For emulated multi-node on laptop.
106
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY: True, # No launch check.
107
+ },
108
+ "auth": {},
109
+ "available_node_types": {
110
+ "ray.head.default": {"resources": {}, "node_config": {}, "max_workers": 0}
111
+ },
112
+ "head_node_type": "ray.head.default",
113
+ "file_mounts": {},
114
+ "cluster_synced_files": [],
115
+ "file_mounts_sync_continuously": False,
116
+ "rsync_exclude": [],
117
+ "rsync_filter": [],
118
+ "initialization_commands": [],
119
+ "setup_commands": [],
120
+ "head_setup_commands": [],
121
+ "worker_setup_commands": [],
122
+ "head_start_ray_commands": [],
123
+ "worker_start_ray_commands": [],
124
+ }
125
+
126
+
127
+ class Monitor:
128
+ """Autoscaling monitor.
129
+
130
+ This process periodically collects stats from the GCS and triggers
131
+ autoscaler updates.
132
+ """
133
+
134
+ def __init__(
135
+ self,
136
+ address: str,
137
+ autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
138
+ log_dir: str = None,
139
+ prefix_cluster_info: bool = False,
140
+ monitor_ip: Optional[str] = None,
141
+ retry_on_failure: bool = True,
142
+ ):
143
+ self.gcs_address = address
144
+ worker = ray._private.worker.global_worker
145
+ # TODO: eventually plumb ClusterID through to here
146
+ self.gcs_client = GcsClient(address=self.gcs_address)
147
+
148
+ if monitor_ip:
149
+ monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
150
+ self.gcs_client.internal_kv_put(
151
+ b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
152
+ )
153
+ _initialize_internal_kv(self.gcs_client)
154
+ if monitor_ip:
155
+ monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
156
+ self.gcs_client.internal_kv_put(
157
+ b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
158
+ )
159
+ self._session_name = self.get_session_name(self.gcs_client)
160
+ logger.info(f"session_name: {self._session_name}")
161
+ worker.mode = 0
162
+ head_node_ip = self.gcs_address.split(":")[0]
163
+
164
+ self.load_metrics = LoadMetrics()
165
+ self.last_avail_resources = None
166
+ self.event_summarizer = EventSummarizer()
167
+ self.prefix_cluster_info = prefix_cluster_info
168
+ self.retry_on_failure = retry_on_failure
169
+ self.autoscaling_config = autoscaling_config
170
+ self.autoscaler = None
171
+ # If set, we are in a manually created cluster (non-autoscaling) and
172
+ # simply mirroring what the GCS tells us the cluster node types are.
173
+ self.readonly_config = None
174
+
175
+ if log_dir:
176
+ try:
177
+ self.event_logger = get_event_logger(
178
+ RayEvent.SourceType.AUTOSCALER, log_dir
179
+ )
180
+ except Exception:
181
+ self.event_logger = None
182
+ else:
183
+ self.event_logger = None
184
+
185
+ self.prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
186
+
187
+ if monitor_ip and prometheus_client:
188
+ # If monitor_ip wasn't passed in, then don't attempt to start the
189
+ # metric server to keep behavior identical to before metrics were
190
+ # introduced
191
+ try:
192
+ logger.info(
193
+ "Starting autoscaler metrics server on port {}".format(
194
+ AUTOSCALER_METRIC_PORT
195
+ )
196
+ )
197
+ kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
198
+ prometheus_client.start_http_server(
199
+ port=AUTOSCALER_METRIC_PORT,
200
+ registry=self.prom_metrics.registry,
201
+ **kwargs,
202
+ )
203
+
204
+ # Reset some gauges, since we don't know which labels have
205
+ # leaked if the autoscaler was restarted.
206
+ self.prom_metrics.pending_nodes.clear()
207
+ self.prom_metrics.active_nodes.clear()
208
+ except Exception:
209
+ logger.exception(
210
+ "An exception occurred while starting the metrics server."
211
+ )
212
+ elif not prometheus_client:
213
+ logger.warning(
214
+ "`prometheus_client` not found, so metrics will not be exported."
215
+ )
216
+
217
+ logger.info("Monitor: Started")
218
+
219
+ def _initialize_autoscaler(self):
220
+ if self.autoscaling_config:
221
+ autoscaling_config = self.autoscaling_config
222
+ else:
223
+ # This config mirrors the current setup of the manually created
224
+ # cluster. Each node gets its own unique node type.
225
+ self.readonly_config = BASE_READONLY_CONFIG
226
+
227
+ # Note that the "available_node_types" of the config can change.
228
+ def get_latest_readonly_config():
229
+ return self.readonly_config
230
+
231
+ autoscaling_config = get_latest_readonly_config
232
+ self.autoscaler = StandardAutoscaler(
233
+ autoscaling_config,
234
+ self.load_metrics,
235
+ self.gcs_client,
236
+ self._session_name,
237
+ prefix_cluster_info=self.prefix_cluster_info,
238
+ event_summarizer=self.event_summarizer,
239
+ prom_metrics=self.prom_metrics,
240
+ )
241
+
242
+ def update_load_metrics(self):
243
+ """Fetches resource usage data from GCS and updates load metrics."""
244
+
245
+ response = self.gcs_client.get_all_resource_usage(timeout=60)
246
+ resources_batch_data = response.resource_usage_data
247
+ log_resource_batch_data_if_desired(resources_batch_data)
248
+
249
+ # This is a workaround to get correct idle_duration_ms
250
+ # from "get_cluster_resource_state"
251
+ # ref: https://github.com/ray-project/ray/pull/48519#issuecomment-2481659346
252
+ cluster_resource_state = get_cluster_resource_state(self.gcs_client)
253
+ ray_node_states = cluster_resource_state.node_states
254
+ ray_nodes_idle_duration_ms_by_id = {
255
+ node.node_id: node.idle_duration_ms for node in ray_node_states
256
+ }
257
+
258
+ # Tell the readonly node provider what nodes to report.
259
+ if self.readonly_config:
260
+ new_nodes = []
261
+ for msg in list(resources_batch_data.batch):
262
+ node_id = msg.node_id.hex()
263
+ new_nodes.append((node_id, msg.node_manager_address))
264
+ self.autoscaler.provider._set_nodes(new_nodes)
265
+
266
+ mirror_node_types = {}
267
+ cluster_full = False
268
+ if (
269
+ hasattr(response, "cluster_full_of_actors_detected_by_gcs")
270
+ and response.cluster_full_of_actors_detected_by_gcs
271
+ ):
272
+ # GCS has detected the cluster full of actors.
273
+ cluster_full = True
274
+ for resource_message in resources_batch_data.batch:
275
+ node_id = resource_message.node_id
276
+ # Generate node type config based on GCS reported node list.
277
+ if self.readonly_config:
278
+ # Keep prefix in sync with ReadonlyNodeProvider.
279
+ node_type = format_readonly_node_type(node_id.hex())
280
+ resources = {}
281
+ for k, v in resource_message.resources_total.items():
282
+ resources[k] = v
283
+ mirror_node_types[node_type] = {
284
+ "resources": resources,
285
+ "node_config": {},
286
+ "max_workers": 1,
287
+ }
288
+ if (
289
+ hasattr(resource_message, "cluster_full_of_actors_detected")
290
+ and resource_message.cluster_full_of_actors_detected
291
+ ):
292
+ # A worker node has detected the cluster full of actors.
293
+ cluster_full = True
294
+ total_resources = dict(resource_message.resources_total)
295
+ available_resources = dict(resource_message.resources_available)
296
+
297
+ waiting_bundles, infeasible_bundles = parse_resource_demands(
298
+ resources_batch_data.resource_load_by_shape
299
+ )
300
+
301
+ pending_placement_groups = list(
302
+ resources_batch_data.placement_group_load.placement_group_data
303
+ )
304
+
305
+ use_node_id_as_ip = self.autoscaler is not None and self.autoscaler.config[
306
+ "provider"
307
+ ].get("use_node_id_as_ip", False)
308
+
309
+ # "use_node_id_as_ip" is a hack meant to address situations in
310
+ # which there's more than one Ray node residing at a given ip.
311
+ # TODO (Dmitri): Stop using ips as node identifiers.
312
+ # https://github.com/ray-project/ray/issues/19086
313
+ if use_node_id_as_ip:
314
+ peloton_id = total_resources.get("NODE_ID_AS_RESOURCE")
315
+ # Legacy support https://github.com/ray-project/ray/pull/17312
316
+ if peloton_id is not None:
317
+ ip = str(int(peloton_id))
318
+ else:
319
+ ip = node_id.hex()
320
+ else:
321
+ ip = resource_message.node_manager_address
322
+
323
+ idle_duration_s = 0.0
324
+ if node_id in ray_nodes_idle_duration_ms_by_id:
325
+ idle_duration_s = ray_nodes_idle_duration_ms_by_id[node_id] / 1000
326
+ else:
327
+ logger.warning(
328
+ f"node_id {node_id} not found in ray_nodes_idle_duration_ms_by_id"
329
+ )
330
+
331
+ self.load_metrics.update(
332
+ ip,
333
+ node_id,
334
+ total_resources,
335
+ available_resources,
336
+ idle_duration_s,
337
+ waiting_bundles,
338
+ infeasible_bundles,
339
+ pending_placement_groups,
340
+ cluster_full,
341
+ )
342
+ if self.readonly_config:
343
+ self.readonly_config["available_node_types"].update(mirror_node_types)
344
+
345
+ def get_session_name(self, gcs_client: GcsClient) -> Optional[str]:
346
+ """Obtain the session name from the GCS.
347
+
348
+ If the GCS doesn't respond, session name is considered None.
349
+ In this case, the metrics reported from the monitor won't have
350
+ the correct session name.
351
+ """
352
+ if not _internal_kv_initialized():
353
+ return None
354
+
355
+ session_name = gcs_client.internal_kv_get(
356
+ b"session_name",
357
+ ray_constants.KV_NAMESPACE_SESSION,
358
+ timeout=10,
359
+ )
360
+
361
+ if session_name:
362
+ session_name = session_name.decode()
363
+
364
+ return session_name
365
+
366
+ def update_resource_requests(self):
367
+ """Fetches resource requests from the internal KV and updates load."""
368
+ if not _internal_kv_initialized():
369
+ return
370
+ data = _internal_kv_get(
371
+ ray._private.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL
372
+ )
373
+ if data:
374
+ try:
375
+ resource_request = json.loads(data)
376
+ self.load_metrics.set_resource_requests(resource_request)
377
+ except Exception:
378
+ logger.exception("Error parsing resource requests")
379
+
380
+ def _run(self):
381
+ """Run the monitor loop."""
382
+
383
+ while True:
384
+ try:
385
+ gcs_request_start_time = time.time()
386
+ self.update_load_metrics()
387
+ gcs_request_time = time.time() - gcs_request_start_time
388
+ self.update_resource_requests()
389
+ self.update_event_summary()
390
+ load_metrics_summary = self.load_metrics.summary()
391
+ status = {
392
+ "gcs_request_time": gcs_request_time,
393
+ "time": time.time(),
394
+ "monitor_pid": os.getpid(),
395
+ }
396
+
397
+ if self.autoscaler and not self.load_metrics:
398
+ # load_metrics is Falsey iff we haven't collected any
399
+ # resource messages from the GCS, which can happen at startup if
400
+ # the GCS hasn't yet received data from the Raylets.
401
+ # In this case, do not do an autoscaler update.
402
+ # Wait to get load metrics.
403
+ logger.info(
404
+ "Autoscaler has not yet received load metrics. Waiting."
405
+ )
406
+ elif self.autoscaler:
407
+ # Process autoscaling actions
408
+ update_start_time = time.time()
409
+ self.autoscaler.update()
410
+ status["autoscaler_update_time"] = time.time() - update_start_time
411
+ autoscaler_summary = self.autoscaler.summary()
412
+ try:
413
+ self.emit_metrics(
414
+ load_metrics_summary,
415
+ autoscaler_summary,
416
+ self.autoscaler.all_node_types,
417
+ )
418
+ except Exception:
419
+ logger.exception("Error emitting metrics")
420
+
421
+ if autoscaler_summary:
422
+ status["autoscaler_report"] = asdict(autoscaler_summary)
423
+ status[
424
+ "non_terminated_nodes_time"
425
+ ] = (
426
+ self.autoscaler.non_terminated_nodes.non_terminated_nodes_time # noqa: E501
427
+ )
428
+
429
+ for msg in self.event_summarizer.summary():
430
+ # Need to prefix each line of the message for the lines to
431
+ # get pushed to the driver logs.
432
+ for line in msg.split("\n"):
433
+ logger.info(
434
+ "{}{}".format(
435
+ ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
436
+ )
437
+ )
438
+ if self.event_logger:
439
+ self.event_logger.info(line)
440
+
441
+ self.event_summarizer.clear()
442
+
443
+ status["load_metrics_report"] = asdict(load_metrics_summary)
444
+ as_json = json.dumps(status)
445
+ if _internal_kv_initialized():
446
+ _internal_kv_put(
447
+ ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
448
+ )
449
+ except Exception:
450
+ # By default, do not exit the monitor on failure.
451
+ if self.retry_on_failure:
452
+ logger.exception("Monitor: Execution exception. Trying again...")
453
+ else:
454
+ raise
455
+
456
+ # Wait for a autoscaler update interval before processing the next
457
+ # round of messages.
458
+ time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
459
+
460
+ def emit_metrics(self, load_metrics_summary, autoscaler_summary, node_types):
461
+ if autoscaler_summary is None:
462
+ return None
463
+
464
+ for resource_name in ["CPU", "GPU", "TPU"]:
465
+ _, total = load_metrics_summary.usage.get(resource_name, (0, 0))
466
+ pending = autoscaler_summary.pending_resources.get(resource_name, 0)
467
+ self.prom_metrics.cluster_resources.labels(
468
+ resource=resource_name,
469
+ SessionName=self.prom_metrics.session_name,
470
+ ).set(total)
471
+ self.prom_metrics.pending_resources.labels(
472
+ resource=resource_name,
473
+ SessionName=self.prom_metrics.session_name,
474
+ ).set(pending)
475
+
476
+ pending_node_count = Counter()
477
+ for _, node_type, _ in autoscaler_summary.pending_nodes:
478
+ pending_node_count[node_type] += 1
479
+
480
+ for node_type, count in autoscaler_summary.pending_launches.items():
481
+ pending_node_count[node_type] += count
482
+
483
+ for node_type in node_types:
484
+ count = pending_node_count[node_type]
485
+ self.prom_metrics.pending_nodes.labels(
486
+ SessionName=self.prom_metrics.session_name,
487
+ NodeType=node_type,
488
+ ).set(count)
489
+
490
+ for node_type in node_types:
491
+ count = autoscaler_summary.active_nodes.get(node_type, 0)
492
+ self.prom_metrics.active_nodes.labels(
493
+ SessionName=self.prom_metrics.session_name,
494
+ NodeType=node_type,
495
+ ).set(count)
496
+
497
+ failed_node_counts = Counter()
498
+ for _, node_type in autoscaler_summary.failed_nodes:
499
+ failed_node_counts[node_type] += 1
500
+
501
+ # NOTE: This metric isn't reset with monitor resets. This means it will
502
+ # only be updated when the autoscaler' node tracker remembers failed
503
+ # nodes. If the node type failure is evicted from the autoscaler, the
504
+ # metric may not update for a while.
505
+ for node_type, count in failed_node_counts.items():
506
+ self.prom_metrics.recently_failed_nodes.labels(
507
+ SessionName=self.prom_metrics.session_name,
508
+ NodeType=node_type,
509
+ ).set(count)
510
+
511
+ def update_event_summary(self):
512
+ """Report the current size of the cluster.
513
+
514
+ To avoid log spam, only cluster size changes (CPU, GPU or TPU count change)
515
+ are reported to the event summarizer. The event summarizer will report
516
+ only the latest cluster size per batch.
517
+ """
518
+ avail_resources = self.load_metrics.resources_avail_summary()
519
+ if not self.readonly_config and avail_resources != self.last_avail_resources:
520
+ self.event_summarizer.add(
521
+ "Resized to {}.", # e.g., Resized to 100 CPUs, 4 GPUs, 4 TPUs.
522
+ quantity=avail_resources,
523
+ aggregate=lambda old, new: new,
524
+ )
525
+ self.last_avail_resources = avail_resources
526
+
527
+ def destroy_autoscaler_workers(self):
528
+ """Cleanup the autoscaler, in case of an exception in the run() method.
529
+
530
+ We kill the worker nodes, but retain the head node in order to keep
531
+ logs around, keeping costs minimal. This monitor process runs on the
532
+ head node anyway, so this is more reliable."""
533
+
534
+ if self.autoscaler is None:
535
+ return # Nothing to clean up.
536
+
537
+ if self.autoscaling_config is None:
538
+ # This is a logic error in the program. Can't do anything.
539
+ logger.error("Monitor: Cleanup failed due to lack of autoscaler config.")
540
+ return
541
+
542
+ logger.info("Monitor: Exception caught. Taking down workers...")
543
+ clean = False
544
+ while not clean:
545
+ try:
546
+ teardown_cluster(
547
+ config_file=self.autoscaling_config,
548
+ yes=True, # Non-interactive.
549
+ workers_only=True, # Retain head node for logs.
550
+ override_cluster_name=None,
551
+ keep_min_workers=True, # Retain minimal amount of workers.
552
+ )
553
+ clean = True
554
+ logger.info("Monitor: Workers taken down.")
555
+ except Exception:
556
+ logger.error("Monitor: Cleanup exception. Trying again...")
557
+ time.sleep(2)
558
+
559
+ def _handle_failure(self, error):
560
+ if (
561
+ self.autoscaler is not None
562
+ and os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
563
+ ):
564
+ self.autoscaler.kill_workers()
565
+ # Take down autoscaler workers if necessary.
566
+ self.destroy_autoscaler_workers()
567
+
568
+ # Something went wrong, so push an error to all current and future
569
+ # drivers.
570
+ message = f"The autoscaler failed with the following error:\n{error}"
571
+ if _internal_kv_initialized():
572
+ _internal_kv_put(
573
+ ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True
574
+ )
575
+ gcs_publisher = ray._raylet.GcsPublisher(address=self.gcs_address)
576
+ from ray._private.utils import publish_error_to_driver
577
+
578
+ publish_error_to_driver(
579
+ ray_constants.MONITOR_DIED_ERROR,
580
+ message,
581
+ gcs_publisher=gcs_publisher,
582
+ )
583
+
584
+ def _signal_handler(self, sig, frame):
585
+ try:
586
+ self._handle_failure(
587
+ f"Terminated with signal {sig}\n"
588
+ + "".join(traceback.format_stack(frame))
589
+ )
590
+ except Exception:
591
+ logger.exception("Monitor: Failure in signal handler.")
592
+ sys.exit(sig + 128)
593
+
594
+ def run(self):
595
+ # Register signal handlers for autoscaler termination.
596
+ # Signals will not be received on windows
597
+ signal.signal(signal.SIGINT, self._signal_handler)
598
+ signal.signal(signal.SIGTERM, self._signal_handler)
599
+ try:
600
+ if _internal_kv_initialized():
601
+ # Delete any previous autoscaling errors.
602
+ _internal_kv_del(ray_constants.DEBUG_AUTOSCALING_ERROR)
603
+ self._initialize_autoscaler()
604
+ self._run()
605
+ except Exception:
606
+ logger.exception("Error in monitor loop")
607
+ self._handle_failure(traceback.format_exc())
608
+ raise
609
+
610
+
611
+ def log_resource_batch_data_if_desired(
612
+ resources_batch_data: gcs_pb2.ResourceUsageBatchData,
613
+ ) -> None:
614
+ if os.getenv("AUTOSCALER_LOG_RESOURCE_BATCH_DATA") == "1":
615
+ logger.info("Logging raw resource message pulled from GCS.")
616
+ logger.info(resources_batch_data)
617
+ logger.info("Done logging raw resource message.")
618
+
619
+
620
+ if __name__ == "__main__":
621
+ parser = argparse.ArgumentParser(
622
+ description=("Parse GCS server for the monitor to connect to.")
623
+ )
624
+ parser.add_argument(
625
+ "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
626
+ )
627
+ parser.add_argument(
628
+ "--autoscaling-config",
629
+ required=False,
630
+ type=str,
631
+ help="the path to the autoscaling config file",
632
+ )
633
+ parser.add_argument(
634
+ "--logging-level",
635
+ required=False,
636
+ type=str,
637
+ default=ray_constants.LOGGER_LEVEL,
638
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
639
+ help=ray_constants.LOGGER_LEVEL_HELP,
640
+ )
641
+ parser.add_argument(
642
+ "--logging-format",
643
+ required=False,
644
+ type=str,
645
+ default=ray_constants.LOGGER_FORMAT,
646
+ help=ray_constants.LOGGER_FORMAT_HELP,
647
+ )
648
+ parser.add_argument(
649
+ "--logging-filename",
650
+ required=False,
651
+ type=str,
652
+ default=ray_constants.MONITOR_LOG_FILE_NAME,
653
+ help="Specify the name of log file, "
654
+ "log to stdout if set empty, default is "
655
+ f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
656
+ )
657
+ parser.add_argument(
658
+ "--logs-dir",
659
+ required=True,
660
+ type=str,
661
+ help="Specify the path of the temporary directory used by Ray processes.",
662
+ )
663
+ parser.add_argument(
664
+ "--logging-rotate-bytes",
665
+ required=False,
666
+ type=int,
667
+ default=ray_constants.LOGGING_ROTATE_BYTES,
668
+ help="Specify the max bytes for rotating "
669
+ "log file, default is "
670
+ f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
671
+ )
672
+ parser.add_argument(
673
+ "--logging-rotate-backup-count",
674
+ required=False,
675
+ type=int,
676
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
677
+ help="Specify the backup count of rotated log file, default is "
678
+ f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
679
+ )
680
+ parser.add_argument(
681
+ "--monitor-ip",
682
+ required=False,
683
+ type=str,
684
+ default=None,
685
+ help="The IP address of the machine hosting the monitor process.",
686
+ )
687
+
688
+ args = parser.parse_args()
689
+ setup_component_logger(
690
+ logging_level=args.logging_level,
691
+ logging_format=args.logging_format,
692
+ log_dir=args.logs_dir,
693
+ filename=args.logging_filename,
694
+ max_bytes=args.logging_rotate_bytes,
695
+ backup_count=args.logging_rotate_backup_count,
696
+ )
697
+
698
+ logger.info(f"Starting monitor using ray installation: {ray.__file__}")
699
+ logger.info(f"Ray version: {ray.__version__}")
700
+ logger.info(f"Ray commit: {ray.__commit__}")
701
+ logger.info(f"Monitor started with command: {sys.argv}")
702
+
703
+ if args.autoscaling_config:
704
+ autoscaling_config = os.path.expanduser(args.autoscaling_config)
705
+ else:
706
+ autoscaling_config = None
707
+
708
+ bootstrap_address = args.gcs_address
709
+ if bootstrap_address is None:
710
+ raise ValueError("--gcs-address must be set!")
711
+
712
+ monitor = Monitor(
713
+ bootstrap_address,
714
+ autoscaling_config,
715
+ log_dir=args.logs_dir,
716
+ monitor_ip=args.monitor_ip,
717
+ )
718
+
719
+ monitor.run()
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import operator
4
+ import threading
5
+ import time
6
+ import traceback
7
+ from typing import Any, Dict, Optional
8
+
9
+ from ray.autoscaler._private.node_provider_availability_tracker import (
10
+ NodeProviderAvailabilityTracker,
11
+ )
12
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
13
+ from ray.autoscaler._private.util import hash_launch_conf
14
+ from ray.autoscaler.node_launch_exception import NodeLaunchException
15
+ from ray.autoscaler.tags import (
16
+ NODE_KIND_WORKER,
17
+ STATUS_UNINITIALIZED,
18
+ TAG_RAY_LAUNCH_CONFIG,
19
+ TAG_RAY_NODE_KIND,
20
+ TAG_RAY_NODE_NAME,
21
+ TAG_RAY_NODE_STATUS,
22
+ TAG_RAY_USER_NODE_TYPE,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class BaseNodeLauncher:
    """Launches Ray nodes in the main thread using
    `BaseNodeLauncher.launch_node()`.

    This is a superclass of NodeLauncher, which launches nodes asynchronously
    in the background.

    By default, the subclass NodeLauncher is used to launch nodes in subthreads.
    That behavior can be flagged off in the provider config by setting
    `foreground_node_launch: True`; the autoscaler will then make blocking calls to
    BaseNodeLauncher.launch_node() in the main thread.
    """

    def __init__(
        self,
        provider,
        pending,
        event_summarizer,
        node_provider_availability_tracker: NodeProviderAvailabilityTracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *args,
        **kwargs,
    ):
        # Shared counter of in-flight launches; decremented in launch_node()
        # after every attempt, whether it succeeded or failed.
        self.pending = pending
        self.event_summarizer = event_summarizer
        self.node_provider_availability_tracker = node_provider_availability_tracker
        # Fall back to a fresh metrics object when none is injected.
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )
        self.provider = provider
        self.node_types = node_types
        # Used only as a suffix in log prefixes (e.g. "NodeLauncher0:").
        self.index = str(index) if index is not None else ""

    def launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Launch `count` nodes of `node_type`; return the provider's
        created-node mapping.

        The pending counter is decremented unconditionally: _launch_node
        records failures internally instead of raising.
        """
        self.log("Got {} nodes to launch.".format(count))
        created_nodes = self._launch_node(config, count, node_type)
        self.pending.dec(node_type, count)
        return created_nodes

    def _launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Build node config/tags for `node_type`, call the provider, and
        record metrics, availability, and event-summary entries for the
        outcome. Returns the provider's created-node mapping ({} on failure).
        """
        if self.node_types:
            assert node_type, node_type

        # The `worker_nodes` field is deprecated in favor of per-node-type
        # node_configs. We allow it for backwards-compatibility.
        launch_config = copy.deepcopy(config.get("worker_nodes", {}))
        # NOTE(review): `resources`/`labels` are only bound when `node_type`
        # is truthy but are referenced unconditionally in the provider call
        # below — callers are presumably expected to always pass a node type.
        if node_type:
            launch_config.update(
                config["available_node_types"][node_type]["node_config"]
            )
            resources = copy.deepcopy(
                config["available_node_types"][node_type]["resources"]
            )
            labels = copy.deepcopy(
                config["available_node_types"][node_type].get("labels", {})
            )
        # Hash of launch config + auth; stored as a tag so outdated nodes
        # can be detected later.
        launch_hash = hash_launch_conf(launch_config, config["auth"])
        node_config = copy.deepcopy(config.get("worker_nodes", {}))
        node_tags = {
            TAG_RAY_NODE_NAME: "ray-{}-worker".format(config["cluster_name"]),
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
            TAG_RAY_LAUNCH_CONFIG: launch_hash,
        }
        # A custom node type is specified; set the tag in this case, and also
        # merge the configs. We merge the configs instead of overriding, so
        # that the bootstrapped per-cloud properties are preserved.
        # TODO(ekl) this logic is duplicated in commands.py (keep in sync)
        if node_type:
            node_tags[TAG_RAY_USER_NODE_TYPE] = node_type
            node_config.update(launch_config)

        node_launch_start_time = time.time()

        error_msg = None
        full_exception = None
        created_nodes = {}
        try:
            created_nodes = self.provider.create_node_with_resources_and_labels(
                node_config, node_tags, count, resources, labels
            )
        except NodeLaunchException as node_launch_exception:
            # Structured failure: record it so per-node-type launch problems
            # can be surfaced via the availability tracker.
            self.node_provider_availability_tracker.update_node_availability(
                node_type, int(node_launch_start_time), node_launch_exception
            )

            if node_launch_exception.src_exc_info is not None:
                full_exception = "\n".join(
                    traceback.format_exception(*node_launch_exception.src_exc_info)
                )

            # The literal `{}` is intentional: the event summarizer fills in
            # the aggregated quantity when rendering the message.
            error_msg = (
                f"Failed to launch {{}} node(s) of type {node_type}. "
                f"({node_launch_exception.category}): "
                f"{node_launch_exception.description}"
            )
        except Exception:
            error_msg = f"Failed to launch {{}} node(s) of type {node_type}."
            full_exception = traceback.format_exc()
        else:
            # Record some metrics/observability information when a node is launched.
            launch_time = time.time() - node_launch_start_time
            for _ in range(count):
                # Note: when launching multiple nodes we observe the time it
                # took all nodes to launch for each node. For example, if 4
                # nodes were created in 25 seconds, we would observe the 25
                # second create time 4 times.
                self.prom_metrics.worker_create_node_time.observe(launch_time)
            self.prom_metrics.started_nodes.inc(count)
            self.node_provider_availability_tracker.update_node_availability(
                node_type=node_type,
                timestamp=int(node_launch_start_time),
                node_launch_exception=None,
            )

        if error_msg is not None:
            self.event_summarizer.add(
                error_msg,
                quantity=count,
                aggregate=operator.add,
            )
            self.log(error_msg)
            self.prom_metrics.node_launch_exceptions.inc()
            self.prom_metrics.failed_create_nodes.inc(count)
        else:
            self.log("Launching {} nodes, type {}.".format(count, node_type))
            self.event_summarizer.add(
                "Adding {} node(s) of type " + str(node_type) + ".",
                quantity=count,
                aggregate=operator.add,
            )

        if full_exception is not None:
            self.log(full_exception)

        return created_nodes

    def log(self, statement):
        """Log `statement` prefixed with the launcher class name and index."""
        # launcher_class is "BaseNodeLauncher", or "NodeLauncher" if called
        # from that subclass.
        launcher_class: str = type(self).__name__
        prefix = "{}{}:".format(launcher_class, self.index)
        logger.info(prefix + " {}".format(statement))
178
+
179
+
180
class NodeLauncher(BaseNodeLauncher, threading.Thread):
    """Background-thread variant of BaseNodeLauncher.

    Consumes (config, count, node_type) launch requests from a queue and
    executes them via the inherited, blocking launch_node().
    """

    def __init__(
        self,
        provider,
        queue,
        pending,
        event_summarizer,
        node_provider_availability_tracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *thread_args,
        **thread_kwargs,
    ):
        # Queue of pending launch requests, fed by StandardAutoscaler.
        self.queue = queue
        # Initialize both bases explicitly (neither chains super().__init__).
        BaseNodeLauncher.__init__(
            self,
            provider=provider,
            pending=pending,
            event_summarizer=event_summarizer,
            node_provider_availability_tracker=node_provider_availability_tracker,
            session_name=session_name,
            prom_metrics=prom_metrics,
            node_types=node_types,
            index=index,
        )
        threading.Thread.__init__(self, *thread_args, **thread_kwargs)

    def run(self):
        """Collects launch data from queue populated by StandardAutoscaler.
        Launches nodes in a background thread.

        Overrides threading.Thread.run().
        NodeLauncher.start() executes this loop in a background thread.
        """
        while True:
            launch_request = self.queue.get()
            config, count, node_type = launch_request
            # launch_node is implemented in BaseNodeLauncher.
            self.launch_node(config, count, node_type)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import time
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Dict, Optional, Tuple
5
+
6
+ from ray.autoscaler._private.constants import (
7
+ AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
8
+ )
9
+ from ray.autoscaler.node_launch_exception import NodeLaunchException
10
+
11
+
12
@dataclass
class UnavailableNodeInformation:
    """Why a node type could not be launched.

    Both fields are copied from a NodeLaunchException by
    NodeProviderAvailabilityTracker (see below in this file).
    """

    # Failure category reported by the node provider.
    category: str
    # Human-readable description of the failure.
    description: str
16
+
17
+
18
@dataclass
class NodeAvailabilityRecord:
    """Availability status of a single node type at a point in time."""

    # The node type this record describes.
    node_type: str
    # True iff the most recent launch attempt for this node type succeeded.
    is_available: bool
    # Timestamp (seconds) at which this information was recorded as accurate.
    last_checked_timestamp: float
    # Failure details; populated only when is_available is False.
    unavailable_node_information: Optional[UnavailableNodeInformation]
24
+
25
+
26
@dataclass
class NodeAvailabilitySummary:
    """Snapshot of availability records, keyed by node type name."""

    node_availabilities: Dict[
        str, NodeAvailabilityRecord
    ]  # Mapping from node type to node availability record.

    @classmethod
    def from_fields(cls, **fields) -> "NodeAvailabilitySummary":
        """Implement marshalling from nested fields. pydantic isn't a core dependency
        so we're implementing this by hand instead.

        Fix vs. original: the input record dicts are no longer mutated
        (the original `.pop()`ed the nested key out of the caller's data).
        """
        parsed = {}

        node_availabilities_dict = fields.get("node_availabilities", {})

        for node_type, record_dict in node_availabilities_dict.items():
            # Read without popping so the caller's dict is left untouched.
            info_dict = record_dict.get("unavailable_node_information")
            unavailable_information = None
            if info_dict is not None:
                unavailable_information = UnavailableNodeInformation(**info_dict)

            # Forward every other field to the record constructor.
            remaining_fields = {
                key: value
                for key, value in record_dict.items()
                if key != "unavailable_node_information"
            }
            parsed[node_type] = NodeAvailabilityRecord(
                unavailable_node_information=unavailable_information,
                **remaining_fields,
            )

        return NodeAvailabilitySummary(node_availabilities=parsed)

    def __eq__(self, other: "NodeAvailabilitySummary"):
        """Summaries compare equal iff their record mappings are equal."""
        return self.node_availabilities == other.node_availabilities

    def __bool__(self) -> bool:
        """Truthy iff at least one node type has a recorded availability."""
        return bool(self.node_availabilities)
62
+
63
+
64
class NodeProviderAvailabilityTracker:
    """A thread safe, TTL cache of node provider availability. We don't use
    cachetools.TTLCache because it always sets the expiration time relative to
    insertion time, but in our case, we want entries to expire relative to when
    the node creation was attempted (and entries aren't necessarily added in
    order). We want the entries to expire because the information grows stale
    over time.
    """

    def __init__(
        self,
        timer: Callable[[], float] = time.time,
        ttl: float = AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
    ):
        """A cache that tracks the availability of nodes and throws away
        entries which have grown too stale.

        Args:
            timer: A function that returns the current time in seconds.
            ttl: The ttl from the insertion timestamp of an entry.
        """
        self.timer = timer
        self.ttl = ttl
        # Mapping from node type to (eviction_time, record)
        self.store: Dict[str, Tuple[float, NodeAvailabilityRecord]] = {}
        # A global lock to simplify thread safety handling.
        self.lock = threading.RLock()

    def _update_node_availability_requires_lock(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        # Caller must hold self.lock (see update_node_availability).
        # A missing exception means the launch succeeded.
        if node_launch_exception is None:
            record = NodeAvailabilityRecord(
                node_type=node_type,
                is_available=True,
                last_checked_timestamp=timestamp,
                unavailable_node_information=None,
            )
        else:
            info = UnavailableNodeInformation(
                category=node_launch_exception.category,
                description=node_launch_exception.description,
            )
            record = NodeAvailabilityRecord(
                node_type=node_type,
                is_available=False,
                last_checked_timestamp=timestamp,
                unavailable_node_information=info,
            )

        # Expiration is relative to the event timestamp, not insertion time.
        expiration_time = timestamp + self.ttl

        # TODO (Alex): In theory it would be nice to make this dictionary
        # ordered by expiration time, unfortunately that's a bit difficult
        # since `update_node_availability` can be called with out of order
        # timestamps.
        self.store[node_type] = (expiration_time, record)

        self._remove_old_entries()

    def update_node_availability(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        """
        Update the availability and details of a single node type.

        Args:
            node_type: The node type.
            timestamp: The timestamp that this information is accurate as of.
            node_launch_exception: Details about why the node launch failed. If
                empty, the node type will be considered available."""
        with self.lock:
            self._update_node_availability_requires_lock(
                node_type, timestamp, node_launch_exception
            )

    def summary(self) -> NodeAvailabilitySummary:
        """
        Returns a summary of node availabilities and their staleness.

        Returns
            A summary of node availabilities and their staleness.
        """
        with self.lock:
            # Drop stale entries first so the summary only contains fresh data.
            self._remove_old_entries()
            return NodeAvailabilitySummary(
                {node_type: record for node_type, (_, record) in self.store.items()}
            )

    def _remove_old_entries(self) -> None:
        """Remove any expired entries from the cache."""
        cur_time = self.timer()
        # Lock is reentrant, so this is safe even when called under the
        # public methods that already hold it.
        with self.lock:
            # Snapshot items() so we can delete while iterating.
            for key, (expiration_time, _) in list(self.store.items()):
                if expiration_time < cur_time:
                    del self.store[key]
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Set, Tuple
2
+
3
+ from ray.autoscaler._private import constants
4
+
5
+
6
class NodeTracker:
    """Map nodes to their corresponding logs.

    We need to be a little careful here. At any given point in time, node_id <->
    ip can be interchangeably used, but the node_id -> ip relation is not
    bijective _across time_ since IP addresses can be reused. Therefore, we
    should treat node_id as the only unique identifier.
    """

    def __init__(self):
        # Mapping from node_id -> (ip, node type). (The original comment also
        # mentioned stdout_path/process runner, but track() only ever stores
        # an (ip, node_type) pair.)
        self.node_mapping = {}

        # Insertion order of tracked node ids; the head is evicted first.
        # A quick, inefficient FIFO cache implementation.
        self.lru_order = []

    def _add_node_mapping(self, node_id: str, value: Tuple[str, str]):
        """Insert node_id -> value, evicting the oldest tracked node when at
        capacity. No-op if node_id is already tracked."""
        if node_id in self.node_mapping:
            return

        assert len(self.lru_order) == len(self.node_mapping)
        if len(self.lru_order) >= constants.AUTOSCALER_MAX_NODES_TRACKED:
            # The FIFO eviction case. Bug fix: the original rebound `node_id`
            # to the evicted id here, so the *new* node's value was then
            # stored under the *evicted* node's id below.
            evicted_id = self.lru_order.pop(0)
            del self.node_mapping[evicted_id]

        self.node_mapping[node_id] = value
        self.lru_order.append(node_id)

    def track(self, node_id: str, ip: str, node_type: str):
        """
        Begin to track a new node.

        Args:
            node_id: The node id.
            ip: The node ip address.
            node_type: The node type.
        """
        if node_id not in self.node_mapping:
            self._add_node_mapping(node_id, (ip, node_type))

    def untrack(self, node_id: str):
        """Gracefully stop tracking a node. If a node is intentionally removed from
        the cluster, we should stop tracking it so we don't mistakenly mark it
        as failed.

        Args:
            node_id: The node id to stop tracking.
        """
        if node_id in self.node_mapping:
            self.lru_order.remove(node_id)
            del self.node_mapping[node_id]

    def get_all_failed_node_info(
        self, non_failed_ids: Set[str]
    ) -> List[Tuple[str, str]]:
        """Get the information about all failed nodes. A failed node is any node which
        we began to track that is not pending or alive (i.e. not failed).

        Args:
            non_failed_ids: Nodes are failed unless they are in this set.

        Returns:
            List[Tuple[str, str]]: A list of tuples. Each tuple is the ip
            address and type of a failed node.
        """
        failed_nodes = self.node_mapping.keys() - non_failed_ids
        failed_info = []
        # Returning the list in order is important for display purposes.
        for node_id in filter(lambda node_id: node_id in failed_nodes, self.lru_order):
            failed_info.append(self.node_mapping[node_id])
        return failed_info
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+
4
class NullMetric:
    """Mock metric class to be used in case of prometheus_client import error."""

    def set(self, *_args, **_kwargs):
        """Discard the value; a null metric records nothing."""
        return None

    def observe(self, *_args, **_kwargs):
        """Discard the observation."""
        return None

    def inc(self, *_args, **_kwargs):
        """Discard the increment."""
        return None

    def labels(self, *_args, **_kwargs):
        """Return self so chained `.labels(...).inc()` style calls still work."""
        return self

    def clear(self):
        """Nothing to clear on a null metric."""
        return None
21
+
22
+
23
try:

    from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram

    # The metrics in this class should be kept in sync with
    # python/ray/tests/test_metrics_agent.py
    class AutoscalerPrometheusMetrics:
        """All Prometheus metrics emitted by the autoscaler.

        Metrics are registered on a single CollectorRegistry. Session-wide
        metrics are pre-bound to the session name via `.labels(...)`; the
        per-NodeType/per-resource gauges are left unbound so callers attach
        labels at use time.
        """

        def __init__(
            self, session_name: str = None, registry: Optional[CollectorRegistry] = None
        ):
            # Use the caller's registry when provided, otherwise a private one.
            self.registry: CollectorRegistry = registry or CollectorRegistry(
                auto_describe=True
            )
            self._session_name = session_name
            # Buckets: 5 seconds, 10 seconds, 20 seconds, 30 seconds,
            # 45 seconds, 1 minute, 1.5 minutes, 2 minutes,
            # 3 minutes, 4 minutes, 5 minutes, 6 minutes,
            # 8 minutes, 10 minutes, 12 minutes, 15 minutes
            # 20 minutes, 25 minutes, 30 minutes
            # used for both worker launch time and worker update time
            histogram_buckets = [
                5,
                10,
                20,
                30,
                45,
                60,
                90,
                120,
                180,
                240,
                300,
                360,
                480,
                600,
                720,
                900,
                1200,
                1500,
                1800,
            ]
            # Buckets: .01 seconds to 1000 seconds.
            # Used for autoscaler update time.
            update_time_buckets = [0.01, 0.1, 1, 10, 100, 1000]
            self.worker_create_node_time: Histogram = Histogram(
                "worker_create_node_time_seconds",
                "Worker launch time. This is the time it takes for a call to "
                "a node provider's create_node method to return. Note that "
                "when nodes are launched in batches, the launch time for that "
                "batch will be observed once for *each* node in that batch. "
                "For example, if 8 nodes are launched in 3 minutes, a launch "
                "time of 3 minutes will be observed 8 times.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.worker_update_time: Histogram = Histogram(
                "worker_update_time_seconds",
                "Worker update time. This is the time between when an updater "
                "thread begins executing and when it exits successfully. This "
                "metric only observes times for successful updates.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.update_time: Histogram = Histogram(
                "update_time",
                "Autoscaler update time. This is the time for an autoscaler "
                "update iteration to complete.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=update_time_buckets,
            ).labels(SessionName=session_name)
            # Per-NodeType gauges: not pre-bound to the session label here.
            self.pending_nodes: Gauge = Gauge(
                "pending_nodes",
                "Number of nodes pending to be started.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.active_nodes: Gauge = Gauge(
                "active_nodes",
                "Number of nodes in the cluster.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.recently_failed_nodes: Gauge = Gauge(
                "recently_failed_nodes",
                "The number of recently failed nodes. This count could reset "
                "at undefined times.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.started_nodes: Counter = Counter(
                "started_nodes",
                "Number of nodes started.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.stopped_nodes: Counter = Counter(
                "stopped_nodes",
                "Number of nodes stopped.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.updating_nodes: Gauge = Gauge(
                "updating_nodes",
                "Number of nodes in the process of updating.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.recovering_nodes: Gauge = Gauge(
                "recovering_nodes",
                "Number of nodes in the process of recovering.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.running_workers: Gauge = Gauge(
                "running_workers",
                "Number of worker nodes running.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_create_nodes: Counter = Counter(
                "failed_create_nodes",
                "Number of nodes that failed to be created due to an "
                "exception in the node provider's create_node method.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_updates: Counter = Counter(
                "failed_updates",
                "Number of failed worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_updates: Counter = Counter(
                "successful_updates",
                "Number of succesfful worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_recoveries: Counter = Counter(
                "failed_recoveries",
                "Number of failed node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_recoveries: Counter = Counter(
                "successful_recoveries",
                "Number of successful node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.update_loop_exceptions: Counter = Counter(
                "update_loop_exceptions",
                "Number of exceptions raised in the update loop of the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.node_launch_exceptions: Counter = Counter(
                "node_launch_exceptions",
                "Number of exceptions raised while launching nodes.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.reset_exceptions: Counter = Counter(
                "reset_exceptions",
                "Number of exceptions raised while resetting the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.config_validation_exceptions: Counter = Counter(
                "config_validation_exceptions",
                "Number of exceptions raised while validating the config "
                "during a reset.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.drain_node_exceptions: Counter = Counter(
                "drain_node_exceptions",
                "Number of exceptions raised when making a DrainNode rpc"
                "prior to node termination.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            # This represents the autoscaler's view of essentially
            # `ray.cluster_resources()`, it may be slightly different from the
            # core metric from an eventual consistency perspective.
            self.cluster_resources: Gauge = Gauge(
                "cluster_resources",
                "Total logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )
            # This represents the pending launches + nodes being set up for the
            # autoscaler.
            self.pending_resources: Gauge = Gauge(
                "pending_resources",
                "Pending logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )

        @property
        def session_name(self):
            """The session name these metrics were bound to at construction."""
            return self._session_name

except ImportError:

    # prometheus_client is optional: fall back to an object whose every
    # attribute access yields a no-op NullMetric, so callers need no guards.
    class AutoscalerPrometheusMetrics(object):
        def __init__(self, session_name: str = None):
            pass

        def __getattr__(self, attr):
            # Any metric name resolves to a fresh no-op metric.
            return NullMetric()