Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py +8 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json +130 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json +294 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py +208 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py +488 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py +1508 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py +825 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py +40 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py +652 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py +921 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py +1631 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py +140 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py +129 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py +75 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py +106 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py +91 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py +246 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py +398 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py +37 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py +375 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py +15 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py +121 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py +110 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py +304 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py +33 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py +719 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py +221 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py +165 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py +77 -0
- .venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py +292 -0
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (190 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (2.43 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from ray.autoscaler import sdk
|
| 5 |
+
|
| 6 |
+
__all__ = ["sdk"]
|
| 7 |
+
|
| 8 |
+
AUTOSCALER_DIR_PATH = Path(os.path.abspath(os.path.dirname(__file__)))
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (560 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc
ADDED
|
Binary file (4.84 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc
ADDED
|
Binary file (19 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc
ADDED
|
Binary file (1.99 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc
ADDED
|
Binary file (1.32 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (203 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (9.59 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (26.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
| 3 |
+
"contentVersion": "1.0.0.0",
|
| 4 |
+
"parameters": {
|
| 5 |
+
"clusterId": {
|
| 6 |
+
"type": "string",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"description": "Unique string appended to resource names to isolate resources from different ray clusters."
|
| 9 |
+
}
|
| 10 |
+
},
|
| 11 |
+
"subnet": {
|
| 12 |
+
"type": "string",
|
| 13 |
+
"metadata": {
|
| 14 |
+
"description": "Subnet parameters."
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"msiName": {
|
| 18 |
+
"type": "string",
|
| 19 |
+
"metadata": {
|
| 20 |
+
"description": "Managed service identity."
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"msiResourceGroup": {
|
| 24 |
+
"type": "string",
|
| 25 |
+
"metadata": {
|
| 26 |
+
"description": "Managed service identity resource group."
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"createMsi": {
|
| 30 |
+
"type": "bool",
|
| 31 |
+
"defaultValue": "true"
|
| 32 |
+
}
|
| 33 |
+
},
|
| 34 |
+
"variables": {
|
| 35 |
+
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
|
| 36 |
+
"location": "[resourceGroup().location]",
|
| 37 |
+
"roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]",
|
| 38 |
+
"nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]",
|
| 39 |
+
"nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
|
| 40 |
+
"vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]",
|
| 41 |
+
"subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]"
|
| 42 |
+
},
|
| 43 |
+
"resources": [
|
| 44 |
+
{
|
| 45 |
+
"condition": "[parameters('createMsi')]",
|
| 46 |
+
"type": "Microsoft.ManagedIdentity/userAssignedIdentities",
|
| 47 |
+
"apiVersion": "2018-11-30",
|
| 48 |
+
"location": "[variables('location')]",
|
| 49 |
+
"name": "[parameters('msiName')]"
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"type": "Microsoft.Authorization/roleAssignments",
|
| 53 |
+
"apiVersion": "2020-08-01-preview",
|
| 54 |
+
"name": "[guid(variables('roleAssignmentName'))]",
|
| 55 |
+
"properties": {
|
| 56 |
+
"principalId": "[reference(resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName')), '2018-11-30').principalId]",
|
| 57 |
+
"roleDefinitionId": "[variables('contributor')]",
|
| 58 |
+
"scope": "[resourceGroup().id]",
|
| 59 |
+
"principalType": "ServicePrincipal"
|
| 60 |
+
},
|
| 61 |
+
"dependsOn": [
|
| 62 |
+
"[parameters('msiName')]"
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"type": "Microsoft.Network/networkSecurityGroups",
|
| 67 |
+
"apiVersion": "2019-02-01",
|
| 68 |
+
"name": "[variables('nsgName')]",
|
| 69 |
+
"location": "[variables('location')]",
|
| 70 |
+
"properties": {
|
| 71 |
+
"securityRules": [
|
| 72 |
+
{
|
| 73 |
+
"name": "SSH",
|
| 74 |
+
"properties": {
|
| 75 |
+
"priority": 1000,
|
| 76 |
+
"protocol": "TCP",
|
| 77 |
+
"access": "Allow",
|
| 78 |
+
"direction": "Inbound",
|
| 79 |
+
"sourceAddressPrefix": "*",
|
| 80 |
+
"sourcePortRange": "*",
|
| 81 |
+
"destinationAddressPrefix": "*",
|
| 82 |
+
"destinationPortRange": "22"
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
]
|
| 86 |
+
}
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"type": "Microsoft.Network/virtualNetworks",
|
| 90 |
+
"apiVersion": "2019-11-01",
|
| 91 |
+
"name": "[variables('vnetName')]",
|
| 92 |
+
"location": "[variables('location')]",
|
| 93 |
+
"properties": {
|
| 94 |
+
"addressSpace": {
|
| 95 |
+
"addressPrefixes": [
|
| 96 |
+
"[parameters('subnet')]"
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
"subnets": [
|
| 100 |
+
{
|
| 101 |
+
"name": "[variables('subnetName')]",
|
| 102 |
+
"properties": {
|
| 103 |
+
"addressPrefix": "[parameters('subnet')]",
|
| 104 |
+
"networkSecurityGroup": {
|
| 105 |
+
"id": "[variables('nsg')]"
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
"dependsOn": [
|
| 112 |
+
"[variables('nsg')]"
|
| 113 |
+
]
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
"outputs": {
|
| 117 |
+
"subnet": {
|
| 118 |
+
"type": "string",
|
| 119 |
+
"value": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vnetName'), variables('subnetName'))]"
|
| 120 |
+
},
|
| 121 |
+
"nsg": {
|
| 122 |
+
"type": "string",
|
| 123 |
+
"value": "[variables('nsg')]"
|
| 124 |
+
},
|
| 125 |
+
"msi": {
|
| 126 |
+
"type": "string",
|
| 127 |
+
"value": "[resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName'))]"
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
}
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
| 3 |
+
"contentVersion": "1.0.0.0",
|
| 4 |
+
"parameters": {
|
| 5 |
+
"vmName": {
|
| 6 |
+
"type": "string",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"description": "The name of you Virtual Machine."
|
| 9 |
+
}
|
| 10 |
+
},
|
| 11 |
+
"adminUsername": {
|
| 12 |
+
"type": "string",
|
| 13 |
+
"metadata": {
|
| 14 |
+
"description": "Username for the Virtual Machine."
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"publicKey": {
|
| 18 |
+
"type": "securestring",
|
| 19 |
+
"metadata": {
|
| 20 |
+
"description": "SSH Key for the Virtual Machine"
|
| 21 |
+
}
|
| 22 |
+
},
|
| 23 |
+
"imagePublisher": {
|
| 24 |
+
"type": "string",
|
| 25 |
+
"metadata": {
|
| 26 |
+
"description": "The publisher of the VM image"
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"imageOffer": {
|
| 30 |
+
"type": "string",
|
| 31 |
+
"metadata": {
|
| 32 |
+
"description": "The offer of the VM image"
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"imageSku": {
|
| 36 |
+
"type": "string",
|
| 37 |
+
"metadata": {
|
| 38 |
+
"description": "The sku of the VM image"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"imageVersion": {
|
| 42 |
+
"type": "string",
|
| 43 |
+
"metadata": {
|
| 44 |
+
"description": "The version of the VM image"
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"vmSize": {
|
| 48 |
+
"type": "string",
|
| 49 |
+
"metadata": {
|
| 50 |
+
"description": "The size of the VM"
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"vmTags": {
|
| 54 |
+
"type": "object",
|
| 55 |
+
"metadata": {
|
| 56 |
+
"description": "Tags for the VM"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"vmCount": {
|
| 60 |
+
"type": "int",
|
| 61 |
+
"metadata": {
|
| 62 |
+
"description": "Number of VMs to deploy"
|
| 63 |
+
}
|
| 64 |
+
},
|
| 65 |
+
"provisionPublicIp": {
|
| 66 |
+
"type": "bool",
|
| 67 |
+
"defaultValue": true,
|
| 68 |
+
"metadata": {
|
| 69 |
+
"description": "If true creates a public ip"
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
"priority": {
|
| 73 |
+
"type": "string",
|
| 74 |
+
"defaultValue": "Regular",
|
| 75 |
+
"metadata": {
|
| 76 |
+
"description": "Specifies the priority for the virtual machine."
|
| 77 |
+
}
|
| 78 |
+
},
|
| 79 |
+
"evictionPolicy": {
|
| 80 |
+
"type": "string",
|
| 81 |
+
"defaultValue": "Delete",
|
| 82 |
+
"metadata": {
|
| 83 |
+
"description": "Specifies the eviction policy for the virtual machine."
|
| 84 |
+
}
|
| 85 |
+
},
|
| 86 |
+
"billingProfile": {
|
| 87 |
+
"type": "object",
|
| 88 |
+
"defaultValue": {},
|
| 89 |
+
"metadata": {
|
| 90 |
+
"description": "Specifies the maximum price to pay for Azure Spot VM."
|
| 91 |
+
}
|
| 92 |
+
},
|
| 93 |
+
"msi": {
|
| 94 |
+
"type": "string",
|
| 95 |
+
"metadata": {
|
| 96 |
+
"description": "Managed service identity resource id."
|
| 97 |
+
}
|
| 98 |
+
},
|
| 99 |
+
"nsg": {
|
| 100 |
+
"type": "string",
|
| 101 |
+
"metadata": {
|
| 102 |
+
"description": "Network security group resource id."
|
| 103 |
+
}
|
| 104 |
+
},
|
| 105 |
+
"subnet": {
|
| 106 |
+
"type": "string",
|
| 107 |
+
"metadata": {
|
| 108 |
+
"descriptions": "Subnet resource id."
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"enableAcceleratedNetworking": {
|
| 112 |
+
"type": "bool",
|
| 113 |
+
"defaultValue": false,
|
| 114 |
+
"metadata": {
|
| 115 |
+
"descriptions": "Whether to enable accelerated networking."
|
| 116 |
+
}
|
| 117 |
+
}
|
| 118 |
+
},
|
| 119 |
+
"variables": {
|
| 120 |
+
"location": "[resourceGroup().location]",
|
| 121 |
+
"networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
|
| 122 |
+
"networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
|
| 123 |
+
"networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
|
| 124 |
+
"networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
|
| 125 |
+
"osDiskType": "Standard_LRS",
|
| 126 |
+
"publicIpAddressName": "[concat(parameters('vmName'), '-ip')]"
|
| 127 |
+
},
|
| 128 |
+
"resources": [
|
| 129 |
+
{
|
| 130 |
+
"type": "Microsoft.Network/networkInterfaces",
|
| 131 |
+
"apiVersion": "2020-06-01",
|
| 132 |
+
"name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
|
| 133 |
+
"location": "[variables('location')]",
|
| 134 |
+
"dependsOn": [
|
| 135 |
+
"[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
|
| 136 |
+
],
|
| 137 |
+
"copy": {
|
| 138 |
+
"name": "NICPublicCopy",
|
| 139 |
+
"count": "[parameters('vmCount')]"
|
| 140 |
+
},
|
| 141 |
+
"properties": {
|
| 142 |
+
"ipConfigurations": [
|
| 143 |
+
{
|
| 144 |
+
"name": "[variables('networkIpConfig')]",
|
| 145 |
+
"properties": {
|
| 146 |
+
"subnet": {
|
| 147 |
+
"id": "[parameters('subnet')]"
|
| 148 |
+
},
|
| 149 |
+
"privateIPAllocationMethod": "Dynamic",
|
| 150 |
+
"publicIpAddress": {
|
| 151 |
+
"id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
|
| 152 |
+
}
|
| 153 |
+
}
|
| 154 |
+
}
|
| 155 |
+
],
|
| 156 |
+
"networkSecurityGroup": {
|
| 157 |
+
"id": "[parameters('nsg')]"
|
| 158 |
+
},
|
| 159 |
+
"enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
|
| 160 |
+
},
|
| 161 |
+
"condition": "[parameters('provisionPublicIp')]"
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"type": "Microsoft.Network/networkInterfaces",
|
| 165 |
+
"apiVersion": "2020-06-01",
|
| 166 |
+
"name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
|
| 167 |
+
"location": "[variables('location')]",
|
| 168 |
+
"copy": {
|
| 169 |
+
"name": "NICPrivateCopy",
|
| 170 |
+
"count": "[parameters('vmCount')]"
|
| 171 |
+
},
|
| 172 |
+
"properties": {
|
| 173 |
+
"ipConfigurations": [
|
| 174 |
+
{
|
| 175 |
+
"name": "[variables('networkIpConfig')]",
|
| 176 |
+
"properties": {
|
| 177 |
+
"subnet": {
|
| 178 |
+
"id": "[parameters('subnet')]"
|
| 179 |
+
},
|
| 180 |
+
"privateIPAllocationMethod": "Dynamic"
|
| 181 |
+
}
|
| 182 |
+
}
|
| 183 |
+
],
|
| 184 |
+
"networkSecurityGroup": {
|
| 185 |
+
"id": "[parameters('nsg')]"
|
| 186 |
+
},
|
| 187 |
+
"enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
|
| 188 |
+
},
|
| 189 |
+
"condition": "[not(parameters('provisionPublicIp'))]"
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"type": "Microsoft.Network/publicIpAddresses",
|
| 193 |
+
"apiVersion": "2019-02-01",
|
| 194 |
+
"name": "[concat(variables('publicIpAddressName'), copyIndex())]",
|
| 195 |
+
"location": "[variables('location')]",
|
| 196 |
+
"properties": {
|
| 197 |
+
"publicIpAllocationMethod": "Static",
|
| 198 |
+
"publicIPAddressVersion": "IPv4"
|
| 199 |
+
},
|
| 200 |
+
"copy": {
|
| 201 |
+
"name": "PublicIpCopy",
|
| 202 |
+
"count": "[parameters('vmCount')]"
|
| 203 |
+
},
|
| 204 |
+
"sku": {
|
| 205 |
+
"name": "Basic",
|
| 206 |
+
"tier": "Regional"
|
| 207 |
+
},
|
| 208 |
+
"condition": "[parameters('provisionPublicIp')]"
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"type": "Microsoft.Compute/virtualMachines",
|
| 212 |
+
"apiVersion": "2019-03-01",
|
| 213 |
+
"name": "[concat(parameters('vmName'), copyIndex())]",
|
| 214 |
+
"location": "[variables('location')]",
|
| 215 |
+
"dependsOn": [
|
| 216 |
+
"[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
|
| 217 |
+
],
|
| 218 |
+
"copy": {
|
| 219 |
+
"name": "VmCopy",
|
| 220 |
+
"count": "[parameters('vmCount')]"
|
| 221 |
+
},
|
| 222 |
+
"tags": "[parameters('vmTags')]",
|
| 223 |
+
"properties": {
|
| 224 |
+
"hardwareProfile": {
|
| 225 |
+
"vmSize": "[parameters('vmSize')]"
|
| 226 |
+
},
|
| 227 |
+
"storageProfile": {
|
| 228 |
+
"osDisk": {
|
| 229 |
+
"createOption": "fromImage",
|
| 230 |
+
"managedDisk": {
|
| 231 |
+
"storageAccountType": "[variables('osDiskType')]"
|
| 232 |
+
}
|
| 233 |
+
},
|
| 234 |
+
"imageReference": {
|
| 235 |
+
"publisher": "[parameters('imagePublisher')]",
|
| 236 |
+
"offer": "[parameters('imageOffer')]",
|
| 237 |
+
"sku": "[parameters('imageSku')]",
|
| 238 |
+
"version": "[parameters('imageVersion')]"
|
| 239 |
+
}
|
| 240 |
+
},
|
| 241 |
+
"networkProfile": {
|
| 242 |
+
"networkInterfaces": [
|
| 243 |
+
{
|
| 244 |
+
"id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
|
| 245 |
+
}
|
| 246 |
+
]
|
| 247 |
+
},
|
| 248 |
+
"osProfile": {
|
| 249 |
+
"computerName": "[concat(parameters('vmName'), copyIndex())]",
|
| 250 |
+
"adminUsername": "[parameters('adminUsername')]",
|
| 251 |
+
"adminPassword": "[parameters('publicKey')]",
|
| 252 |
+
"linuxConfiguration": {
|
| 253 |
+
"disablePasswordAuthentication": true,
|
| 254 |
+
"ssh": {
|
| 255 |
+
"publicKeys": [
|
| 256 |
+
{
|
| 257 |
+
"path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
|
| 258 |
+
"keyData": "[parameters('publicKey')]"
|
| 259 |
+
}
|
| 260 |
+
]
|
| 261 |
+
}
|
| 262 |
+
}
|
| 263 |
+
},
|
| 264 |
+
"priority": "[parameters('priority')]",
|
| 265 |
+
"evictionPolicy": "[if(equals(parameters('priority'), 'Spot'), parameters('evictionPolicy'), '')]",
|
| 266 |
+
"billingProfile": "[parameters('billingProfile')]"
|
| 267 |
+
},
|
| 268 |
+
"identity": {
|
| 269 |
+
"type": "UserAssigned",
|
| 270 |
+
"userAssignedIdentities": {
|
| 271 |
+
"[parameters('msi')]": {
|
| 272 |
+
}
|
| 273 |
+
}
|
| 274 |
+
}
|
| 275 |
+
}
|
| 276 |
+
],
|
| 277 |
+
"outputs": {
|
| 278 |
+
"publicIp": {
|
| 279 |
+
"type": "array",
|
| 280 |
+
"copy": {
|
| 281 |
+
"count": "[parameters('vmCount')]",
|
| 282 |
+
"input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
|
| 283 |
+
},
|
| 284 |
+
"condition": "[parameters('provisionPublicIp')]"
|
| 285 |
+
},
|
| 286 |
+
"privateIp": {
|
| 287 |
+
"type": "array",
|
| 288 |
+
"copy": {
|
| 289 |
+
"count": "[parameters('vmCount')]",
|
| 290 |
+
"input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
|
| 291 |
+
}
|
| 292 |
+
}
|
| 293 |
+
}
|
| 294 |
+
}
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import random
|
| 4 |
+
from hashlib import sha256
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any, Callable
|
| 7 |
+
|
| 8 |
+
from azure.common.credentials import get_cli_profile
|
| 9 |
+
from azure.identity import AzureCliCredential
|
| 10 |
+
from azure.mgmt.resource import ResourceManagementClient
|
| 11 |
+
from azure.mgmt.resource.resources.models import DeploymentMode
|
| 12 |
+
|
| 13 |
+
UNIQUE_ID_LEN = 4
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
|
| 19 |
+
"""Retrieve a callable function from Azure SDK client object.
|
| 20 |
+
|
| 21 |
+
Newer versions of the various client SDKs renamed function names to
|
| 22 |
+
have a begin_ prefix. This function supports both the old and new
|
| 23 |
+
versions of the SDK by first trying the old name and falling back to
|
| 24 |
+
the prefixed new name.
|
| 25 |
+
"""
|
| 26 |
+
func = getattr(
|
| 27 |
+
client, function_name, getattr(client, f"begin_{function_name}", None)
|
| 28 |
+
)
|
| 29 |
+
if func is None:
|
| 30 |
+
raise AttributeError(
|
| 31 |
+
"'{obj}' object has no {func} or begin_{func} attribute".format(
|
| 32 |
+
obj={client.__name__}, func=function_name
|
| 33 |
+
)
|
| 34 |
+
)
|
| 35 |
+
return func
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def bootstrap_azure(config):
|
| 39 |
+
config = _configure_key_pair(config)
|
| 40 |
+
config = _configure_resource_group(config)
|
| 41 |
+
return config
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _configure_resource_group(config):
|
| 45 |
+
# TODO: look at availability sets
|
| 46 |
+
# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
|
| 47 |
+
subscription_id = config["provider"].get("subscription_id")
|
| 48 |
+
if subscription_id is None:
|
| 49 |
+
subscription_id = get_cli_profile().get_subscription_id()
|
| 50 |
+
resource_client = ResourceManagementClient(AzureCliCredential(), subscription_id)
|
| 51 |
+
config["provider"]["subscription_id"] = subscription_id
|
| 52 |
+
logger.info("Using subscription id: %s", subscription_id)
|
| 53 |
+
|
| 54 |
+
assert (
|
| 55 |
+
"resource_group" in config["provider"]
|
| 56 |
+
), "Provider config must include resource_group field"
|
| 57 |
+
resource_group = config["provider"]["resource_group"]
|
| 58 |
+
|
| 59 |
+
assert (
|
| 60 |
+
"location" in config["provider"]
|
| 61 |
+
), "Provider config must include location field"
|
| 62 |
+
params = {"location": config["provider"]["location"]}
|
| 63 |
+
|
| 64 |
+
if "tags" in config["provider"]:
|
| 65 |
+
params["tags"] = config["provider"]["tags"]
|
| 66 |
+
|
| 67 |
+
logger.info("Creating/Updating resource group: %s", resource_group)
|
| 68 |
+
rg_create_or_update = get_azure_sdk_function(
|
| 69 |
+
client=resource_client.resource_groups, function_name="create_or_update"
|
| 70 |
+
)
|
| 71 |
+
rg_create_or_update(resource_group_name=resource_group, parameters=params)
|
| 72 |
+
|
| 73 |
+
# load the template file
|
| 74 |
+
current_path = Path(__file__).parent
|
| 75 |
+
template_path = current_path.joinpath("azure-config-template.json")
|
| 76 |
+
with open(template_path, "r") as template_fp:
|
| 77 |
+
template = json.load(template_fp)
|
| 78 |
+
|
| 79 |
+
logger.info("Using cluster name: %s", config["cluster_name"])
|
| 80 |
+
|
| 81 |
+
# set unique id for resources in this cluster
|
| 82 |
+
unique_id = config["provider"].get("unique_id")
|
| 83 |
+
if unique_id is None:
|
| 84 |
+
hasher = sha256()
|
| 85 |
+
hasher.update(config["provider"]["resource_group"].encode("utf-8"))
|
| 86 |
+
unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
|
| 87 |
+
else:
|
| 88 |
+
unique_id = str(unique_id)
|
| 89 |
+
config["provider"]["unique_id"] = unique_id
|
| 90 |
+
logger.info("Using unique id: %s", unique_id)
|
| 91 |
+
cluster_id = "{}-{}".format(config["cluster_name"], unique_id)
|
| 92 |
+
|
| 93 |
+
subnet_mask = config["provider"].get("subnet_mask")
|
| 94 |
+
if subnet_mask is None:
|
| 95 |
+
# choose a random subnet, skipping most common value of 0
|
| 96 |
+
random.seed(unique_id)
|
| 97 |
+
subnet_mask = "10.{}.0.0/16".format(random.randint(1, 254))
|
| 98 |
+
logger.info("Using subnet mask: %s", subnet_mask)
|
| 99 |
+
|
| 100 |
+
# Copy over properties from existing subnet.
|
| 101 |
+
# Addresses issue (https://github.com/Azure/azure-quickstart-templates/issues/2786)
|
| 102 |
+
# where existing subnet properties will get overwritten unless explicitly specified
|
| 103 |
+
# during multiple deployments even if vnet/subnet do not change.
|
| 104 |
+
# May eventually be fixed by passing empty subnet list if they already exist:
|
| 105 |
+
# https://techcommunity.microsoft.com/t5/azure-networking-blog/azure-virtual-network-now-supports-updates-without-subnet/ba-p/4067952
|
| 106 |
+
list_by_rg = get_azure_sdk_function(
|
| 107 |
+
client=resource_client.resources, function_name="list_by_resource_group"
|
| 108 |
+
)
|
| 109 |
+
existing_vnets = list(
|
| 110 |
+
list_by_rg(
|
| 111 |
+
resource_group,
|
| 112 |
+
f"substringof('{unique_id}', name) and "
|
| 113 |
+
"resourceType eq 'Microsoft.Network/virtualNetworks'",
|
| 114 |
+
)
|
| 115 |
+
)
|
| 116 |
+
if len(existing_vnets) > 0:
|
| 117 |
+
vnid = existing_vnets[0].id
|
| 118 |
+
get_by_id = get_azure_sdk_function(
|
| 119 |
+
client=resource_client.resources, function_name="get_by_id"
|
| 120 |
+
)
|
| 121 |
+
subnet = get_by_id(vnid, resource_client.DEFAULT_API_VERSION).properties[
|
| 122 |
+
"subnets"
|
| 123 |
+
][0]
|
| 124 |
+
template_vnet = next(
|
| 125 |
+
(
|
| 126 |
+
rs
|
| 127 |
+
for rs in template["resources"]
|
| 128 |
+
if rs["type"] == "Microsoft.Network/virtualNetworks"
|
| 129 |
+
),
|
| 130 |
+
None,
|
| 131 |
+
)
|
| 132 |
+
if template_vnet is not None:
|
| 133 |
+
template_subnets = template_vnet["properties"].get("subnets")
|
| 134 |
+
if template_subnets is not None:
|
| 135 |
+
template_subnets[0]["properties"].update(subnet["properties"])
|
| 136 |
+
|
| 137 |
+
# Get or create an MSI name and resource group.
|
| 138 |
+
# Defaults to current resource group if not provided.
|
| 139 |
+
use_existing_msi = (
|
| 140 |
+
"msi_name" in config["provider"] and "msi_resource_group" in config["provider"]
|
| 141 |
+
)
|
| 142 |
+
msi_resource_group = config["provider"].get("msi_resource_group", resource_group)
|
| 143 |
+
msi_name = config["provider"].get("msi_name", f"ray-{cluster_id}-msi")
|
| 144 |
+
logger.info(
|
| 145 |
+
"Using msi_name: %s from msi_resource_group: %s", msi_name, msi_resource_group
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
parameters = {
|
| 149 |
+
"properties": {
|
| 150 |
+
"mode": DeploymentMode.incremental,
|
| 151 |
+
"template": template,
|
| 152 |
+
"parameters": {
|
| 153 |
+
"subnet": {"value": subnet_mask},
|
| 154 |
+
"clusterId": {"value": cluster_id},
|
| 155 |
+
"msiName": {"value": msi_name},
|
| 156 |
+
"msiResourceGroup": {"value": msi_resource_group},
|
| 157 |
+
"createMsi": {"value": not use_existing_msi},
|
| 158 |
+
},
|
| 159 |
+
}
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
create_or_update = get_azure_sdk_function(
|
| 163 |
+
client=resource_client.deployments, function_name="create_or_update"
|
| 164 |
+
)
|
| 165 |
+
outputs = (
|
| 166 |
+
create_or_update(
|
| 167 |
+
resource_group_name=resource_group,
|
| 168 |
+
deployment_name="ray-config",
|
| 169 |
+
parameters=parameters,
|
| 170 |
+
)
|
| 171 |
+
.result()
|
| 172 |
+
.properties.outputs
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
# append output resource ids to be used with vm creation
|
| 176 |
+
config["provider"]["msi"] = outputs["msi"]["value"]
|
| 177 |
+
config["provider"]["nsg"] = outputs["nsg"]["value"]
|
| 178 |
+
config["provider"]["subnet"] = outputs["subnet"]["value"]
|
| 179 |
+
|
| 180 |
+
return config
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _configure_key_pair(config):
|
| 184 |
+
ssh_user = config["auth"]["ssh_user"]
|
| 185 |
+
public_key = None
|
| 186 |
+
# search if the keys exist
|
| 187 |
+
for key_type in ["ssh_private_key", "ssh_public_key"]:
|
| 188 |
+
try:
|
| 189 |
+
key_path = Path(config["auth"][key_type]).expanduser()
|
| 190 |
+
except KeyError:
|
| 191 |
+
raise Exception("Config must define {}".format(key_type))
|
| 192 |
+
except TypeError:
|
| 193 |
+
raise Exception("Invalid config value for {}".format(key_type))
|
| 194 |
+
|
| 195 |
+
assert key_path.is_file(), "Could not find ssh key: {}".format(key_path)
|
| 196 |
+
|
| 197 |
+
if key_type == "ssh_public_key":
|
| 198 |
+
with open(key_path, "r") as f:
|
| 199 |
+
public_key = f.read()
|
| 200 |
+
|
| 201 |
+
for node_type in config["available_node_types"].values():
|
| 202 |
+
azure_arm_parameters = node_type["node_config"].setdefault(
|
| 203 |
+
"azure_arm_parameters", {}
|
| 204 |
+
)
|
| 205 |
+
azure_arm_parameters["adminUsername"] = ssh_user
|
| 206 |
+
azure_arm_parameters["publicKey"] = public_key
|
| 207 |
+
|
| 208 |
+
return config
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
from concurrent.futures import Future, ThreadPoolExecutor
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from threading import RLock
|
| 7 |
+
from uuid import uuid4
|
| 8 |
+
|
| 9 |
+
from azure.core.exceptions import ResourceNotFoundError
|
| 10 |
+
from azure.identity import DefaultAzureCredential
|
| 11 |
+
from azure.mgmt.compute import ComputeManagementClient
|
| 12 |
+
from azure.mgmt.network import NetworkManagementClient
|
| 13 |
+
from azure.mgmt.resource import ResourceManagementClient
|
| 14 |
+
from azure.mgmt.resource.resources.models import DeploymentMode
|
| 15 |
+
|
| 16 |
+
from ray.autoscaler._private._azure.config import (
|
| 17 |
+
bootstrap_azure,
|
| 18 |
+
get_azure_sdk_function,
|
| 19 |
+
)
|
| 20 |
+
from ray.autoscaler._private.constants import (
|
| 21 |
+
AUTOSCALER_NODE_START_WAIT_S,
|
| 22 |
+
AUTOSCALER_NODE_TERMINATE_WAIT_S,
|
| 23 |
+
MAX_PARALLEL_SHUTDOWN_WORKERS,
|
| 24 |
+
)
|
| 25 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 26 |
+
from ray.autoscaler.tags import (
|
| 27 |
+
NODE_KIND_HEAD,
|
| 28 |
+
TAG_RAY_CLUSTER_NAME,
|
| 29 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 30 |
+
TAG_RAY_NODE_KIND,
|
| 31 |
+
TAG_RAY_NODE_NAME,
|
| 32 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
VM_NAME_MAX_LEN = 64
|
| 36 |
+
UNIQUE_ID_LEN = 4
|
| 37 |
+
|
| 38 |
+
logger = logging.getLogger(__name__)
|
| 39 |
+
azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
|
| 40 |
+
azure_logger.setLevel(logging.WARNING)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def synchronized(f):
    """Decorator that serializes calls to *f* under ``self.lock``.

    The wrapped callable must be a method whose instance exposes a
    ``lock`` attribute supporting the context-manager protocol (e.g. an
    ``RLock``). The lock is always released, even if *f* raises.
    """

    def wrapper(self, *args, **kwargs):
        # `with` is equivalent to acquire/try/finally-release.
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class AzureNodeProvider(NodeProvider):
    """Node Provider for Azure

    This provider assumes Azure credentials are set by running ``az login``
    and the default subscription is configured through ``az account``
    or set in the ``provider`` field of the autoscaler configuration.

    Nodes may be in one of three states: {pending, running, terminated}. Nodes
    appear immediately once started by ``create_node``, and transition
    immediately to terminated when ``terminate_node`` is called.
    """

    def __init__(self, provider_config, cluster_name):
        """Set up Azure SDK clients and in-memory node caches.

        Args:
            provider_config: The ``provider`` section of the cluster config;
                must contain ``subscription_id`` and (for later calls)
                ``resource_group``.
            cluster_name: Name of the Ray cluster, used to tag/filter nodes.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        subscription_id = provider_config["subscription_id"]
        # When True (the default), terminate_node deallocates VMs instead of
        # deleting them so they can be reused by a later create_node.
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
        self.compute_client = ComputeManagementClient(credential, subscription_id)
        self.network_client = NetworkManagementClient(credential, subscription_id)
        self.resource_client = ResourceManagementClient(credential, subscription_id)

        # Guards cached_nodes/terminating_nodes via the @synchronized decorator.
        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}

        # Cache terminating node operations
        self.terminating_nodes: dict[str, Future] = {}
        self.termination_executor = ThreadPoolExecutor(
            max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS
        )

    @synchronized
    def _get_filtered_nodes(self, tag_filters):
        """List cluster VMs matching ``tag_filters`` and refresh caches.

        Side effects: rebuilds ``self.cached_nodes`` from a fresh VM listing
        and prunes completed entries from ``self.terminating_nodes``.
        Returns a dict mapping VM name -> metadata dict (see
        ``_extract_metadata``).
        """
        # add cluster name filter to only get nodes from this cluster
        cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name}

        def match_tags(tags):
            for k, v in cluster_tag_filters.items():
                if tags.get(k) != v:
                    return False
            return True

        vms = self.compute_client.virtual_machines.list(
            resource_group_name=self.provider_config["resource_group"]
        )

        nodes = [self._extract_metadata(vm) for vm in vms]
        self.cached_nodes = {node["name"]: node for node in nodes}

        # Update terminating nodes list by removing nodes that
        # have finished termination.
        self.terminating_nodes = {
            k: v for k, v in self.terminating_nodes.items() if not v.done()
        }

        return {k: v for k, v in self.cached_nodes.items() if match_tags(v["tags"])}

    def _extract_metadata(self, vm):
        """Build a metadata dict for one VM: name, tags, power state, IPs.

        Returns a dict with keys ``name``, ``tags``, ``status`` and — when
        reachable — ``nic_name``, ``internal_ip`` and (conditionally)
        ``public_ip_name``/``external_ip``.
        """
        # get tags
        metadata = {"name": vm.name, "tags": vm.tags, "status": ""}

        # get status
        resource_group = self.provider_config["resource_group"]
        try:
            instance = self.compute_client.virtual_machines.instance_view(
                resource_group_name=resource_group, vm_name=vm.name
            ).as_dict()
        except ResourceNotFoundError:
            # VM disappeared between listing and instance_view; return the
            # partial metadata (empty status, no IP fields).
            return metadata

        for status in instance["statuses"]:
            # If ProvisioningState is "failed" (e.g.,
            # ProvisioningState/failed/RetryableError), we can get a third
            # string here, so we need to limit to the first two outputs.
            code, state = status["code"].split("/")[:2]
            # skip provisioning status
            if code == "PowerState":
                metadata["status"] = state
                break

        # get ip data
        nic_id = vm.network_profile.network_interfaces[0].id
        metadata["nic_name"] = nic_id.split("/")[-1]
        nic = self.network_client.network_interfaces.get(
            resource_group_name=resource_group,
            network_interface_name=metadata["nic_name"],
        )
        ip_config = nic.ip_configurations[0]

        # Get public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        if not self.provider_config.get("use_internal_ips", False) or (
            self.provider_config.get("use_external_head_ip", False)
            and metadata["tags"][TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        ):
            # NOTE(review): assumes ip_config.public_ip_address is set whenever
            # this branch is taken — verify against the ARM template's
            # provisionPublicIp behavior.
            public_ip_id = ip_config.public_ip_address.id
            metadata["public_ip_name"] = public_ip_id.split("/")[-1]
            public_ip = self.network_client.public_ip_addresses.get(
                resource_group_name=resource_group,
                public_ip_address_name=metadata["public_ip_name"],
            )
            metadata["external_ip"] = public_ip.ip_address

        metadata["internal_ip"] = ip_config.private_ip_address

        return metadata

    def stopped_nodes(self, tag_filters):
        """Return a list of stopped node ids filtered by the specified tags dict."""
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        # Azure reports stopped-and-released VMs as "deallocating"/"deallocated";
        # the prefix match covers both.
        return [k for k, v in nodes.items() if v["status"].startswith("deallocat")]

    def non_terminated_nodes(self, tag_filters):
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to nodes() to
        serve single-node queries (e.g. is_running(node_id)). This means that
        nodes() must be called again to refresh results.

        Examples:
            >>> from ray.autoscaler.tags import TAG_RAY_NODE_KIND
            >>> provider = ... # doctest: +SKIP
            >>> provider.non_terminated_nodes( # doctest: +SKIP
            ...     {TAG_RAY_NODE_KIND: "worker"})
            ["node-1", "node-2"]
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        # Nodes with a pending async deletion still count as non-terminated
        # so the autoscaler does not double-launch replacements mid-delete.
        return [
            k
            for k, v in nodes.items()
            if not v["status"].startswith("deallocat") or k in self.terminating_nodes
        ]

    def is_running(self, node_id):
        """Return whether the specified node is running."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"] == "running"

    def is_terminated(self, node_id):
        """Return whether the specified node is terminated."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"].startswith("deallocat")

    def node_tags(self, node_id):
        """Returns the tags of the given node (string dict)."""
        return self._get_cached_node(node_id=node_id)["tags"]

    def external_ip(self, node_id):
        """Returns the external ip of the given node."""
        # Fall back to a fresh listing if the cached entry has no external IP.
        ip = (
            self._get_cached_node(node_id=node_id)["external_ip"]
            or self._get_node(node_id=node_id)["external_ip"]
        )
        return ip

    def internal_ip(self, node_id):
        """Returns the internal ip (Ray ip) of the given node."""
        ip = (
            self._get_cached_node(node_id=node_id)["internal_ip"]
            or self._get_node(node_id=node_id)["internal_ip"]
        )
        return ip

    def create_node(self, node_config, tags, count):
        """Create ``count`` nodes, restarting cached stopped VMs first.

        When ``cache_stopped_nodes`` is enabled, deallocated VMs whose
        identity tags match are restarted and re-tagged before any new VMs
        are deployed via ``_create_node``.
        """
        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            # Only these tags must match for a stopped VM to be safely reused.
            VALIDITY_TAGS = [
                TAG_RAY_CLUSTER_NAME,
                TAG_RAY_NODE_KIND,
                TAG_RAY_LAUNCH_CONFIG,
                TAG_RAY_USER_NODE_TYPE,
            ]
            filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
            reuse_nodes = self.stopped_nodes(filters)[:count]
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
            )
            start = get_azure_sdk_function(
                client=self.compute_client.virtual_machines, function_name="start"
            )
            for node_id in reuse_nodes:
                # Block until the VM is running before re-tagging it.
                start(resource_group_name=resource_group, vm_name=node_id).wait()
                self.set_node_tags(node_id, tags)
            count -= len(reuse_nodes)

        if count:
            self._create_node(node_config, tags, count)

    def _create_node(self, node_config, tags, count):
        """Creates a number of nodes within the namespace."""
        resource_group = self.provider_config["resource_group"]

        # load the template file
        current_path = Path(__file__).parent
        template_path = current_path.joinpath("azure-vm-template.json")
        with open(template_path, "r") as template_fp:
            template = json.load(template_fp)

        # get the tags
        config_tags = node_config.get("tags", {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        # Unique, length-capped VM name: <node-name>-<cluster-uid>-<random>.
        vm_name = "{node}-{unique_id}-{vm_id}".format(
            node=config_tags.get(TAG_RAY_NODE_NAME, "node"),
            unique_id=self.provider_config["unique_id"],
            vm_id=uuid4().hex[:UNIQUE_ID_LEN],
        )[:VM_NAME_MAX_LEN]

        template_params = node_config["azure_arm_parameters"].copy()
        template_params["vmName"] = vm_name
        # Provision public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        template_params["provisionPublicIp"] = not self.provider_config.get(
            "use_internal_ips", False
        ) or (
            self.provider_config.get("use_external_head_ip", False)
            and config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        )
        template_params["vmTags"] = config_tags
        template_params["vmCount"] = count
        # Resource ids produced by the bootstrap deployment (see config.py).
        template_params["msi"] = self.provider_config["msi"]
        template_params["nsg"] = self.provider_config["nsg"]
        template_params["subnet"] = self.provider_config["subnet"]

        parameters = {
            "properties": {
                "mode": DeploymentMode.incremental,
                "template": template,
                "parameters": {
                    key: {"value": value} for key, value in template_params.items()
                },
            }
        }

        # TODO: we could get the private/public ips back directly
        create_or_update = get_azure_sdk_function(
            client=self.resource_client.deployments, function_name="create_or_update"
        )
        create_or_update(
            resource_group_name=resource_group,
            deployment_name=vm_name,
            parameters=parameters,
        ).wait(timeout=AUTOSCALER_NODE_START_WAIT_S)

    @synchronized
    def set_node_tags(self, node_id, tags):
        """Sets the tag values (string dict) for the specified node."""
        node_tags = self._get_cached_node(node_id)["tags"]
        node_tags.update(tags)
        update = get_azure_sdk_function(
            client=self.compute_client.virtual_machines, function_name="update"
        )
        # Fire-and-forget: the LRO is not awaited before updating the cache.
        update(
            resource_group_name=self.provider_config["resource_group"],
            vm_name=node_id,
            parameters={"tags": node_tags},
        )
        self.cached_nodes[node_id]["tags"] = node_tags

    def terminate_node(self, node_id):
        """Terminates the specified node. This will delete the VM and
        associated resources (NIC, IP, Storage) for the specified node."""

        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            try:
                # stop machine and leave all resources
                logger.info(
                    f"Stopping instance {node_id}"
                    "(to fully terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"
                )
                stop = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="deallocate",
                )
                stop(resource_group_name=resource_group, vm_name=node_id)
            except Exception as e:
                logger.warning("Failed to stop VM: {}".format(e))

        # If node_id is in terminating nodes dict, it's already terminating
        # Otherwise, kick off termination and add it to the dict
        elif node_id not in self.terminating_nodes:
            self.terminating_nodes[node_id] = self.termination_executor.submit(
                self._delete_node_and_resources, resource_group, node_id
            )

    def _delete_node_and_resources(self, resource_group, node_id):
        """Delete a VM and its dependent disks, NICs and public IPs.

        Runs on the termination executor. Deletions are issued as LROs and
        polled with an overall deadline of AUTOSCALER_NODE_TERMINATE_WAIT_S
        measured from the start of the VM delete. Failures are logged, not
        raised.
        """
        try:
            vm = self.compute_client.virtual_machines.get(
                resource_group_name=resource_group, vm_name=node_id
            )
        except ResourceNotFoundError as e:
            # Node no longer exists
            logger.warning("Failed to delete VM: {}".format(e))
            return

        # Gather dependent disks
        disks = set()
        if vm.storage_profile is not None and vm.storage_profile.data_disks is not None:
            for d in vm.storage_profile.data_disks:
                if d.name is not None:
                    disks.add(d.name)
        if (
            vm.storage_profile is not None
            and vm.storage_profile.os_disk is not None
            and vm.storage_profile.os_disk.name is not None
        ):
            disks.add(vm.storage_profile.os_disk.name)

        # Gather dependent NICs and public IPs
        nics = set()
        ips = set()
        if (
            vm.network_profile is not None
            and vm.network_profile.network_interfaces is not None
        ):
            for nint in vm.network_profile.network_interfaces:
                if nint.id is not None:
                    nic_name = nint.id.split("/")[-1]
                    nics.add(nic_name)
                    # Get public IP if not using internal IPs or if this is the
                    # head node and use_external_head_ip is True
                    if not self.provider_config.get("use_internal_ips", False) or (
                        self.provider_config.get("use_external_head_ip", False)
                        and vm.tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
                    ):
                        nic = self.network_client.network_interfaces.get(
                            resource_group_name=resource_group,
                            network_interface_name=nic_name,
                        )
                        if nic.ip_configurations is not None:
                            for ipc in nic.ip_configurations:
                                # NOTE(review): assumes ipc.public_ip_address is
                                # non-None here — confirm for NICs created
                                # without a public IP.
                                if ipc.public_ip_address.id is not None:
                                    ips.add(ipc.public_ip_address.id.split("/")[-1])

        # Delete VM
        st = time.monotonic()
        delete = get_azure_sdk_function(
            client=self.compute_client.virtual_machines,
            function_name="delete",
        )
        try:
            # The VM must be gone before its NICs/disks can be deleted.
            delete(resource_group_name=resource_group, vm_name=node_id).wait(
                timeout=AUTOSCALER_NODE_TERMINATE_WAIT_S
            )
        except Exception as e:
            logger.warning("Failed to delete VM: {}".format(e))

        # Delete disks (no need to wait for these, but gather the LROs for end)
        disk_lros = []
        delete = get_azure_sdk_function(
            client=self.compute_client.disks, function_name="delete"
        )
        for d in disks:
            try:
                disk_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        disk_name=d,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete disk: {}".format(e))

        # Delete NICs
        nic_lros = []
        delete = get_azure_sdk_function(
            client=self.network_client.network_interfaces, function_name="delete"
        )
        for n in nics:
            try:
                nic_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        network_interface_name=n,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete NIC: {}".format(e))

        # NICs must be deleted before their public IPs can be released.
        while (
            not all(nlro.done() for nlro in nic_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

        # Delete Public IPs
        delete = get_azure_sdk_function(
            client=self.network_client.public_ip_addresses,
            function_name="delete",
        )
        ip_lros = []
        for ip in ips:
            try:
                ip_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        public_ip_address_name=ip,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete public IP: {}".format(e))

        while (
            not all(dlro.done() for dlro in disk_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)
        while (
            not all(iplro.done() for iplro in ip_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

    def _get_node(self, node_id):
        """Fetch fresh metadata for ``node_id`` (refreshes the whole cache).

        Raises KeyError if the node is not present in the refreshed listing.
        """
        self._get_filtered_nodes({})  # Side effect: updates cache
        return self.cached_nodes[node_id]

    def _get_cached_node(self, node_id):
        """Return cached metadata for ``node_id``, refreshing on a miss."""
        return self.cached_nodes.get(node_id) or self._get_node(node_id=node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Fill in Azure-specific defaults/resources in the cluster config."""
        return bootstrap_azure(cluster_config)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py
ADDED
|
@@ -0,0 +1,1508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
import math
|
| 4 |
+
import operator
|
| 5 |
+
import os
|
| 6 |
+
import queue
|
| 7 |
+
import subprocess
|
| 8 |
+
import threading
|
| 9 |
+
import time
|
| 10 |
+
from collections import Counter, defaultdict, namedtuple
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from enum import Enum
|
| 13 |
+
from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Tuple, Union
|
| 14 |
+
|
| 15 |
+
import yaml
|
| 16 |
+
|
| 17 |
+
import ray
|
| 18 |
+
import ray._private.ray_constants as ray_constants
|
| 19 |
+
from ray.autoscaler._private.constants import (
|
| 20 |
+
AUTOSCALER_HEARTBEAT_TIMEOUT_S,
|
| 21 |
+
AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
|
| 22 |
+
AUTOSCALER_MAX_LAUNCH_BATCH,
|
| 23 |
+
AUTOSCALER_MAX_NUM_FAILURES,
|
| 24 |
+
AUTOSCALER_STATUS_LOG,
|
| 25 |
+
AUTOSCALER_UPDATE_INTERVAL_S,
|
| 26 |
+
DISABLE_LAUNCH_CONFIG_CHECK_KEY,
|
| 27 |
+
DISABLE_NODE_UPDATERS_KEY,
|
| 28 |
+
FOREGROUND_NODE_LAUNCH_KEY,
|
| 29 |
+
WORKER_LIVENESS_CHECK_KEY,
|
| 30 |
+
)
|
| 31 |
+
from ray.autoscaler._private.event_summarizer import EventSummarizer
|
| 32 |
+
from ray.autoscaler._private.legacy_info_string import legacy_log_info_string
|
| 33 |
+
from ray.autoscaler._private.load_metrics import LoadMetrics
|
| 34 |
+
from ray.autoscaler._private.local.node_provider import (
|
| 35 |
+
LocalNodeProvider,
|
| 36 |
+
record_local_head_state_if_needed,
|
| 37 |
+
)
|
| 38 |
+
from ray.autoscaler._private.node_launcher import BaseNodeLauncher, NodeLauncher
|
| 39 |
+
from ray.autoscaler._private.node_provider_availability_tracker import (
|
| 40 |
+
NodeAvailabilitySummary,
|
| 41 |
+
NodeProviderAvailabilityTracker,
|
| 42 |
+
)
|
| 43 |
+
from ray.autoscaler._private.node_tracker import NodeTracker
|
| 44 |
+
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
|
| 45 |
+
from ray.autoscaler._private.providers import _get_node_provider
|
| 46 |
+
from ray.autoscaler._private.resource_demand_scheduler import (
|
| 47 |
+
ResourceDemandScheduler,
|
| 48 |
+
ResourceDict,
|
| 49 |
+
get_bin_pack_residual,
|
| 50 |
+
)
|
| 51 |
+
from ray.autoscaler._private.updater import NodeUpdaterThread
|
| 52 |
+
from ray.autoscaler._private.util import (
|
| 53 |
+
ConcurrentCounter,
|
| 54 |
+
NodeCount,
|
| 55 |
+
NodeID,
|
| 56 |
+
NodeIP,
|
| 57 |
+
NodeType,
|
| 58 |
+
NodeTypeConfigDict,
|
| 59 |
+
format_info_string,
|
| 60 |
+
hash_launch_conf,
|
| 61 |
+
hash_runtime_conf,
|
| 62 |
+
validate_config,
|
| 63 |
+
with_head_node_ip,
|
| 64 |
+
)
|
| 65 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 66 |
+
from ray.autoscaler.tags import (
|
| 67 |
+
NODE_KIND_HEAD,
|
| 68 |
+
NODE_KIND_UNMANAGED,
|
| 69 |
+
NODE_KIND_WORKER,
|
| 70 |
+
STATUS_UP_TO_DATE,
|
| 71 |
+
STATUS_UPDATE_FAILED,
|
| 72 |
+
TAG_RAY_FILE_MOUNTS_CONTENTS,
|
| 73 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 74 |
+
TAG_RAY_NODE_KIND,
|
| 75 |
+
TAG_RAY_NODE_STATUS,
|
| 76 |
+
TAG_RAY_RUNTIME_CONFIG,
|
| 77 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 78 |
+
)
|
| 79 |
+
from ray.exceptions import RpcError
|
| 80 |
+
|
logger = logging.getLogger(__name__)

# Status of a node e.g. "up-to-date", see ray/autoscaler/tags.py
NodeStatus = str

# Tuple of modified fields for the given node_id returned by should_update
# that will be passed into a NodeUpdaterThread.
UpdateInstructions = namedtuple(
    "UpdateInstructions",
    ["node_id", "setup_commands", "ray_start_commands", "docker_config"],
)

# One launch request: (node config, number of nodes, optional node type).
# Used as the item type of the background launch queue.
NodeLaunchData = Tuple[NodeTypeConfigDict, NodeCount, Optional[NodeType]]
| 94 |
+
|
| 95 |
+
|
@dataclass
class AutoscalerSummary:
    """Point-in-time summary of autoscaler state used for status reporting."""

    # Count of active (running) nodes, keyed by node type.
    active_nodes: Dict[NodeType, int]
    # Count of idle nodes by node type, if idle tracking is available.
    idle_nodes: Optional[Dict[NodeType, int]]
    pending_nodes: List[Tuple[NodeIP, NodeType, NodeStatus]]
    pending_launches: Dict[NodeType, int]
    failed_nodes: List[Tuple[NodeIP, NodeType]]
    node_availability_summary: NodeAvailabilitySummary = field(
        default_factory=lambda: NodeAvailabilitySummary({})
    )
    # A dictionary of node IP to a list of reasons the node is not idle.
    node_activities: Optional[Dict[str, Tuple[NodeIP, List[str]]]] = None
    # Idiom fix: `default_factory=dict` instead of `lambda: {}` — same empty
    # dict per instance, without the needless lambda.
    pending_resources: Dict[str, int] = field(default_factory=dict)
    # A mapping from node name (the same key as `usage_by_node`) to node type.
    # Optional for deployment modes which have the concept of node types and
    # backwards compatibility.
    node_type_mapping: Optional[Dict[str, str]] = None
    # Whether the autoscaler summary is v1 or v2.
    legacy: bool = False
| 115 |
+
|
| 116 |
+
|
class NonTerminatedNodes:
    """Organizes the provider's non-terminated nodes by their Ray node kind."""

    def __init__(self, provider: NodeProvider):
        fetch_start = time.time()
        # Every node the provider currently reports as not terminated.
        self.all_node_ids = provider.non_terminated_nodes({})

        # Managed worker nodes (node kind "worker"):
        self.worker_ids: List[NodeID] = []
        # The head node (node kind "head")
        self.head_id: Optional[NodeID] = None

        # Bucket each node by the kind recorded in its provider tags.
        for node_id in self.all_node_ids:
            kind = provider.node_tags(node_id)[TAG_RAY_NODE_KIND]
            if kind == NODE_KIND_HEAD:
                self.head_id = node_id
            elif kind == NODE_KIND_WORKER:
                self.worker_ids.append(node_id)

        # Note: For typical use-cases, self.all_node_ids == self.worker_ids +
        # [self.head_id]. The difference being in the case of unmanaged nodes.

        # Record the time of the non_terminated nodes call. This typically
        # translates to a "describe" or "list" call on most cluster managers
        # which can be quite expensive. Note that we include the processing
        # time because on some clients, there may be pagination and the
        # underlying api calls may be done lazily.
        self.non_terminated_nodes_time = time.time() - fetch_start
        logger.info(
            f"The autoscaler took {round(self.non_terminated_nodes_time, 3)}"
            " seconds to fetch the list of non-terminated nodes."
        )

    def remove_terminating_nodes(self, terminating_nodes: List[NodeID]) -> None:
        """Remove nodes we're in the process of terminating from internal
        state."""
        self.worker_ids = [
            node for node in self.worker_ids if node not in terminating_nodes
        ]
        self.all_node_ids = [
            node for node in self.all_node_ids if node not in terminating_nodes
        ]
| 160 |
+
|
| 161 |
+
|
# Whether a worker should be kept based on the min_workers and
# max_workers constraints.
#
# keep: should keep the worker
# terminate: should terminate the worker
# decide_later: the worker can be terminated if needed
class KeepOrTerminate(Enum):
    keep = 1
    terminate = 2
    decide_later = 3
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class StandardAutoscaler:
|
| 172 |
+
"""The autoscaling control loop for a Ray cluster.
|
| 173 |
+
|
| 174 |
+
There are two ways to start an autoscaling cluster: manually by running
|
| 175 |
+
`ray start --head --autoscaling-config=/path/to/config.yaml` on a instance
|
| 176 |
+
that has permission to launch other instances, or you can also use `ray up
|
| 177 |
+
/path/to/config.yaml` from your laptop, which will configure the right
|
| 178 |
+
AWS/Cloud roles automatically. See the Ray documentation
|
| 179 |
+
(https://docs.ray.io/en/latest/) for a full definition of autoscaling behavior.
|
| 180 |
+
StandardAutoscaler's `update` method is periodically called in
|
| 181 |
+
`monitor.py`'s monitoring loop.
|
| 182 |
+
|
| 183 |
+
StandardAutoscaler is also used to bootstrap clusters (by adding workers
|
| 184 |
+
until the cluster size that can handle the resource demand is met).
|
| 185 |
+
"""
|
| 186 |
+
|
    def __init__(
        self,
        # TODO(ekl): require config reader to be a callable always.
        config_reader: Union[str, Callable[[], dict]],
        load_metrics: LoadMetrics,
        gcs_client: "ray._raylet.GcsClient",
        session_name: Optional[str] = None,
        max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
        max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
        process_runner: Any = subprocess,
        update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
        prefix_cluster_info: bool = False,
        event_summarizer: Optional[EventSummarizer] = None,
        prom_metrics: Optional[AutoscalerPrometheusMetrics] = None,
    ):
        """Create a StandardAutoscaler.

        Args:
            config_reader: Path to a Ray Autoscaler YAML, or a function to read
                and return the latest config.
            load_metrics: Provides metrics for the Ray cluster.
            session_name: The session name of the cluster this autoscaler
                is deployed.
            max_launch_batch: Max number of nodes to launch in one request.
            max_concurrent_launches: Max number of nodes that can be
                concurrently launched. This value and `max_launch_batch`
                determine the number of batches that are used to launch nodes.
            max_failures: Number of failures that the autoscaler will tolerate
                before exiting.
            process_runner: Subproc-like interface used by the CommandRunner.
            update_interval_s: Seconds between running the autoscaling loop.
            prefix_cluster_info: Whether to add the cluster name to info strs.
            event_summarizer: Utility to consolidate duplicated messages.
            prom_metrics: Prometheus metrics for autoscaler-related operations.
            gcs_client: client for interactions with the GCS. Used to drain nodes
                before termination.
        """

        if isinstance(config_reader, str):
            # Auto wrap with file reader.
            # `config_reader` is a path here; the file is re-read on every
            # call so later config edits are picked up by subsequent updates.
            def read_fn():
                with open(config_reader) as f:
                    new_config = yaml.safe_load(f.read())
                return new_config

            self.config_reader = read_fn
        else:
            self.config_reader = config_reader

        self.node_provider_availability_tracker = NodeProviderAvailabilityTracker()
        # Prefix each line of info string with cluster name if True
        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        # Keep this before self.reset (if an exception occurs in reset
        # then prom_metrics must be instantiated to increment the
        # exception counter)
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )  # noqa
        self.resource_demand_scheduler = None
        # reset() loads the config; self.config is read below.
        self.reset(errors_fatal=True)
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner
        self.event_summarizer = event_summarizer or EventSummarizer()

        # Map from node_id to NodeUpdater threads
        self.updaters: Dict[NodeID, NodeUpdaterThread] = {}
        self.num_failed_updates: Dict[NodeID, int] = defaultdict(int)
        self.num_successful_updates: Dict[NodeID, int] = defaultdict(int)
        # Count of failed update() iterations; compared to max_failures.
        self.num_failures: int = 0
        self.last_update_time: float = 0.0
        self.update_interval_s = update_interval_s

        # Keeps track of pending and running nodes
        self.non_terminated_nodes: Optional[NonTerminatedNodes] = None

        # Tracks nodes scheduled for termination
        self.nodes_to_terminate: List[NodeID] = []

        # Disable NodeUpdater threads if true.
        # Should be set to true in situations where another component, such as
        # a Kubernetes operator, is responsible for Ray setup on nodes.
        self.disable_node_updaters = self.config["provider"].get(
            DISABLE_NODE_UPDATERS_KEY, False
        )
        logger.info(f"{DISABLE_NODE_UPDATERS_KEY}:{self.disable_node_updaters}")

        # Disable launch config checking if true.
        # This is set in the fake_multinode situations where there isn't any
        # meaningful node "type" to enforce.
        self.disable_launch_config_check = self.config["provider"].get(
            DISABLE_LAUNCH_CONFIG_CHECK_KEY, False
        )
        logger.info(
            f"{DISABLE_LAUNCH_CONFIG_CHECK_KEY}:{self.disable_launch_config_check}"
        )

        # By default, the autoscaler launches nodes in batches asynchronously in
        # background threads.
        # When the following flag is set, that behavior is disabled, so that nodes
        # are launched in the main thread, all in one batch, blocking until all
        # NodeProvider.create_node calls have returned.
        self.foreground_node_launch = self.config["provider"].get(
            FOREGROUND_NODE_LAUNCH_KEY, False
        )
        logger.info(f"{FOREGROUND_NODE_LAUNCH_KEY}:{self.foreground_node_launch}")

        # By default, the autoscaler kills and/or tries to recover
        # a worker node if it hasn't produced a resource heartbeat in the last 30
        # seconds. The worker_liveness_check flag allows disabling this behavior in
        # settings where another component, such as a Kubernetes operator, is
        # responsible for healthchecks.
        self.worker_liveness_check = self.config["provider"].get(
            WORKER_LIVENESS_CHECK_KEY, True
        )
        logger.info(f"{WORKER_LIVENESS_CHECK_KEY}:{self.worker_liveness_check}")

        # Node launchers: exactly one of the two launch paths is set up,
        # selected by the foreground_node_launch flag above.
        self.foreground_node_launcher: Optional[BaseNodeLauncher] = None
        self.launch_queue: Optional[queue.Queue[NodeLaunchData]] = None
        self.pending_launches = ConcurrentCounter()
        if self.foreground_node_launch:
            self.foreground_node_launcher = BaseNodeLauncher(
                provider=self.provider,
                pending=self.pending_launches,
                event_summarizer=self.event_summarizer,
                node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagree how to format this.
                session_name=session_name,
                node_types=self.available_node_types,
                prom_metrics=self.prom_metrics,
            )
        else:
            self.launch_queue = queue.Queue()
            # Enough launcher threads to keep max_concurrent_launches in
            # flight when each launch request is at most max_launch_batch.
            max_batches = math.ceil(max_concurrent_launches / float(max_launch_batch))
            for i in range(int(max_batches)):
                node_launcher = NodeLauncher(
                    provider=self.provider,
                    queue=self.launch_queue,
                    index=i,
                    pending=self.pending_launches,
                    event_summarizer=self.event_summarizer,
                    node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagree how to format this.
                    session_name=session_name,
                    node_types=self.available_node_types,
                    prom_metrics=self.prom_metrics,
                )
                # Daemon threads: they must not block interpreter exit.
                node_launcher.daemon = True
                node_launcher.start()

        # NodeTracker maintains soft state to track the number of recently
        # failed nodes. It is best effort only.
        self.node_tracker = NodeTracker()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on different
        # platform and the expansion would result in wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        self.gcs_client = gcs_client

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)
        logger.info("StandardAutoscaler: {}".format(self.config))
| 360 |
+
|
| 361 |
+
@property
|
| 362 |
+
def all_node_types(self) -> Set[str]:
|
| 363 |
+
return self.config["available_node_types"].keys()
|
| 364 |
+
|
| 365 |
+
def update(self):
|
| 366 |
+
try:
|
| 367 |
+
self.reset(errors_fatal=False)
|
| 368 |
+
self._update()
|
| 369 |
+
except Exception as e:
|
| 370 |
+
self.prom_metrics.update_loop_exceptions.inc()
|
| 371 |
+
logger.exception("StandardAutoscaler: Error during autoscaling.")
|
| 372 |
+
self.num_failures += 1
|
| 373 |
+
if self.num_failures > self.max_failures:
|
| 374 |
+
logger.critical("StandardAutoscaler: Too many errors, abort.")
|
| 375 |
+
raise e
|
| 376 |
+
|
    def _update(self):
        """Run the body of a single autoscaling iteration.

        Fetches the non-terminated node list from the provider, terminates
        nodes to enforce config constraints, runs/monitors node updaters (or
        health checks when updaters are disabled), and launches any additional
        nodes required by current resource demand. Invocations are throttled
        to at most once per `self.update_interval_s` seconds.
        """
        # For type checking, assert that these objects have been instantiated.
        assert self.provider
        assert self.resource_demand_scheduler

        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now

        # Query the provider to update the list of non-terminated nodes
        self.non_terminated_nodes = NonTerminatedNodes(self.provider)

        # Back off the update if the provider says it's not safe to proceed.
        if not self.provider.safe_to_scale():
            logger.info(
                "Backing off of autoscaler update."
                f" Will try again in {self.update_interval_s} seconds."
            )
            return

        # This will accumulate the nodes we need to terminate.
        self.nodes_to_terminate = []

        # Update running nodes gauge
        num_workers = len(self.non_terminated_nodes.worker_ids)
        self.prom_metrics.running_workers.set(num_workers)

        # Remove from LoadMetrics the ips unknown to the NodeProvider.
        self.load_metrics.prune_active_ips(
            active_ips=[
                self.provider.internal_ip(node_id)
                for node_id in self.non_terminated_nodes.all_node_ids
            ]
        )

        # Update status strings
        if AUTOSCALER_STATUS_LOG:
            logger.info(self.info_string())
        legacy_log_info_string(self, self.non_terminated_nodes.worker_ids)

        if not self.provider.is_readonly():
            self.terminate_nodes_to_enforce_config_constraints(now)

        if self.disable_node_updaters:
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.terminate_unhealthy_nodes(now)
        else:
            self.process_completed_updates()
            self.update_nodes()
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.attempt_to_recover_unhealthy_nodes(now)
            self.set_prometheus_updater_data()

        # Compute what to launch next:
        # to_launch: Dict[NodeType, int], unfulfilled: List[ResourceDict]
        to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch(
            self.non_terminated_nodes.all_node_ids,
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests(),
            node_availability_summary=self.node_provider_availability_tracker.summary(),
        )
        self._report_pending_infeasible(unfulfilled)

        if not self.provider.is_readonly():
            self.launch_required_nodes(to_launch)

        # Execute optional end-of-update logic.
        # Keep this method call at the end of autoscaler._update().
        self.provider.post_process()

        # Record the amount of time the autoscaler took for
        # this _update() iteration.
        update_time = time.time() - self.last_update_time
        logger.info(
            f"The autoscaler took {round(update_time, 3)}"
            " seconds to complete the update iteration."
        )
        self.prom_metrics.update_time.observe(update_time)
| 466 |
+
|
| 467 |
+
def terminate_nodes_to_enforce_config_constraints(self, now: float):
|
| 468 |
+
"""Terminates nodes to enforce constraints defined by the autoscaling
|
| 469 |
+
config.
|
| 470 |
+
|
| 471 |
+
(1) Terminates nodes in excess of `max_workers`.
|
| 472 |
+
(2) Terminates nodes idle for longer than `idle_timeout_minutes`.
|
| 473 |
+
(3) Terminates outdated nodes,
|
| 474 |
+
namely nodes whose configs don't match `node_config` for the
|
| 475 |
+
relevant node type.
|
| 476 |
+
|
| 477 |
+
Avoids terminating non-outdated nodes required by
|
| 478 |
+
autoscaler.sdk.request_resources().
|
| 479 |
+
"""
|
| 480 |
+
# For type checking, assert that these objects have been instantitiated.
|
| 481 |
+
assert self.non_terminated_nodes
|
| 482 |
+
assert self.provider
|
| 483 |
+
|
| 484 |
+
last_used = self.load_metrics.ray_nodes_last_used_time_by_ip
|
| 485 |
+
|
| 486 |
+
idle_timeout_s = 60 * self.config["idle_timeout_minutes"]
|
| 487 |
+
|
| 488 |
+
last_used_cutoff = now - idle_timeout_s
|
| 489 |
+
|
| 490 |
+
# Sort based on last used to make sure to keep min_workers that
|
| 491 |
+
# were most recently used. Otherwise, _keep_min_workers_of_node_type
|
| 492 |
+
# might keep a node that should be terminated.
|
| 493 |
+
sorted_node_ids = self._sort_based_on_last_used(
|
| 494 |
+
self.non_terminated_nodes.worker_ids, last_used
|
| 495 |
+
)
|
| 496 |
+
|
| 497 |
+
# Don't terminate nodes needed by request_resources()
|
| 498 |
+
nodes_not_allowed_to_terminate: FrozenSet[NodeID] = {}
|
| 499 |
+
if self.load_metrics.get_resource_requests():
|
| 500 |
+
nodes_not_allowed_to_terminate = (
|
| 501 |
+
self._get_nodes_needed_for_request_resources(sorted_node_ids)
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
# Tracks counts of nodes we intend to keep for each node type.
|
| 505 |
+
node_type_counts = defaultdict(int)
|
| 506 |
+
|
| 507 |
+
def keep_node(node_id: NodeID) -> None:
|
| 508 |
+
assert self.provider
|
| 509 |
+
# Update per-type counts.
|
| 510 |
+
tags = self.provider.node_tags(node_id)
|
| 511 |
+
if TAG_RAY_USER_NODE_TYPE in tags:
|
| 512 |
+
node_type = tags[TAG_RAY_USER_NODE_TYPE]
|
| 513 |
+
node_type_counts[node_type] += 1
|
| 514 |
+
|
| 515 |
+
# Nodes that we could terminate, if needed.
|
| 516 |
+
nodes_we_could_terminate: List[NodeID] = []
|
| 517 |
+
|
| 518 |
+
for node_id in sorted_node_ids:
|
| 519 |
+
# Make sure to not kill idle node types if the number of workers
|
| 520 |
+
# of that type is lower/equal to the min_workers of that type
|
| 521 |
+
# or it is needed for request_resources().
|
| 522 |
+
should_keep_or_terminate, reason = self._keep_worker_of_node_type(
|
| 523 |
+
node_id, node_type_counts
|
| 524 |
+
)
|
| 525 |
+
if should_keep_or_terminate == KeepOrTerminate.terminate:
|
| 526 |
+
self.schedule_node_termination(node_id, reason, logger.info)
|
| 527 |
+
continue
|
| 528 |
+
if (
|
| 529 |
+
should_keep_or_terminate == KeepOrTerminate.keep
|
| 530 |
+
or node_id in nodes_not_allowed_to_terminate
|
| 531 |
+
) and self.launch_config_ok(node_id):
|
| 532 |
+
keep_node(node_id)
|
| 533 |
+
continue
|
| 534 |
+
|
| 535 |
+
node_ip = self.provider.internal_ip(node_id)
|
| 536 |
+
|
| 537 |
+
if node_ip in last_used and last_used[node_ip] < last_used_cutoff:
|
| 538 |
+
self.schedule_node_termination(node_id, "idle", logger.info)
|
| 539 |
+
# Get the local time of the node's last use as a string.
|
| 540 |
+
formatted_last_used_time = time.asctime(
|
| 541 |
+
time.localtime(last_used[node_ip])
|
| 542 |
+
)
|
| 543 |
+
logger.info(f"Node last used: {formatted_last_used_time}.")
|
| 544 |
+
# Note that the current time will appear in the log prefix.
|
| 545 |
+
elif not self.launch_config_ok(node_id):
|
| 546 |
+
self.schedule_node_termination(node_id, "outdated", logger.info)
|
| 547 |
+
else:
|
| 548 |
+
keep_node(node_id)
|
| 549 |
+
nodes_we_could_terminate.append(node_id)
|
| 550 |
+
|
| 551 |
+
# Terminate nodes if there are too many
|
| 552 |
+
num_workers = len(self.non_terminated_nodes.worker_ids)
|
| 553 |
+
num_extra_nodes_to_terminate = (
|
| 554 |
+
num_workers - len(self.nodes_to_terminate) - self.config["max_workers"]
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
|
| 558 |
+
logger.warning(
|
| 559 |
+
"StandardAutoscaler: trying to terminate "
|
| 560 |
+
f"{num_extra_nodes_to_terminate} nodes, while only "
|
| 561 |
+
f"{len(nodes_we_could_terminate)} are safe to terminate."
|
| 562 |
+
" Inconsistent config is likely."
|
| 563 |
+
)
|
| 564 |
+
num_extra_nodes_to_terminate = len(nodes_we_could_terminate)
|
| 565 |
+
|
| 566 |
+
# If num_extra_nodes_to_terminate is negative or zero,
|
| 567 |
+
# we would have less than max_workers nodes after terminating
|
| 568 |
+
# nodes_to_terminate and we do not need to terminate anything else.
|
| 569 |
+
if num_extra_nodes_to_terminate > 0:
|
| 570 |
+
extra_nodes_to_terminate = nodes_we_could_terminate[
|
| 571 |
+
-num_extra_nodes_to_terminate:
|
| 572 |
+
]
|
| 573 |
+
for node_id in extra_nodes_to_terminate:
|
| 574 |
+
self.schedule_node_termination(node_id, "max workers", logger.info)
|
| 575 |
+
|
| 576 |
+
self.terminate_scheduled_nodes()
|
| 577 |
+
|
| 578 |
+
def schedule_node_termination(
|
| 579 |
+
self, node_id: NodeID, reason_opt: Optional[str], logger_method: Callable
|
| 580 |
+
) -> None:
|
| 581 |
+
# For type checking, assert that this object has been instantitiated.
|
| 582 |
+
assert self.provider
|
| 583 |
+
|
| 584 |
+
if reason_opt is None:
|
| 585 |
+
raise Exception("reason should be not None.")
|
| 586 |
+
reason: str = reason_opt
|
| 587 |
+
node_ip = self.provider.internal_ip(node_id)
|
| 588 |
+
# Log, record an event, and add node_id to nodes_to_terminate.
|
| 589 |
+
logger_method(
|
| 590 |
+
"StandardAutoscaler: "
|
| 591 |
+
f"Terminating the node with id {node_id}"
|
| 592 |
+
f" and ip {node_ip}."
|
| 593 |
+
f" ({reason})"
|
| 594 |
+
)
|
| 595 |
+
self.event_summarizer.add(
|
| 596 |
+
"Removing {} nodes of type "
|
| 597 |
+
+ self._get_node_type(node_id)
|
| 598 |
+
+ " ({}).".format(reason),
|
| 599 |
+
quantity=1,
|
| 600 |
+
aggregate=operator.add,
|
| 601 |
+
)
|
| 602 |
+
self.nodes_to_terminate.append(node_id)
|
| 603 |
+
|
| 604 |
+
def terminate_scheduled_nodes(self):
|
| 605 |
+
"""Terminate scheduled nodes and clean associated autoscaler state."""
|
| 606 |
+
# For type checking, assert that these objects have been instantitiated.
|
| 607 |
+
assert self.provider
|
| 608 |
+
assert self.non_terminated_nodes
|
| 609 |
+
|
| 610 |
+
if not self.nodes_to_terminate:
|
| 611 |
+
return
|
| 612 |
+
|
| 613 |
+
# Drain the nodes
|
| 614 |
+
self.drain_nodes_via_gcs(self.nodes_to_terminate)
|
| 615 |
+
# Terminate the nodes
|
| 616 |
+
self.provider.terminate_nodes(self.nodes_to_terminate)
|
| 617 |
+
for node in self.nodes_to_terminate:
|
| 618 |
+
self.node_tracker.untrack(node)
|
| 619 |
+
self.prom_metrics.stopped_nodes.inc()
|
| 620 |
+
|
| 621 |
+
# Update internal node lists
|
| 622 |
+
self.non_terminated_nodes.remove_terminating_nodes(self.nodes_to_terminate)
|
| 623 |
+
|
| 624 |
+
self.nodes_to_terminate = []
|
| 625 |
+
|
| 626 |
+
    def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]):
        """Send an RPC request to the GCS to drain (prepare for termination)
        the nodes with the given node provider ids.

        note: The current implementation of DrainNode on the GCS side is to
        de-register and gracefully shut down the Raylets. In the future,
        the behavior may change to better reflect the name "Drain."
        See https://github.com/ray-project/ray/pull/19350.

        Args:
            provider_node_ids_to_drain: Node provider ids of the nodes that
                are about to be terminated.
        """
        # For type checking, assert that this object has been instantitiated.
        assert self.provider

        # The GCS expects Raylet ids in the request, rather than NodeProvider
        # ids. To get the Raylet ids of the nodes to we're draining, we make
        # the following translations of identifiers:
        # node provider node id -> ip -> raylet id

        # Convert node provider node ids to ips.
        node_ips = set()
        failed_ip_fetch = False
        for provider_node_id in provider_node_ids_to_drain:
            # If the provider's call to fetch ip fails, the exception is not
            # fatal. Log the exception and proceed.
            try:
                ip = self.provider.internal_ip(provider_node_id)
                node_ips.add(ip)
            except Exception:
                logger.exception(
                    "Failed to get ip of node with id"
                    f" {provider_node_id} during scale-down."
                )
                failed_ip_fetch = True
        if failed_ip_fetch:
            # Record the failure once per call, even if several fetches failed.
            self.prom_metrics.drain_node_exceptions.inc()

        # Only attempt to drain connected nodes, i.e. nodes with ips in
        # LoadMetrics.
        connected_node_ips = node_ips & self.load_metrics.raylet_id_by_ip.keys()

        # Convert ips to Raylet ids.
        # (The assignment ip->raylet_id is well-defined under current
        # assumptions. See "use_node_id_as_ip" in monitor.py)
        raylet_ids_to_drain = {
            self.load_metrics.raylet_id_by_ip[ip] for ip in connected_node_ips
        }

        # Nothing connected to drain; the provider can terminate directly.
        if not raylet_ids_to_drain:
            return

        logger.info(f"Draining {len(raylet_ids_to_drain)} raylet(s).")
        try:
            # A successful response indicates that the GCS has marked the
            # desired nodes as "drained." The cloud provider can then terminate
            # the nodes without the GCS printing an error.
            # Check if we succeeded in draining all of the intended nodes by
            # looking at the RPC response.
            drained_raylet_ids = set(
                self.gcs_client.drain_nodes(raylet_ids_to_drain, timeout=5)
            )
            failed_to_drain = raylet_ids_to_drain - drained_raylet_ids
            if failed_to_drain:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.error(f"Failed to drain {len(failed_to_drain)} raylet(s).")
        # If we get a gRPC error with an UNIMPLEMENTED code, fail silently.
        # This error indicates that the GCS is using Ray version < 1.8.0,
        # for which DrainNode is not implemented.
        except RpcError as e:
            # If the code is UNIMPLEMENTED, pass.
            if e.rpc_code == ray._raylet.GRPC_STATUS_CODE_UNIMPLEMENTED:
                pass
            # Otherwise, it's a plain old gRPC error and we should log it.
            else:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.exception("Failed to drain Ray nodes. Traceback follows.")
        except Exception:
            # We don't need to interrupt the autoscaler update with an
            # exception, but we should log what went wrong and record the
            # failure in Prometheus.
            self.prom_metrics.drain_node_exceptions.inc()
            logger.exception("Failed to drain Ray nodes. Traceback follows.")
+
|
| 707 |
+
def launch_required_nodes(self, to_launch: Dict[NodeType, int]) -> None:
|
| 708 |
+
if to_launch:
|
| 709 |
+
for node_type, count in to_launch.items():
|
| 710 |
+
self.launch_new_node(count, node_type=node_type)
|
| 711 |
+
|
| 712 |
+
    def update_nodes(self):
        """Run NodeUpdaterThreads to run setup commands, sync files,
        and/or start Ray.
        """
        # Update nodes with out-of-date files.
        # TODO(edoakes): Spawning these threads directly seems to cause
        # problems. They should at a minimum be spawned as daemon threads.
        # See https://github.com/ray-project/ray/pull/5903 for more info.
        T = []
        # should_update returns UpdateInstructions; its node_id field is None
        # when no update is needed, so those entries are skipped below.
        for node_id, setup_commands, ray_start_commands, docker_config in (
            self.should_update(node_id)
            for node_id in self.non_terminated_nodes.worker_ids
        ):
            if node_id is not None:
                resources = self._node_resources(node_id)
                labels = self._node_labels(node_id)
                logger.debug(f"{node_id}: Starting new thread runner.")
                T.append(
                    threading.Thread(
                        target=self.spawn_updater,
                        args=(
                            node_id,
                            setup_commands,
                            ray_start_commands,
                            resources,
                            labels,
                            docker_config,
                        ),
                    )
                )
        # Start every updater-spawning thread, then block until all of them
        # have finished spawning their NodeUpdaterThreads.
        for t in T:
            t.start()
        for t in T:
            t.join()
+
|
| 747 |
+
    def process_completed_updates(self):
        """Clean up completed NodeUpdaterThreads.

        Updates success/failure counters and Prometheus metrics for each
        finished updater, and schedules termination for nodes whose update
        failed.
        """
        # Collect updaters whose threads have finished running.
        completed_nodes = []
        for node_id, updater in self.updaters.items():
            if not updater.is_alive():
                completed_nodes.append(node_id)
        if completed_nodes:
            failed_nodes = []
            for node_id in completed_nodes:
                updater = self.updaters[node_id]
                if updater.exitcode == 0:
                    self.num_successful_updates[node_id] += 1
                    self.prom_metrics.successful_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.successful_recoveries.inc()
                    if updater.update_time:
                        self.prom_metrics.worker_update_time.observe(
                            updater.update_time
                        )
                    # Mark the node as active to prevent the node recovery
                    # logic immediately trying to restart Ray on the new node.
                    self.load_metrics.mark_active(self.provider.internal_ip(node_id))
                else:
                    failed_nodes.append(node_id)
                    self.num_failed_updates[node_id] += 1
                    self.prom_metrics.failed_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.failed_recoveries.inc()
                    self.node_tracker.untrack(node_id)
                del self.updaters[node_id]

            if failed_nodes:
                # Some nodes in failed_nodes may already have been terminated
                # during an update (for being idle after missing a heartbeat).

                # Update the list of non-terminated workers.
                for node_id in failed_nodes:
                    # Check if the node has already been terminated.
                    if node_id in self.non_terminated_nodes.worker_ids:
                        self.schedule_node_termination(
                            node_id, "launch failed", logger.error
                        )
                    else:
                        logger.warning(
                            f"StandardAutoscaler: {node_id}:"
                            " Failed to update node."
                            " Node has already been terminated."
                        )
                self.terminate_scheduled_nodes()
+
|
| 797 |
+
def set_prometheus_updater_data(self):
|
| 798 |
+
"""Record total number of active NodeUpdaterThreads and how many of
|
| 799 |
+
these are being run to recover nodes.
|
| 800 |
+
"""
|
| 801 |
+
self.prom_metrics.updating_nodes.set(len(self.updaters))
|
| 802 |
+
num_recovering = 0
|
| 803 |
+
for updater in self.updaters.values():
|
| 804 |
+
if updater.for_recovery:
|
| 805 |
+
num_recovering += 1
|
| 806 |
+
self.prom_metrics.recovering_nodes.set(num_recovering)
|
| 807 |
+
|
| 808 |
+
    def _report_pending_infeasible(self, unfulfilled: List[ResourceDict]):
        """Emit event messages for infeasible or unschedulable tasks.

        This adds messages to the event summarizer for warning on infeasible
        or "cluster full" resource requests.

        Args:
            unfulfilled: List of resource demands that would be unfulfilled
                even after full scale-up.
        """
        # For type checking, assert that this object has been instantitiated.
        assert self.resource_demand_scheduler
        pending = []
        infeasible = []
        for bundle in unfulfilled:
            # Skip placement group bundles — presumably they are reported
            # through a separate mechanism (TODO confirm).
            placement_group = any(
                "_group_" in k
                or k == ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME
                for k in bundle
            )
            if placement_group:
                continue
            # "Feasible" means some configured node type could satisfy the
            # bundle; such demands are merely pending, not impossible.
            if self.resource_demand_scheduler.is_feasible(bundle):
                pending.append(bundle)
            else:
                infeasible.append(bundle)
        if pending:
            if self.load_metrics.cluster_full_of_actors_detected:
                for request in pending:
                    # Rate-limited (30s) per distinct resource shape.
                    self.event_summarizer.add_once_per_interval(
                        "Warning: The following resource request cannot be "
                        "scheduled right now: {}. This is likely due to all "
                        "cluster resources being claimed by actors. Consider "
                        "creating fewer actors or adding more nodes "
                        "to this Ray cluster.".format(request),
                        key="pending_{}".format(sorted(request.items())),
                        interval_s=30,
                    )
        if infeasible:
            for request in infeasible:
                # Rate-limited (30s) per distinct resource shape.
                self.event_summarizer.add_once_per_interval(
                    "Error: No available node types can fulfill resource "
                    "request {}. Add suitable node types to this cluster to "
                    "resolve this issue.".format(request),
                    key="infeasible_{}".format(sorted(request.items())),
                    interval_s=30,
                )
+
|
| 856 |
+
def _sort_based_on_last_used(
|
| 857 |
+
self, nodes: List[NodeID], last_used: Dict[str, float]
|
| 858 |
+
) -> List[NodeID]:
|
| 859 |
+
"""Sort the nodes based on the last time they were used.
|
| 860 |
+
|
| 861 |
+
The first item in the return list is the most recently used.
|
| 862 |
+
"""
|
| 863 |
+
last_used_copy = copy.deepcopy(last_used)
|
| 864 |
+
# Add the unconnected nodes as the least recently used (the end of
|
| 865 |
+
# list). This prioritizes connected nodes.
|
| 866 |
+
least_recently_used = -1
|
| 867 |
+
|
| 868 |
+
def last_time_used(node_id: NodeID):
|
| 869 |
+
assert self.provider
|
| 870 |
+
node_ip = self.provider.internal_ip(node_id)
|
| 871 |
+
if node_ip not in last_used_copy:
|
| 872 |
+
return least_recently_used
|
| 873 |
+
else:
|
| 874 |
+
return last_used_copy[node_ip]
|
| 875 |
+
|
| 876 |
+
return sorted(nodes, key=last_time_used, reverse=True)
|
| 877 |
+
|
| 878 |
+
    def _get_nodes_needed_for_request_resources(
        self, sorted_node_ids: List[NodeID]
    ) -> FrozenSet[NodeID]:
        # TODO(ameer): try merging this with resource_demand_scheduler
        # code responsible for adding nodes for request_resources().
        """Returns the nodes NOT allowed to terminate due to request_resources().

        Args:
            sorted_node_ids: the node ids sorted based on last used (LRU last).

        Returns:
            FrozenSet[NodeID]: a set of nodes (node ids) that
                we should NOT terminate.
        """
        # For type checking, assert that this object has been instantitiated.
        assert self.provider

        nodes_not_allowed_to_terminate: Set[NodeID] = set()
        static_node_resources: Dict[
            NodeIP, ResourceDict
        ] = self.load_metrics.get_static_node_resources_by_ip()

        head_node_resources: ResourceDict = copy.deepcopy(
            self.available_node_types[self.config["head_node_type"]]["resources"]
        )
        # TODO(ameer): this is somewhat duplicated in
        # resource_demand_scheduler.py.
        if not head_node_resources:
            # Legacy yaml might include {} in the resources field.
            head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
            head_node_resources = static_node_resources.get(head_node_ip, {})

        # Build the list of per-node resource totals, head node first, then
        # workers in sorted (LRU-last) order.
        max_node_resources: List[ResourceDict] = [head_node_resources]
        resource_demand_vector_worker_node_ids = []
        # Get max resources on all the non terminated nodes.
        for node_id in sorted_node_ids:
            tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in tags:
                node_type = tags[TAG_RAY_USER_NODE_TYPE]
                node_resources: ResourceDict = copy.deepcopy(
                    self.available_node_types[node_type]["resources"]
                )
                if not node_resources:
                    # Legacy yaml might include {} in the resources field.
                    node_ip = self.provider.internal_ip(node_id)
                    node_resources = static_node_resources.get(node_ip, {})
                max_node_resources.append(node_resources)
                resource_demand_vector_worker_node_ids.append(node_id)
        # Since it is sorted based on last used, we "keep" nodes that are
        # most recently used when we binpack. We assume get_bin_pack_residual
        # is following the given order here.
        used_resource_requests: List[ResourceDict]
        _, used_resource_requests = get_bin_pack_residual(
            max_node_resources, self.load_metrics.get_resource_requests()
        )
        # Remove the first entry (the head node).
        max_node_resources.pop(0)
        # Remove the first entry (the head node).
        used_resource_requests.pop(0)
        # After the pops, index i in both lists corresponds to
        # resource_demand_vector_worker_node_ids[i].
        for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
            if (
                used_resource_requests[i] == max_node_resources[i]
                and max_node_resources[i]
            ):
                # No resources of the node were needed for request_resources().
                # max_node_resources[i] is an empty dict for legacy yamls
                # before the node is connected.
                pass
            else:
                nodes_not_allowed_to_terminate.add(node_id)
        return frozenset(nodes_not_allowed_to_terminate)
+
|
| 950 |
+
    def _keep_worker_of_node_type(
        self, node_id: NodeID, node_type_counts: Dict[NodeType, int]
    ) -> Tuple[KeepOrTerminate, Optional[str]]:
        """Determines if a worker should be kept based on the min_workers
        and max_workers constraint of the worker's node_type.

        Returns KeepOrTerminate.keep when both of the following hold:
        (a) The worker's node_type is present among the keys of the current
            config's available_node_types dict.
        (b) Deleting the node would violate the min_workers constraint for that
            worker's node_type.

        Returns KeepOrTerminate.terminate when both the following hold:
        (a) The worker's node_type is not present among the keys of the current
            config's available_node_types dict.
        (b) Keeping the node would violate the max_workers constraint for that
            worker's node_type.

        Return KeepOrTerminate.decide_later otherwise.

        Args:
            node_type_counts(Dict[NodeType, int]): The non_terminated node
                types counted so far.
        Returns:
            KeepOrTerminate: keep if the node should be kept, terminate if the
                node should be terminated, decide_later if we are allowed
                to terminate it, but do not have to.
            Optional[str]: reason for termination. Not None on
                KeepOrTerminate.terminate, None otherwise.
        """
        # For type checking, assert that this object has been instantitiated.
        assert self.provider

        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
            node_type = tags[TAG_RAY_USER_NODE_TYPE]

            # .get() defaults tolerate node types that have been removed from
            # the config; the explicit membership check below handles that case.
            min_workers = self.available_node_types.get(node_type, {}).get(
                "min_workers", 0
            )
            max_workers = self.available_node_types.get(node_type, {}).get(
                "max_workers", 0
            )
            if node_type not in self.available_node_types:
                # The node type has been deleted from the cluster config.
                # Allow terminating it if needed.
                available_node_types = list(self.available_node_types.keys())
                return (
                    KeepOrTerminate.terminate,
                    f"not in available_node_types: {available_node_types}",
                )
            # Count this node; keep it only while the count is still required
            # by min_workers (capped by max_workers in case of a
            # min_workers > max_workers misconfiguration).
            new_count = node_type_counts[node_type] + 1
            if new_count <= min(min_workers, max_workers):
                return KeepOrTerminate.keep, None
            if new_count > max_workers:
                return KeepOrTerminate.terminate, "max_workers_per_type"

        return KeepOrTerminate.decide_later, None
+
|
| 1009 |
+
def _node_resources(self, node_id):
|
| 1010 |
+
node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
|
| 1011 |
+
if self.available_node_types:
|
| 1012 |
+
return self.available_node_types.get(node_type, {}).get("resources", {})
|
| 1013 |
+
else:
|
| 1014 |
+
return {}
|
| 1015 |
+
|
| 1016 |
+
def _node_labels(self, node_id):
|
| 1017 |
+
node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
|
| 1018 |
+
if self.available_node_types:
|
| 1019 |
+
return self.available_node_types.get(node_type, {}).get("labels", {})
|
| 1020 |
+
else:
|
| 1021 |
+
return {}
|
| 1022 |
+
|
| 1023 |
+
    def reset(self, errors_fatal=False):
        """Re-read the cluster config and refresh derived autoscaler state.

        Re-reads the config via ``self.config_reader``, re-validates it,
        recomputes the runtime/file-mounts hashes, and (re)initializes the
        node provider and resource demand scheduler.

        Args:
            errors_fatal: If True, re-raise any exception encountered while
                reloading the config; otherwise log it and continue with the
                previous state.
        """
        sync_continuously = False
        if hasattr(self, "config"):
            sync_continuously = self.config.get("file_mounts_sync_continuously", False)
        try:
            new_config = self.config_reader()
            # Only validate when the config actually changed since last reset.
            if new_config != getattr(self, "config", None):
                try:
                    validate_config(new_config)
                except Exception as e:
                    # Validation failure is non-fatal: an older cluster-side
                    # Ray may simply not know newer config fields.
                    self.prom_metrics.config_validation_exceptions.inc()
                    logger.debug(
                        "Cluster config validation failed. The version of "
                        "the ray CLI you launched this cluster with may "
                        "be higher than the version of ray being run on "
                        "the cluster. Some new features may not be "
                        "available until you upgrade ray on your cluster.",
                        exc_info=e,
                    )
            logger.debug(
                f"New config after validation: {new_config},"
                f" of type: {type(new_config)}"
            )
            (new_runtime_hash, new_file_mounts_contents_hash) = hash_runtime_conf(
                new_config["file_mounts"],
                new_config["cluster_synced_files"],
                [
                    new_config["worker_setup_commands"],
                    new_config["worker_start_ray_commands"],
                ],
                generate_file_mounts_contents_hash=sync_continuously,
            )
            self.config = new_config
            self.runtime_hash = new_runtime_hash
            self.file_mounts_contents_hash = new_file_mounts_contents_hash
            if not self.provider:
                self.provider = _get_node_provider(
                    self.config["provider"], self.config["cluster_name"]
                )

            # If using the LocalNodeProvider, make sure the head node is marked
            # non-terminated.
            if isinstance(self.provider, LocalNodeProvider):
                record_local_head_state_if_needed(self.provider)

            self.available_node_types = self.config["available_node_types"]
            # Resolve upscaling_speed, honoring two legacy config knobs
            # (autoscaling_mode and target_utilization_fraction).
            upscaling_speed = self.config.get("upscaling_speed")
            aggressive = self.config.get("autoscaling_mode") == "aggressive"
            target_utilization_fraction = self.config.get("target_utilization_fraction")
            if upscaling_speed:
                upscaling_speed = float(upscaling_speed)
            # TODO(ameer): consider adding (if users ask) an option of
            # initial_upscaling_num_workers.
            elif aggressive:
                upscaling_speed = 99999
                logger.warning(
                    "Legacy aggressive autoscaling mode "
                    "detected. Replacing it by setting upscaling_speed to "
                    "99999."
                )
            elif target_utilization_fraction:
                # max(..., 0.001) avoids division by zero for a 0 fraction.
                upscaling_speed = 1 / max(target_utilization_fraction, 0.001) - 1
                logger.warning(
                    "Legacy target_utilization_fraction config "
                    "detected. Replacing it by setting upscaling_speed to "
                    + "1 / target_utilization_fraction - 1."
                )
            else:
                upscaling_speed = 1.0
            if self.resource_demand_scheduler:
                # The node types are autofilled internally for legacy yamls,
                # overwriting the class will remove the inferred node resources
                # for legacy yamls.
                self.resource_demand_scheduler.reset_config(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )
            else:
                self.resource_demand_scheduler = ResourceDemandScheduler(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )

        except Exception as e:
            self.prom_metrics.reset_exceptions.inc()
            if errors_fatal:
                raise e
            else:
                logger.exception("StandardAutoscaler: Error parsing config.")
+
|
| 1119 |
+
def launch_config_ok(self, node_id):
|
| 1120 |
+
if self.disable_launch_config_check:
|
| 1121 |
+
return True
|
| 1122 |
+
node_tags = self.provider.node_tags(node_id)
|
| 1123 |
+
tag_launch_conf = node_tags.get(TAG_RAY_LAUNCH_CONFIG)
|
| 1124 |
+
node_type = node_tags.get(TAG_RAY_USER_NODE_TYPE)
|
| 1125 |
+
if node_type not in self.available_node_types:
|
| 1126 |
+
# The node type has been deleted from the cluster config.
|
| 1127 |
+
# Don't keep the node.
|
| 1128 |
+
return False
|
| 1129 |
+
|
| 1130 |
+
# The `worker_nodes` field is deprecated in favor of per-node-type
|
| 1131 |
+
# node_configs. We allow it for backwards-compatibility.
|
| 1132 |
+
launch_config = copy.deepcopy(self.config.get("worker_nodes", {}))
|
| 1133 |
+
if node_type:
|
| 1134 |
+
launch_config.update(
|
| 1135 |
+
self.config["available_node_types"][node_type]["node_config"]
|
| 1136 |
+
)
|
| 1137 |
+
calculated_launch_hash = hash_launch_conf(launch_config, self.config["auth"])
|
| 1138 |
+
|
| 1139 |
+
if calculated_launch_hash != tag_launch_conf:
|
| 1140 |
+
return False
|
| 1141 |
+
return True
|
| 1142 |
+
|
| 1143 |
+
def files_up_to_date(self, node_id):
|
| 1144 |
+
node_tags = self.provider.node_tags(node_id)
|
| 1145 |
+
applied_config_hash = node_tags.get(TAG_RAY_RUNTIME_CONFIG)
|
| 1146 |
+
applied_file_mounts_contents_hash = node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
|
| 1147 |
+
if applied_config_hash != self.runtime_hash or (
|
| 1148 |
+
self.file_mounts_contents_hash is not None
|
| 1149 |
+
and self.file_mounts_contents_hash != applied_file_mounts_contents_hash
|
| 1150 |
+
):
|
| 1151 |
+
logger.info(
|
| 1152 |
+
"StandardAutoscaler: "
|
| 1153 |
+
"{}: Runtime state is ({},{}), want ({},{})".format(
|
| 1154 |
+
node_id,
|
| 1155 |
+
applied_config_hash,
|
| 1156 |
+
applied_file_mounts_contents_hash,
|
| 1157 |
+
self.runtime_hash,
|
| 1158 |
+
self.file_mounts_contents_hash,
|
| 1159 |
+
)
|
| 1160 |
+
)
|
| 1161 |
+
return False
|
| 1162 |
+
return True
|
| 1163 |
+
|
| 1164 |
+
def heartbeat_on_time(self, node_id: NodeID, now: float) -> bool:
|
| 1165 |
+
"""Determine whether we've received a heartbeat from a node within the
|
| 1166 |
+
last AUTOSCALER_HEARTBEAT_TIMEOUT_S seconds.
|
| 1167 |
+
"""
|
| 1168 |
+
# For type checking, assert that this object has been instantitiated.
|
| 1169 |
+
assert self.provider
|
| 1170 |
+
|
| 1171 |
+
key = self.provider.internal_ip(node_id)
|
| 1172 |
+
|
| 1173 |
+
if key in self.load_metrics.last_heartbeat_time_by_ip:
|
| 1174 |
+
last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key]
|
| 1175 |
+
delta = now - last_heartbeat_time
|
| 1176 |
+
if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S:
|
| 1177 |
+
return True
|
| 1178 |
+
return False
|
| 1179 |
+
|
| 1180 |
+
    def terminate_unhealthy_nodes(self, now: float):
        """Terminate nodes for which we haven't received a heartbeat on time.

        Only up-to-date workers are considered; pending or failed nodes are
        left alone. Scheduled terminations are executed before returning.

        Args:
            now: Current time (seconds), compared against last heartbeats.
        """
        # For type checking, assert that these objects have been instantitiated.
        assert self.provider
        assert self.non_terminated_nodes

        for node_id in self.non_terminated_nodes.worker_ids:
            node_status = self.provider.node_tags(node_id)[TAG_RAY_NODE_STATUS]
            # We're not responsible for taking down
            # nodes with pending or failed status:
            if not node_status == STATUS_UP_TO_DATE:
                continue
            # This node is up-to-date. If it hasn't had the chance to produce
            # a heartbeat, fake the heartbeat now (see logic for completed node
            # updaters).
            ip = self.provider.internal_ip(node_id)
            if ip not in self.load_metrics.last_heartbeat_time_by_ip:
                self.load_metrics.mark_active(ip)
            # Heartbeat indicates node is healthy:
            if self.heartbeat_on_time(node_id, now):
                continue
            # No recent heartbeat: queue this node for termination.
            self.schedule_node_termination(
                node_id, "lost contact with raylet", logger.warning
            )
        self.terminate_scheduled_nodes()
+
|
| 1208 |
+
def attempt_to_recover_unhealthy_nodes(self, now):
|
| 1209 |
+
for node_id in self.non_terminated_nodes.worker_ids:
|
| 1210 |
+
self.recover_if_needed(node_id, now)
|
| 1211 |
+
|
| 1212 |
+
    def recover_if_needed(self, node_id, now):
        """Restart Ray on a node whose heartbeat has gone missing.

        If the node is updatable and its heartbeat is late, spawns a
        NodeUpdaterThread (with no setup commands or file mounts) that only
        re-runs the Ray start commands, and registers it in ``self.updaters``.

        Args:
            node_id: Node provider id of the worker to check.
            now: Current time (seconds), compared against last heartbeats.
        """
        if not self.can_update(node_id):
            return
        if self.heartbeat_on_time(node_id, now):
            return

        logger.warning(
            "StandardAutoscaler: "
            "{}: No recent heartbeat, "
            "restarting Ray to recover...".format(node_id)
        )
        self.event_summarizer.add(
            "Restarting {} nodes of type "
            + self._get_node_type(node_id)
            + " (lost contact with raylet).",
            quantity=1,
            aggregate=operator.add,
        )
        head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
        # Recovery updater: skip file mounts and setup, only restart Ray.
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=self.config["provider"],
            provider=self.provider,
            auth_config=self.config["auth"],
            cluster_name=self.config["cluster_name"],
            file_mounts={},
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=with_head_node_ip(
                self.config["worker_start_ray_commands"], head_node_ip
            ),
            runtime_hash=self.runtime_hash,
            file_mounts_contents_hash=self.file_mounts_contents_hash,
            process_runner=self.process_runner,
            use_internal_ip=True,
            is_head_node=False,
            docker_config=self.config.get("docker"),
            node_resources=self._node_resources(node_id),
            node_labels=self._node_labels(node_id),
            for_recovery=True,
        )
        updater.start()
        self.updaters[node_id] = updater
+
|
| 1256 |
+
def _get_node_type(self, node_id: str) -> str:
|
| 1257 |
+
# For type checking, assert that this object has been instantitiated.
|
| 1258 |
+
assert self.provider
|
| 1259 |
+
|
| 1260 |
+
node_tags = self.provider.node_tags(node_id)
|
| 1261 |
+
if TAG_RAY_USER_NODE_TYPE in node_tags:
|
| 1262 |
+
return node_tags[TAG_RAY_USER_NODE_TYPE]
|
| 1263 |
+
else:
|
| 1264 |
+
return "unknown_node_type"
|
| 1265 |
+
|
| 1266 |
+
def _get_node_type_specific_fields(self, node_id: str, fields_key: str) -> Any:
|
| 1267 |
+
# For type checking, assert that this object has been instantitiated.
|
| 1268 |
+
assert self.provider
|
| 1269 |
+
|
| 1270 |
+
fields = self.config[fields_key]
|
| 1271 |
+
node_tags = self.provider.node_tags(node_id)
|
| 1272 |
+
if TAG_RAY_USER_NODE_TYPE in node_tags:
|
| 1273 |
+
node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
|
| 1274 |
+
if node_type not in self.available_node_types:
|
| 1275 |
+
raise ValueError(f"Unknown node type tag: {node_type}.")
|
| 1276 |
+
node_specific_config = self.available_node_types[node_type]
|
| 1277 |
+
if fields_key in node_specific_config:
|
| 1278 |
+
fields = node_specific_config[fields_key]
|
| 1279 |
+
return fields
|
| 1280 |
+
|
| 1281 |
+
def _get_node_specific_docker_config(self, node_id):
|
| 1282 |
+
if "docker" not in self.config:
|
| 1283 |
+
return {}
|
| 1284 |
+
docker_config = copy.deepcopy(self.config.get("docker", {}))
|
| 1285 |
+
node_specific_docker = self._get_node_type_specific_fields(node_id, "docker")
|
| 1286 |
+
docker_config.update(node_specific_docker)
|
| 1287 |
+
return docker_config
|
| 1288 |
+
|
| 1289 |
+
def should_update(self, node_id):
    """Decide whether (and how) a worker node should be updated.

    Returns an ``UpdateInstructions`` tuple; all-``None`` fields signal
    that no update should be performed.
    """
    if not self.can_update(node_id):
        return UpdateInstructions(None, None, None, None)  # no update

    status = self.provider.node_tags(node_id).get(TAG_RAY_NODE_STATUS)
    if status == STATUS_UP_TO_DATE and self.files_up_to_date(node_id):
        return UpdateInstructions(None, None, None, None)  # no update

    updated_before = self.num_successful_updates.get(node_id, 0) > 0
    if updated_before and self.config.get("restart_only", False):
        # Node already set up once: only restart ray.
        setup_commands = []
        ray_start_commands = self.config["worker_start_ray_commands"]
    elif updated_before and self.config.get("no_restart", False):
        # Re-run setup but leave the running ray processes untouched.
        setup_commands = self._get_node_type_specific_fields(
            node_id, "worker_setup_commands"
        )
        ray_start_commands = []
    else:
        setup_commands = self._get_node_type_specific_fields(
            node_id, "worker_setup_commands"
        )
        ray_start_commands = self.config["worker_start_ray_commands"]

    return UpdateInstructions(
        node_id=node_id,
        setup_commands=setup_commands,
        ray_start_commands=ray_start_commands,
        docker_config=self._get_node_specific_docker_config(node_id),
    )
|
| 1319 |
+
|
| 1320 |
+
def spawn_updater(
    self,
    node_id,
    setup_commands,
    ray_start_commands,
    node_resources,
    node_labels,
    docker_config,
):
    """Create, start, and register a NodeUpdaterThread for a worker node."""
    logger.info(
        f"Creating new (spawn_updater) updater thread for node" f" {node_id}."
    )
    ip = self.provider.internal_ip(node_id)
    self.node_tracker.track(node_id, ip, self._get_node_type(node_id))
    head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
    # Initialization commands are node-type specific and get the head node
    # IP injected, same as the setup/ray-start commands below.
    init_commands = with_head_node_ip(
        self._get_node_type_specific_fields(node_id, "initialization_commands"),
        head_node_ip,
    )
    updater = NodeUpdaterThread(
        node_id=node_id,
        provider_config=self.config["provider"],
        provider=self.provider,
        auth_config=self.config["auth"],
        cluster_name=self.config["cluster_name"],
        file_mounts=self.config["file_mounts"],
        initialization_commands=init_commands,
        setup_commands=with_head_node_ip(setup_commands, head_node_ip),
        ray_start_commands=with_head_node_ip(ray_start_commands, head_node_ip),
        runtime_hash=self.runtime_hash,
        file_mounts_contents_hash=self.file_mounts_contents_hash,
        is_head_node=False,
        cluster_synced_files=self.config["cluster_synced_files"],
        rsync_options={
            "rsync_exclude": self.config.get("rsync_exclude"),
            "rsync_filter": self.config.get("rsync_filter"),
        },
        process_runner=self.process_runner,
        use_internal_ip=True,
        docker_config=docker_config,
        node_resources=node_resources,
        node_labels=node_labels,
    )
    updater.start()
    self.updaters[node_id] = updater
|
| 1365 |
+
|
| 1366 |
+
def can_update(self, node_id):
    """Return True when it is safe to launch an updater for ``node_id``.

    Updates are refused when node updaters are disabled, an updater is
    already running for this node, the node's launch config is stale, or a
    previous update of this node failed.
    """
    if self.disable_node_updaters:
        return False
    if node_id in self.updaters:
        return False
    if not self.launch_config_ok(node_id):
        return False
    # TODO(ekl) retry failed updates instead of giving up?
    if self.num_failed_updates.get(node_id, 0) > 0:
        return False
    logger.debug(
        f"{node_id} is not being updated and "
        "passes config check (can_update=True)."
    )
    return True
|
| 1380 |
+
|
| 1381 |
+
def launch_new_node(self, count: int, node_type: str) -> None:
    """Queue ``count`` new nodes of ``node_type`` for launch."""
    logger.info("StandardAutoscaler: Queue {} new nodes for launch".format(count))
    self.pending_launches.inc(node_type, count)
    config = copy.deepcopy(self.config)
    if self.foreground_node_launch:
        assert self.foreground_node_launcher is not None
        # Synchronous path: launch in the main thread and block.
        self.foreground_node_launcher.launch_node(config, count, node_type)
        return
    assert self.launch_queue is not None
    # Asynchronous path: split into launch requests of at most
    # max_launch_batch and enqueue them for the background threads.
    remaining = count
    while remaining > 0:
        batch = min(remaining, self.max_launch_batch)
        self.launch_queue.put((config, batch, node_type))
        remaining -= self.max_launch_batch
|
| 1398 |
+
|
| 1399 |
+
def kill_workers(self):
|
| 1400 |
+
logger.error("StandardAutoscaler: kill_workers triggered")
|
| 1401 |
+
nodes = self.workers()
|
| 1402 |
+
if nodes:
|
| 1403 |
+
self.provider.terminate_nodes(nodes)
|
| 1404 |
+
for node in nodes:
|
| 1405 |
+
self.node_tracker.untrack(node)
|
| 1406 |
+
self.prom_metrics.stopped_nodes.inc()
|
| 1407 |
+
logger.error("StandardAutoscaler: terminated {} node(s)".format(len(nodes)))
|
| 1408 |
+
|
| 1409 |
+
def summary(self) -> Optional[AutoscalerSummary]:
    """Summarizes the active, pending, and failed node launches.

    An active node is a node whose raylet is actively reporting heartbeats.
    A pending node is a non-active node whose node tag is uninitialized,
    waiting for ssh, syncing files, or setting up.
    If a node is not pending or active, it is failed.

    Returns:
        AutoscalerSummary: The summary.
    """
    # For type checking: the provider must already have been instantiated.
    assert self.provider

    if not self.non_terminated_nodes:
        return None
    active_nodes: Dict[NodeType, int] = Counter()
    pending_nodes = []
    failed_nodes = []
    non_failed = set()

    node_type_mapping = {}

    required_tags = (
        TAG_RAY_NODE_KIND,
        TAG_RAY_USER_NODE_TYPE,
        TAG_RAY_NODE_STATUS,
    )
    for node_id in self.non_terminated_nodes.all_node_ids:
        ip = self.provider.internal_ip(node_id)
        node_tags = self.provider.node_tags(node_id)

        # In some node providers, creation of a node and its tags is not
        # atomic, so just skip nodes that are not fully tagged yet.
        if not all(tag in node_tags for tag in required_tags):
            continue

        if node_tags[TAG_RAY_NODE_KIND] == NODE_KIND_UNMANAGED:
            continue
        node_type = node_tags[TAG_RAY_USER_NODE_TYPE]

        node_type_mapping[ip] = node_type

        # TODO (Alex): If a node's raylet has died, it shouldn't be marked
        # as active.
        if self.load_metrics.is_active(ip):
            active_nodes[node_type] += 1
            non_failed.add(node_id)
        else:
            status = node_tags[TAG_RAY_NODE_STATUS]
            completed_states = [STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED]
            if status not in completed_states:
                pending_nodes.append((node_id, ip, node_type, status))
                non_failed.add(node_id)

    failed_nodes = self.node_tracker.get_all_failed_node_info(non_failed)

    # The concurrent counter leaves some 0 counts in, so we need to
    # manually filter those out.
    pending_launches = {
        launch_type: count
        for launch_type, count in self.pending_launches.breakdown().items()
        if count
    }

    pending_resources = {}
    for node_resources in self.resource_demand_scheduler.calculate_node_resources(
        nodes=[node_id for node_id, _, _, _ in pending_nodes],
        pending_nodes=pending_launches,
        # We don't fill this field out because we're intentionally only
        # passing pending nodes (which aren't tracked by load metrics
        # anyways).
        unused_resources_by_ip={},
    )[0]:
        for key, value in node_resources.items():
            pending_resources[key] = value + pending_resources.get(key, 0)

    return AutoscalerSummary(
        # Convert active_nodes from counter to dict for later serialization
        active_nodes=dict(active_nodes),
        idle_nodes=None,
        pending_nodes=[
            (ip, node_type, status) for _, ip, node_type, status in pending_nodes
        ],
        pending_launches=pending_launches,
        failed_nodes=failed_nodes,
        node_availability_summary=self.node_provider_availability_tracker.summary(),
        pending_resources=pending_resources,
        node_type_mapping=node_type_mapping,
        legacy=True,
    )
|
| 1503 |
+
|
| 1504 |
+
def info_string(self):
    """Render the load-metrics and autoscaler summaries as one string."""
    lm_summary = self.load_metrics.summary()
    autoscaler_summary = self.summary()
    # summary() only returns None before the first node-list refresh.
    assert autoscaler_summary
    return "\n" + format_info_string(lm_summary, autoscaler_summary)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py
ADDED
|
@@ -0,0 +1,825 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Logger implementing the Command Line Interface.
|
| 2 |
+
|
| 3 |
+
A replacement for the standard Python `logging` API
|
| 4 |
+
designed for implementing a better CLI UX for the cluster launcher.
|
| 5 |
+
|
| 6 |
+
Supports color, bold text, italics, underlines, etc.
|
| 7 |
+
(depending on TTY features)
|
| 8 |
+
as well as indentation and other structured output.
|
| 9 |
+
"""
|
| 10 |
+
import inspect
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
from contextlib import contextmanager
|
| 16 |
+
from functools import wraps
|
| 17 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
| 18 |
+
|
| 19 |
+
import click
|
| 20 |
+
import colorama
|
| 21 |
+
|
| 22 |
+
# Import ray first to use the bundled colorama
|
| 23 |
+
import ray # noqa: F401
|
| 24 |
+
|
| 25 |
+
if sys.platform == "win32":
|
| 26 |
+
import msvcrt
|
| 27 |
+
else:
|
| 28 |
+
import select
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class _ColorfulMock:
|
| 32 |
+
def __init__(self):
|
| 33 |
+
# do not do any color work
|
| 34 |
+
self.identity = lambda x: x
|
| 35 |
+
|
| 36 |
+
self.colorful = self
|
| 37 |
+
self.colormode = None
|
| 38 |
+
|
| 39 |
+
self.NO_COLORS = None
|
| 40 |
+
self.ANSI_8_COLORS = None
|
| 41 |
+
|
| 42 |
+
def disable(self):
|
| 43 |
+
pass
|
| 44 |
+
|
| 45 |
+
@contextmanager
|
| 46 |
+
def with_style(self, x):
|
| 47 |
+
class IdentityClass:
|
| 48 |
+
def __getattr__(self, name):
|
| 49 |
+
return lambda y: y
|
| 50 |
+
|
| 51 |
+
yield IdentityClass()
|
| 52 |
+
|
| 53 |
+
def __getattr__(self, name):
|
| 54 |
+
if name == "with_style":
|
| 55 |
+
return self.with_style
|
| 56 |
+
|
| 57 |
+
return self.identity
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
try:
    import colorful as _cf
    from colorful.core import ColorfulString

    _cf.use_8_ansi_colors()
except ModuleNotFoundError:
    # We mock Colorful to restrict the colors used for consistency
    # anyway, so we also allow for not having colorful at all.
    # If the Ray Core dependency on colorful is ever removed,
    # the CliLogger code will still work.
    class ColorfulString:
        pass

    _cf = _ColorfulMock()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# We want to only allow specific formatting
|
| 77 |
+
# to prevent people from accidentally making bad looking color schemes.
|
| 78 |
+
#
|
| 79 |
+
# This is especially important since most will look bad on either light
|
| 80 |
+
# or dark themes.
|
| 81 |
+
class _ColorfulProxy:
    """Gatekeeper around `colorful` that only exposes an approved palette.

    We want to only allow specific formatting to prevent people from
    accidentally making bad looking color schemes; most colors look bad
    on either light or dark terminal themes.
    """

    _proxy_allowlist = [
        "disable",
        "reset",
        "bold",
        "italic",
        "underlined",
        # used instead of `gray` as `dimmed` adapts to
        # both light and dark themes
        "dimmed",
        "dodgerBlue",  # group
        "limeGreen",  # success
        "red",  # error
        "orange",  # warning
        "skyBlue",  # label
        "magenta",  # syntax highlighting key words and symbols
        "yellow",  # syntax highlighting strings
    ]

    def __getattr__(self, name):
        attr = getattr(_cf, name)
        if callable(attr) and name not in _ColorfulProxy._proxy_allowlist:
            raise ValueError(
                "Usage of the colorful method '" + name + "' is forbidden "
                "by the proxy to keep a consistent color scheme. "
                "Check `cli_logger.py` for allowed methods"
            )
        return attr
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# Module-level singleton through which all color styling must go.
cf = _ColorfulProxy()

# Do not strip ANSI codes so colorful's output passes through unmodified
# (colorama is needed mainly for Windows terminals).
colorama.init(strip=False)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _external_caller_info():
|
| 117 |
+
"""Get the info from the caller frame.
|
| 118 |
+
|
| 119 |
+
Used to override the logging function and line number with the correct
|
| 120 |
+
ones. See the comment on _patched_makeRecord for more info.
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
frame = inspect.currentframe()
|
| 124 |
+
caller = frame
|
| 125 |
+
levels = 0
|
| 126 |
+
while caller.f_code.co_filename == __file__:
|
| 127 |
+
caller = caller.f_back
|
| 128 |
+
levels += 1
|
| 129 |
+
return {
|
| 130 |
+
"lineno": caller.f_lineno,
|
| 131 |
+
"filename": os.path.basename(caller.f_code.co_filename),
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _format_msg(
    msg: str,
    *args: Any,
    no_format: bool = None,
    _tags: Dict[str, Any] = None,
    _numbered: Tuple[str, int, int] = None,
    **kwargs: Any,
):
    """Formats a message for printing.

    Renders `msg` using the built-in `str.format` and the passed-in
    `*args` and `**kwargs`.

    Args:
        *args (Any): `.format` arguments for `msg`.
        no_format (bool):
            If `no_format` is `True`,
            `.format` will not be called on the message.

            Useful if the output is user-provided or may otherwise
            contain an unexpected formatting string (e.g. "{}").
        _tags (Dict[str, Any]):
            key-value pairs to display at the end of
            the message in square brackets.

            If a tag is set to `True`, it is printed without the value,
            the presence of the tag treated as a "flag".

            E.g. `_format_msg("hello", _tags=dict(from=mom, signed=True))`
            `hello [from=Mom, signed]`
        _numbered (Tuple[str, int, int]):
            `(brackets, i, n)`

            The `brackets` string is composed of two "bracket" characters,
            `i` is the index, `n` is the total.

            The string `{i}/{n}` surrounded by the "brackets" is
            prepended to the message.

            This is used to number steps in a procedure, with different
            brackets specifying different major tasks.

            E.g. `_format_msg("hello", _numbered=("[]", 0, 5))`
            `[0/5] hello`

    Returns:
        The formatted message.
    """

    if isinstance(msg, str) or isinstance(msg, ColorfulString):
        tags_str = ""
        if _tags is not None:
            tags_list = []
            for k, v in _tags.items():
                if v is True:
                    tags_list += [k]
                    continue
                if v is False:
                    continue

                # Coerce the value to str: the previous `k + "=" + v`
                # raised TypeError for non-string tag values (e.g. ints).
                tags_list += [f"{k}={v}"]
            if tags_list:
                tags_str = cf.reset(cf.dimmed(" [{}]".format(", ".join(tags_list))))

        numbering_str = ""
        if _numbered is not None:
            chars, i, n = _numbered
            numbering_str = cf.dimmed(chars[0] + str(i) + "/" + str(n) + chars[1]) + " "

        if no_format:
            # todo: throw if given args/kwargs?
            return numbering_str + msg + tags_str
        return numbering_str + msg.format(*args, **kwargs) + tags_str

    if kwargs:
        raise ValueError("We do not support printing kwargs yet.")

    # Non-string message: join everything with commas.
    res = [msg, *args]
    res = [str(x) for x in res]
    return ", ".join(res)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# TODO: come up with a plan to unify logging.
|
| 218 |
+
# formatter = logging.Formatter(
|
| 219 |
+
# # TODO(maximsmol): figure out the required log level padding
|
| 220 |
+
# # width automatically
|
| 221 |
+
# fmt="[{asctime}] {levelname:6} {message}",
|
| 222 |
+
# datefmt="%x %X",
|
| 223 |
+
# # We want alignment on our level names
|
| 224 |
+
# style="{")
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def _isatty():
|
| 228 |
+
"""More robust check for interactive terminal/tty."""
|
| 229 |
+
try:
|
| 230 |
+
# https://stackoverflow.com/questions/6108330/
|
| 231 |
+
# checking-for-interactive-shell-in-a-python-script
|
| 232 |
+
return sys.__stdin__.isatty()
|
| 233 |
+
except Exception:
|
| 234 |
+
# sometimes this can fail due to closed output
|
| 235 |
+
# either way, no-tty is generally safe fallback.
|
| 236 |
+
return False
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
class _CliLogger:
|
| 240 |
+
"""Singleton class for CLI logging.
|
| 241 |
+
|
| 242 |
+
Without calling 'cli_logger.configure', the CLILogger will default
|
| 243 |
+
to 'record' style logging.
|
| 244 |
+
|
| 245 |
+
Attributes:
|
| 246 |
+
color_mode (str):
|
| 247 |
+
Can be "true", "false", or "auto".
|
| 248 |
+
|
| 249 |
+
Enables or disables `colorful`.
|
| 250 |
+
|
| 251 |
+
If `color_mode` is "auto", is set to `not stdout.isatty()`
|
| 252 |
+
indent_level (int):
|
| 253 |
+
The current indentation level.
|
| 254 |
+
|
| 255 |
+
All messages will be indented by prepending `" " * indent_level`
|
| 256 |
+
vebosity (int):
|
| 257 |
+
Output verbosity.
|
| 258 |
+
|
| 259 |
+
Low verbosity will disable `verbose` and `very_verbose` messages.
|
| 260 |
+
"""
|
| 261 |
+
|
| 262 |
+
color_mode: str
|
| 263 |
+
# color_mode: Union[Literal["auto"], Literal["false"], Literal["true"]]
|
| 264 |
+
indent_level: int
|
| 265 |
+
interactive: bool
|
| 266 |
+
VALID_LOG_STYLES = ("auto", "record", "pretty")
|
| 267 |
+
|
| 268 |
+
_autodetected_cf_colormode: int
|
| 269 |
+
|
| 270 |
+
def __init__(self):
    """Initialize with record-style, colorless, quiet defaults."""
    self.indent_level = 0

    self._verbosity = 0
    self._verbosity_overriden = False
    self._color_mode = "auto"
    self._log_style = "record"
    self.pretty = False
    self.interactive = False

    # Store whatever colorful has detected for future use if the color
    # output is toggled (colorful detects the # of supported colors, so
    # it has some non-trivial logic to determine this).
    self._autodetected_cf_colormode = cf.colorful.colormode
    self.set_format()
|
| 285 |
+
|
| 286 |
+
def set_format(self, format_tmpl=None):
    """Install the record formatter, defaulting to Ray's LOGGER_FORMAT."""
    if not format_tmpl:
        # Imported lazily to avoid a module-level import cycle.
        from ray.autoscaler._private.constants import LOGGER_FORMAT

        format_tmpl = LOGGER_FORMAT
    self._formatter = logging.Formatter(format_tmpl)
|
| 292 |
+
|
| 293 |
+
def configure(self, log_style=None, color_mode=None, verbosity=None):
    """Configures the logger according to the given values.

    Any argument left as ``None`` keeps its current setting; color
    detection is re-run afterwards in all cases.
    """
    if log_style is not None:
        self._set_log_style(log_style)

    if color_mode is not None:
        self._set_color_mode(color_mode)

    if verbosity is not None:
        self._set_verbosity(verbosity)

    self.detect_colors()
|
| 305 |
+
|
| 306 |
+
@property
def log_style(self):
    """The current log style: "auto", "record", or "pretty"."""
    return self._log_style
|
| 309 |
+
|
| 310 |
+
def _set_log_style(self, x):
    """Configures interactivity and formatting from a style name."""
    self._log_style = x.lower()
    self.interactive = _isatty()

    if self._log_style == "auto":
        # Pretty-print only when attached to a terminal.
        self.pretty = _isatty()
    elif self._log_style == "record":
        # Machine-readable records: no pretty printing, no color.
        self.pretty = False
        self._set_color_mode("false")
    elif self._log_style == "pretty":
        self.pretty = True
|
| 322 |
+
|
| 323 |
+
@property
def color_mode(self):
    """The current color mode: "true", "false", or "auto"."""
    return self._color_mode
|
| 326 |
+
|
| 327 |
+
def _set_color_mode(self, x):
|
| 328 |
+
self._color_mode = x.lower()
|
| 329 |
+
self.detect_colors()
|
| 330 |
+
|
| 331 |
+
@property
def verbosity(self):
    """Effective verbosity; non-pretty output logs everything (999)."""
    if not self._verbosity_overriden and not self.pretty:
        return 999
    return self._verbosity
|
| 338 |
+
|
| 339 |
+
def _set_verbosity(self, x):
|
| 340 |
+
self._verbosity = x
|
| 341 |
+
self._verbosity_overriden = True
|
| 342 |
+
|
| 343 |
+
def detect_colors(self):
    """Update color output settings.

    Parse the `color_mode` string and optionally disable or force-enable
    color output
    (8-color ANSI if no terminal detected to be safe) in colorful.

    Raises:
        ValueError: for any color mode other than true/false/auto.
    """
    mode = self.color_mode
    if mode == "true":
        if self._autodetected_cf_colormode != cf.NO_COLORS:
            cf.colormode = self._autodetected_cf_colormode
        else:
            cf.colormode = cf.ANSI_8_COLORS
        return
    if mode == "false":
        cf.disable()
        return
    if mode == "auto":
        # colorful autodetects tty settings
        return

    raise ValueError("Invalid log color setting: " + mode)
|
| 364 |
+
|
| 365 |
+
def newline(self):
    """Print a line feed."""
    self.print("")
|
| 368 |
+
|
| 369 |
+
def _print(
|
| 370 |
+
self,
|
| 371 |
+
msg: str,
|
| 372 |
+
_level_str: str = "INFO",
|
| 373 |
+
_linefeed: bool = True,
|
| 374 |
+
end: str = None,
|
| 375 |
+
):
|
| 376 |
+
"""Proxy for printing messages.
|
| 377 |
+
|
| 378 |
+
Args:
|
| 379 |
+
msg: Message to print.
|
| 380 |
+
linefeed (bool):
|
| 381 |
+
If `linefeed` is `False` no linefeed is printed at the
|
| 382 |
+
end of the message.
|
| 383 |
+
"""
|
| 384 |
+
if self.pretty:
|
| 385 |
+
rendered_message = " " * self.indent_level + msg
|
| 386 |
+
else:
|
| 387 |
+
if msg.strip() == "":
|
| 388 |
+
return
|
| 389 |
+
caller_info = _external_caller_info()
|
| 390 |
+
record = logging.LogRecord(
|
| 391 |
+
name="cli",
|
| 392 |
+
# We override the level name later
|
| 393 |
+
# TODO(maximsmol): give approximate level #s to our log levels
|
| 394 |
+
level=0,
|
| 395 |
+
# The user-facing logs do not need this information anyway
|
| 396 |
+
# and it would be very tedious to extract since _print
|
| 397 |
+
# can be at varying depths in the call stack
|
| 398 |
+
# TODO(maximsmol): do it anyway to be extra
|
| 399 |
+
pathname=caller_info["filename"],
|
| 400 |
+
lineno=caller_info["lineno"],
|
| 401 |
+
msg=msg,
|
| 402 |
+
args={},
|
| 403 |
+
# No exception
|
| 404 |
+
exc_info=None,
|
| 405 |
+
)
|
| 406 |
+
record.levelname = _level_str
|
| 407 |
+
rendered_message = self._formatter.format(record)
|
| 408 |
+
|
| 409 |
+
# We aren't using standard python logging convention, so we hardcode
|
| 410 |
+
# the log levels for now.
|
| 411 |
+
if _level_str in ["WARNING", "ERROR", "PANIC"]:
|
| 412 |
+
stream = sys.stderr
|
| 413 |
+
else:
|
| 414 |
+
stream = sys.stdout
|
| 415 |
+
|
| 416 |
+
if not _linefeed:
|
| 417 |
+
stream.write(rendered_message)
|
| 418 |
+
stream.flush()
|
| 419 |
+
return
|
| 420 |
+
|
| 421 |
+
kwargs = {"end": end}
|
| 422 |
+
print(rendered_message, file=stream, **kwargs)
|
| 423 |
+
|
| 424 |
+
def indented(self):
    """Context manager that starts an indented block of output."""
    logger_ref = self

    class _IndentedContext:
        def __enter__(self):
            logger_ref.indent_level += 1

        def __exit__(self, exc_type, exc_value, traceback):
            logger_ref.indent_level -= 1

    return _IndentedContext()
|
| 436 |
+
|
| 437 |
+
def group(self, msg: str, *args: Any, **kwargs: Any):
    """Print a group title in a special color and start an indented block.

    For arguments, see `_format_msg`.
    """
    self.print(cf.dodgerBlue(msg), *args, **kwargs)
    return self.indented()
|
| 445 |
+
|
| 446 |
+
def verbatim_error_ctx(self, msg: str, *args: Any, **kwargs: Any):
    """Context manager for printing multi-line error messages.

    Displays a start sequence "!!! {optional message}"
    and a matching end sequence "!!!".

    The string "!!!" can be used as a "tombstone" for searching.

    For arguments, see `_format_msg`.
    """
    logger_ref = self

    class _VerbatimErrorContext:
        def __enter__(self):
            logger_ref.error(cf.bold("!!! ") + "{}", msg, *args, **kwargs)

        def __exit__(self, exc_type, exc_value, traceback):
            logger_ref.error(cf.bold("!!!"))

    return _VerbatimErrorContext()
|
| 466 |
+
|
| 467 |
+
def labeled_value(self, key: str, msg: str, *args: Any, **kwargs: Any):
    """Displays a key-value pair with special formatting.

    Args:
        key: Label that is prepended to the message.

    For other arguments, see `_format_msg`.
    """
    self._print(cf.skyBlue(key) + ": " + _format_msg(cf.bold(msg), *args, **kwargs))
|
| 476 |
+
|
| 477 |
+
def verbose(self, msg: str, *args: Any, **kwargs: Any):
    """Prints a message only when verbosity is above 0.

    For arguments, see `_format_msg`.
    """
    if self.verbosity > 0:
        self.print(msg, *args, _level_str="VINFO", **kwargs)
|
| 484 |
+
|
| 485 |
+
def verbose_warning(self, msg, *args, **kwargs):
|
| 486 |
+
"""Prints a formatted warning if verbosity is not 0.
|
| 487 |
+
|
| 488 |
+
For arguments, see `_format_msg`.
|
| 489 |
+
"""
|
| 490 |
+
if self.verbosity > 0:
|
| 491 |
+
self._warning(msg, *args, _level_str="VWARN", **kwargs)
|
| 492 |
+
|
| 493 |
+
def verbose_error(self, msg: str, *args: Any, **kwargs: Any):
|
| 494 |
+
"""Logs an error if verbosity is not 0.
|
| 495 |
+
|
| 496 |
+
For arguments, see `_format_msg`.
|
| 497 |
+
"""
|
| 498 |
+
if self.verbosity > 0:
|
| 499 |
+
self._error(msg, *args, _level_str="VERR", **kwargs)
|
| 500 |
+
|
| 501 |
+
def very_verbose(self, msg: str, *args: Any, **kwargs: Any):
|
| 502 |
+
"""Prints if verbosity is > 1.
|
| 503 |
+
|
| 504 |
+
For arguments, see `_format_msg`.
|
| 505 |
+
"""
|
| 506 |
+
if self.verbosity > 1:
|
| 507 |
+
self.print(msg, *args, _level_str="VVINFO", **kwargs)
|
| 508 |
+
|
| 509 |
+
def success(self, msg: str, *args: Any, **kwargs: Any):
|
| 510 |
+
"""Prints a formatted success message.
|
| 511 |
+
|
| 512 |
+
For arguments, see `_format_msg`.
|
| 513 |
+
"""
|
| 514 |
+
self.print(cf.limeGreen(msg), *args, _level_str="SUCC", **kwargs)
|
| 515 |
+
|
| 516 |
+
def _warning(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
|
| 517 |
+
"""Prints a formatted warning message.
|
| 518 |
+
|
| 519 |
+
For arguments, see `_format_msg`.
|
| 520 |
+
"""
|
| 521 |
+
if _level_str is None:
|
| 522 |
+
raise ValueError("Log level not set.")
|
| 523 |
+
self.print(cf.orange(msg), *args, _level_str=_level_str, **kwargs)
|
| 524 |
+
|
| 525 |
+
def warning(self, *args, **kwargs):
|
| 526 |
+
self._warning(*args, _level_str="WARN", **kwargs)
|
| 527 |
+
|
| 528 |
+
def _error(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
|
| 529 |
+
"""Prints a formatted error message.
|
| 530 |
+
|
| 531 |
+
For arguments, see `_format_msg`.
|
| 532 |
+
"""
|
| 533 |
+
if _level_str is None:
|
| 534 |
+
raise ValueError("Log level not set.")
|
| 535 |
+
self.print(cf.red(msg), *args, _level_str=_level_str, **kwargs)
|
| 536 |
+
|
| 537 |
+
def error(self, *args, **kwargs):
|
| 538 |
+
self._error(*args, _level_str="ERR", **kwargs)
|
| 539 |
+
|
| 540 |
+
def panic(self, *args, **kwargs):
|
| 541 |
+
self._error(*args, _level_str="PANIC", **kwargs)
|
| 542 |
+
|
| 543 |
+
# Fine to expose _level_str here, since this is a general log function.
|
| 544 |
+
def print(
|
| 545 |
+
self,
|
| 546 |
+
msg: str,
|
| 547 |
+
*args: Any,
|
| 548 |
+
_level_str: str = "INFO",
|
| 549 |
+
end: str = None,
|
| 550 |
+
**kwargs: Any,
|
| 551 |
+
):
|
| 552 |
+
"""Prints a message.
|
| 553 |
+
|
| 554 |
+
For arguments, see `_format_msg`.
|
| 555 |
+
"""
|
| 556 |
+
self._print(_format_msg(msg, *args, **kwargs), _level_str=_level_str, end=end)
|
| 557 |
+
|
| 558 |
+
def info(self, msg: str, no_format=True, *args, **kwargs):
|
| 559 |
+
self.print(msg, no_format=no_format, *args, **kwargs)
|
| 560 |
+
|
| 561 |
+
def abort(
|
| 562 |
+
self, msg: Optional[str] = None, *args: Any, exc: Any = None, **kwargs: Any
|
| 563 |
+
):
|
| 564 |
+
"""Prints an error and aborts execution.
|
| 565 |
+
|
| 566 |
+
Print an error and throw an exception to terminate the program
|
| 567 |
+
(the exception will not print a message).
|
| 568 |
+
"""
|
| 569 |
+
if msg is not None:
|
| 570 |
+
self._error(msg, *args, _level_str="PANIC", **kwargs)
|
| 571 |
+
|
| 572 |
+
if exc is not None:
|
| 573 |
+
raise exc
|
| 574 |
+
|
| 575 |
+
exc_cls = click.ClickException
|
| 576 |
+
if self.pretty:
|
| 577 |
+
exc_cls = SilentClickException
|
| 578 |
+
|
| 579 |
+
if msg is None:
|
| 580 |
+
msg = "Exiting due to cli_logger.abort()"
|
| 581 |
+
raise exc_cls(msg)
|
| 582 |
+
|
| 583 |
+
def doassert(self, val: bool, msg: str, *args: Any, **kwargs: Any):
|
| 584 |
+
"""Handle assertion without throwing a scary exception.
|
| 585 |
+
|
| 586 |
+
Args:
|
| 587 |
+
val: Value to check.
|
| 588 |
+
|
| 589 |
+
For other arguments, see `_format_msg`.
|
| 590 |
+
"""
|
| 591 |
+
if not val:
|
| 592 |
+
exc = None
|
| 593 |
+
if not self.pretty:
|
| 594 |
+
exc = AssertionError()
|
| 595 |
+
|
| 596 |
+
# TODO(maximsmol): rework asserts so that we get the expression
|
| 597 |
+
# that triggered the assert
|
| 598 |
+
# to do this, install a global try-catch
|
| 599 |
+
# for AssertionError and raise them normally
|
| 600 |
+
self.abort(msg, *args, exc=exc, **kwargs)
|
| 601 |
+
|
| 602 |
+
def render_list(self, xs: List[str], separator: str = cf.reset(", ")):
|
| 603 |
+
"""Render a list of bolded values using a non-bolded separator."""
|
| 604 |
+
return separator.join([str(cf.bold(x)) for x in xs])
|
| 605 |
+
|
| 606 |
+
def confirm(
|
| 607 |
+
self,
|
| 608 |
+
yes: bool,
|
| 609 |
+
msg: str,
|
| 610 |
+
*args: Any,
|
| 611 |
+
_abort: bool = False,
|
| 612 |
+
_default: bool = False,
|
| 613 |
+
_timeout_s: Optional[float] = None,
|
| 614 |
+
**kwargs: Any,
|
| 615 |
+
):
|
| 616 |
+
"""Display a confirmation dialog.
|
| 617 |
+
|
| 618 |
+
Valid answers are "y/yes/true/1" and "n/no/false/0".
|
| 619 |
+
|
| 620 |
+
Args:
|
| 621 |
+
yes: If `yes` is `True` the dialog will default to "yes"
|
| 622 |
+
and continue without waiting for user input.
|
| 623 |
+
_abort (bool):
|
| 624 |
+
If `_abort` is `True`,
|
| 625 |
+
"no" means aborting the program.
|
| 626 |
+
_default (bool):
|
| 627 |
+
The default action to take if the user just presses enter
|
| 628 |
+
with no input.
|
| 629 |
+
_timeout_s (float):
|
| 630 |
+
If user has no input within _timeout_s seconds, the default
|
| 631 |
+
action is taken. None means no timeout.
|
| 632 |
+
"""
|
| 633 |
+
should_abort = _abort
|
| 634 |
+
default = _default
|
| 635 |
+
|
| 636 |
+
if not self.interactive and not yes:
|
| 637 |
+
# no formatting around --yes here since this is non-interactive
|
| 638 |
+
self.error(
|
| 639 |
+
"This command requires user confirmation. "
|
| 640 |
+
"When running non-interactively, supply --yes to skip."
|
| 641 |
+
)
|
| 642 |
+
raise ValueError("Non-interactive confirm without --yes.")
|
| 643 |
+
|
| 644 |
+
if default:
|
| 645 |
+
yn_str = "Y/n"
|
| 646 |
+
else:
|
| 647 |
+
yn_str = "y/N"
|
| 648 |
+
|
| 649 |
+
confirm_str = cf.underlined("Confirm [" + yn_str + "]:") + " "
|
| 650 |
+
|
| 651 |
+
rendered_message = _format_msg(msg, *args, **kwargs)
|
| 652 |
+
# the rendered message ends with ascii coding
|
| 653 |
+
if rendered_message and not msg.endswith("\n"):
|
| 654 |
+
rendered_message += " "
|
| 655 |
+
|
| 656 |
+
msg_len = len(rendered_message.split("\n")[-1])
|
| 657 |
+
complete_str = rendered_message + confirm_str
|
| 658 |
+
|
| 659 |
+
if yes:
|
| 660 |
+
self._print(complete_str + "y " + cf.dimmed("[automatic, due to --yes]"))
|
| 661 |
+
return True
|
| 662 |
+
|
| 663 |
+
self._print(complete_str, _linefeed=False)
|
| 664 |
+
|
| 665 |
+
res = None
|
| 666 |
+
yes_answers = ["y", "yes", "true", "1"]
|
| 667 |
+
no_answers = ["n", "no", "false", "0"]
|
| 668 |
+
try:
|
| 669 |
+
while True:
|
| 670 |
+
if _timeout_s is None:
|
| 671 |
+
ans = sys.stdin.readline()
|
| 672 |
+
elif sys.platform == "win32":
|
| 673 |
+
# Windows doesn't support select
|
| 674 |
+
start_time = time.time()
|
| 675 |
+
ans = ""
|
| 676 |
+
while True:
|
| 677 |
+
if (time.time() - start_time) >= _timeout_s:
|
| 678 |
+
self.newline()
|
| 679 |
+
ans = "\n"
|
| 680 |
+
break
|
| 681 |
+
elif msvcrt.kbhit():
|
| 682 |
+
ch = msvcrt.getwch()
|
| 683 |
+
if ch in ("\n", "\r"):
|
| 684 |
+
self.newline()
|
| 685 |
+
ans = ans + "\n"
|
| 686 |
+
break
|
| 687 |
+
elif ch == "\b":
|
| 688 |
+
if ans:
|
| 689 |
+
ans = ans[:-1]
|
| 690 |
+
# Emulate backspace erasing
|
| 691 |
+
print("\b \b", end="", flush=True)
|
| 692 |
+
else:
|
| 693 |
+
ans = ans + ch
|
| 694 |
+
print(ch, end="", flush=True)
|
| 695 |
+
else:
|
| 696 |
+
time.sleep(0.1)
|
| 697 |
+
else:
|
| 698 |
+
ready, _, _ = select.select([sys.stdin], [], [], _timeout_s)
|
| 699 |
+
if not ready:
|
| 700 |
+
self.newline()
|
| 701 |
+
ans = "\n"
|
| 702 |
+
else:
|
| 703 |
+
ans = sys.stdin.readline()
|
| 704 |
+
|
| 705 |
+
ans = ans.lower()
|
| 706 |
+
|
| 707 |
+
if ans == "\n":
|
| 708 |
+
res = default
|
| 709 |
+
break
|
| 710 |
+
|
| 711 |
+
ans = ans.strip()
|
| 712 |
+
if ans in yes_answers:
|
| 713 |
+
res = True
|
| 714 |
+
break
|
| 715 |
+
if ans in no_answers:
|
| 716 |
+
res = False
|
| 717 |
+
break
|
| 718 |
+
|
| 719 |
+
indent = " " * msg_len
|
| 720 |
+
self.error(
|
| 721 |
+
"{}Invalid answer: {}. Expected {} or {}",
|
| 722 |
+
indent,
|
| 723 |
+
cf.bold(ans.strip()),
|
| 724 |
+
self.render_list(yes_answers, "/"),
|
| 725 |
+
self.render_list(no_answers, "/"),
|
| 726 |
+
)
|
| 727 |
+
self._print(indent + confirm_str, _linefeed=False)
|
| 728 |
+
except KeyboardInterrupt:
|
| 729 |
+
self.newline()
|
| 730 |
+
res = default
|
| 731 |
+
|
| 732 |
+
if not res and should_abort:
|
| 733 |
+
# todo: make sure we tell the user if they
|
| 734 |
+
# need to do cleanup
|
| 735 |
+
self._print("Exiting...")
|
| 736 |
+
raise SilentClickException(
|
| 737 |
+
"Exiting due to the response to confirm(should_abort=True)."
|
| 738 |
+
)
|
| 739 |
+
|
| 740 |
+
return res
|
| 741 |
+
|
| 742 |
+
def prompt(self, msg: str, *args, **kwargs):
|
| 743 |
+
"""Prompt the user for some text input.
|
| 744 |
+
|
| 745 |
+
Args:
|
| 746 |
+
msg: The mesage to display to the user before the prompt.
|
| 747 |
+
|
| 748 |
+
Returns:
|
| 749 |
+
The string entered by the user.
|
| 750 |
+
"""
|
| 751 |
+
complete_str = cf.underlined(msg)
|
| 752 |
+
rendered_message = _format_msg(complete_str, *args, **kwargs)
|
| 753 |
+
# the rendered message ends with ascii coding
|
| 754 |
+
if rendered_message and not msg.endswith("\n"):
|
| 755 |
+
rendered_message += " "
|
| 756 |
+
self._print(rendered_message, linefeed=False)
|
| 757 |
+
|
| 758 |
+
res = ""
|
| 759 |
+
try:
|
| 760 |
+
ans = sys.stdin.readline()
|
| 761 |
+
ans = ans.lower()
|
| 762 |
+
res = ans.strip()
|
| 763 |
+
except KeyboardInterrupt:
|
| 764 |
+
self.newline()
|
| 765 |
+
|
| 766 |
+
return res
|
| 767 |
+
|
| 768 |
+
def flush(self):
|
| 769 |
+
sys.stdout.flush()
|
| 770 |
+
sys.stderr.flush()
|
| 771 |
+
|
| 772 |
+
|
| 773 |
+
class SilentClickException(click.ClickException):
|
| 774 |
+
"""`ClickException` that does not print a message.
|
| 775 |
+
|
| 776 |
+
Some of our tooling relies on catching ClickException in particular.
|
| 777 |
+
|
| 778 |
+
However the default prints a message, which is undesirable since we expect
|
| 779 |
+
our code to log errors manually using `cli_logger.error()` to allow for
|
| 780 |
+
colors and other formatting.
|
| 781 |
+
"""
|
| 782 |
+
|
| 783 |
+
def __init__(self, message: str):
|
| 784 |
+
super(SilentClickException, self).__init__(message)
|
| 785 |
+
|
| 786 |
+
def show(self, file=None):
|
| 787 |
+
pass
|
| 788 |
+
|
| 789 |
+
|
| 790 |
+
cli_logger = _CliLogger()
|
| 791 |
+
|
| 792 |
+
CLICK_LOGGING_OPTIONS = [
|
| 793 |
+
click.option(
|
| 794 |
+
"--log-style",
|
| 795 |
+
required=False,
|
| 796 |
+
type=click.Choice(cli_logger.VALID_LOG_STYLES, case_sensitive=False),
|
| 797 |
+
default="auto",
|
| 798 |
+
help=(
|
| 799 |
+
"If 'pretty', outputs with formatting and color. If 'record', "
|
| 800 |
+
"outputs record-style without formatting. "
|
| 801 |
+
"'auto' defaults to 'pretty', and disables pretty logging "
|
| 802 |
+
"if stdin is *not* a TTY."
|
| 803 |
+
),
|
| 804 |
+
),
|
| 805 |
+
click.option(
|
| 806 |
+
"--log-color",
|
| 807 |
+
required=False,
|
| 808 |
+
type=click.Choice(["auto", "false", "true"], case_sensitive=False),
|
| 809 |
+
default="auto",
|
| 810 |
+
help=("Use color logging. Auto enables color logging if stdout is a TTY."),
|
| 811 |
+
),
|
| 812 |
+
click.option("-v", "--verbose", default=None, count=True),
|
| 813 |
+
]
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def add_click_logging_options(f: Callable) -> Callable:
|
| 817 |
+
for option in reversed(CLICK_LOGGING_OPTIONS):
|
| 818 |
+
f = option(f)
|
| 819 |
+
|
| 820 |
+
@wraps(f)
|
| 821 |
+
def wrapper(*args, log_style=None, log_color=None, verbose=None, **kwargs):
|
| 822 |
+
cli_logger.configure(log_style, log_color, verbose)
|
| 823 |
+
return f(*args, **kwargs)
|
| 824 |
+
|
| 825 |
+
return wrapper
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
# This is an executable script that runs an example of every single CliLogger
|
| 4 |
+
# function for demonstration purposes. Primarily useful for tuning color and
|
| 5 |
+
# other formatting.
|
| 6 |
+
|
| 7 |
+
from ray.autoscaler._private.cli_logger import cf, cli_logger
|
| 8 |
+
|
| 9 |
+
cli_logger.configure(log_style="auto", verbosity=999)
|
| 10 |
+
|
| 11 |
+
cli_logger.print(cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
|
| 12 |
+
cli_logger.labeled_value("Label", "value")
|
| 13 |
+
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
|
| 14 |
+
cli_logger.newline()
|
| 15 |
+
cli_logger.very_verbose("Very verbose")
|
| 16 |
+
cli_logger.verbose("Verbose")
|
| 17 |
+
cli_logger.verbose_warning("Verbose warning")
|
| 18 |
+
cli_logger.verbose_error("Verbose error")
|
| 19 |
+
cli_logger.print("Info")
|
| 20 |
+
cli_logger.success("Success")
|
| 21 |
+
cli_logger.warning("Warning")
|
| 22 |
+
cli_logger.error("Error")
|
| 23 |
+
cli_logger.newline()
|
| 24 |
+
try:
|
| 25 |
+
cli_logger.abort("Abort")
|
| 26 |
+
except Exception:
|
| 27 |
+
pass
|
| 28 |
+
try:
|
| 29 |
+
cli_logger.doassert(False, "Assert")
|
| 30 |
+
except Exception:
|
| 31 |
+
pass
|
| 32 |
+
cli_logger.newline()
|
| 33 |
+
cli_logger.confirm(True, "example")
|
| 34 |
+
cli_logger.newline()
|
| 35 |
+
with cli_logger.indented():
|
| 36 |
+
cli_logger.print("Indented")
|
| 37 |
+
with cli_logger.group("Group"):
|
| 38 |
+
cli_logger.print("Group contents")
|
| 39 |
+
with cli_logger.verbatim_error_ctx("Verbtaim error"):
|
| 40 |
+
cli_logger.print("Error contents")
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py
ADDED
|
@@ -0,0 +1,652 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import tarfile
|
| 6 |
+
import tempfile
|
| 7 |
+
import threading
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 9 |
+
from contextlib import contextmanager
|
| 10 |
+
from typing import List, Optional, Sequence, Tuple
|
| 11 |
+
|
| 12 |
+
import yaml
|
| 13 |
+
|
| 14 |
+
import ray # noqa: F401
|
| 15 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 16 |
+
from ray.autoscaler._private.providers import _get_node_provider
|
| 17 |
+
from ray.autoscaler.tags import NODE_KIND_HEAD, NODE_KIND_WORKER, TAG_RAY_NODE_KIND
|
| 18 |
+
|
| 19 |
+
# Import psutil after ray so the packaged version is used.
|
| 20 |
+
import psutil
|
| 21 |
+
|
| 22 |
+
MAX_PARALLEL_SSH_WORKERS = 8
|
| 23 |
+
DEFAULT_SSH_USER = "ubuntu"
|
| 24 |
+
DEFAULT_SSH_KEYS = ["~/ray_bootstrap_key.pem", "~/.ssh/ray-autoscaler_2_us-west-2.pem"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class CommandFailed(RuntimeError):
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class LocalCommandFailed(CommandFailed):
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class RemoteCommandFailed(CommandFailed):
|
| 36 |
+
pass
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class GetParameters:
|
| 40 |
+
def __init__(
|
| 41 |
+
self,
|
| 42 |
+
logs: bool = True,
|
| 43 |
+
debug_state: bool = True,
|
| 44 |
+
pip: bool = True,
|
| 45 |
+
processes: bool = True,
|
| 46 |
+
processes_verbose: bool = True,
|
| 47 |
+
processes_list: Optional[List[Tuple[str, bool]]] = None,
|
| 48 |
+
):
|
| 49 |
+
self.logs = logs
|
| 50 |
+
self.debug_state = debug_state
|
| 51 |
+
self.pip = pip
|
| 52 |
+
self.processes = processes
|
| 53 |
+
self.processes_verbose = processes_verbose
|
| 54 |
+
self.processes_list = processes_list
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class Node:
|
| 58 |
+
"""Node (as in "machine")"""
|
| 59 |
+
|
| 60 |
+
def __init__(
|
| 61 |
+
self,
|
| 62 |
+
host: str,
|
| 63 |
+
ssh_user: str = "ubuntu",
|
| 64 |
+
ssh_key: str = "~/ray_bootstrap_key.pem",
|
| 65 |
+
docker_container: Optional[str] = None,
|
| 66 |
+
is_head: bool = False,
|
| 67 |
+
):
|
| 68 |
+
self.host = host
|
| 69 |
+
self.ssh_user = ssh_user
|
| 70 |
+
self.ssh_key = ssh_key
|
| 71 |
+
self.docker_container = docker_container
|
| 72 |
+
self.is_head = is_head
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class Archive:
|
| 76 |
+
"""Archive object to collect and compress files into a single file.
|
| 77 |
+
|
| 78 |
+
Objects of this class can be passed around to different data collection
|
| 79 |
+
functions. These functions can use the :meth:`subdir` method to add
|
| 80 |
+
files to a sub directory of the archive.
|
| 81 |
+
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def __init__(self, file: Optional[str] = None):
|
| 85 |
+
self.file = file or tempfile.mkstemp(prefix="ray_logs_", suffix=".tar.gz")[1]
|
| 86 |
+
self.tar = None
|
| 87 |
+
self._lock = threading.Lock()
|
| 88 |
+
|
| 89 |
+
@property
|
| 90 |
+
def is_open(self):
|
| 91 |
+
return bool(self.tar)
|
| 92 |
+
|
| 93 |
+
def open(self):
|
| 94 |
+
self.tar = tarfile.open(self.file, "w:gz")
|
| 95 |
+
|
| 96 |
+
def close(self):
|
| 97 |
+
self.tar.close()
|
| 98 |
+
self.tar = None
|
| 99 |
+
|
| 100 |
+
def __enter__(self):
|
| 101 |
+
self.open()
|
| 102 |
+
return self
|
| 103 |
+
|
| 104 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 105 |
+
self.close()
|
| 106 |
+
|
| 107 |
+
@contextmanager
|
| 108 |
+
def subdir(self, subdir: str, root: Optional[str] = "/"):
|
| 109 |
+
"""Open a context to add files to the archive.
|
| 110 |
+
|
| 111 |
+
Example:
|
| 112 |
+
|
| 113 |
+
.. code-block:: python
|
| 114 |
+
|
| 115 |
+
with Archive("file.tar.gz") as archive:
|
| 116 |
+
with archive.subdir("logfiles", root="/tmp/logs") as sd:
|
| 117 |
+
# Will be added as `logfiles/nested/file.txt`
|
| 118 |
+
sd.add("/tmp/logs/nested/file.txt")
|
| 119 |
+
|
| 120 |
+
Args:
|
| 121 |
+
subdir: Subdir to which to add files to. Calling the
|
| 122 |
+
``add(path)`` command will place files into the ``subdir``
|
| 123 |
+
directory of the archive.
|
| 124 |
+
root: Root path. Files without an explicit ``arcname``
|
| 125 |
+
will be named relatively to this path.
|
| 126 |
+
|
| 127 |
+
Yields:
|
| 128 |
+
A context object that can be used to add files to the archive.
|
| 129 |
+
"""
|
| 130 |
+
root = os.path.abspath(root)
|
| 131 |
+
|
| 132 |
+
class _Context:
|
| 133 |
+
@staticmethod
|
| 134 |
+
def add(path: str, arcname: Optional[str] = None):
|
| 135 |
+
path = os.path.abspath(path)
|
| 136 |
+
arcname = arcname or os.path.join(subdir, os.path.relpath(path, root))
|
| 137 |
+
|
| 138 |
+
self._lock.acquire()
|
| 139 |
+
self.tar.add(path, arcname=arcname)
|
| 140 |
+
self._lock.release()
|
| 141 |
+
|
| 142 |
+
yield _Context()
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
###
|
| 146 |
+
# Functions to gather logs and information on the local node
|
| 147 |
+
###
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def get_local_ray_logs(
|
| 151 |
+
archive: Archive,
|
| 152 |
+
exclude: Optional[Sequence[str]] = None,
|
| 153 |
+
session_log_dir: str = "/tmp/ray/session_latest",
|
| 154 |
+
) -> Archive:
|
| 155 |
+
"""Copy local log files into an archive.
|
| 156 |
+
|
| 157 |
+
Args:
|
| 158 |
+
archive: Archive object to add log files to.
|
| 159 |
+
exclude (Sequence[str]): Sequence of regex patterns. Files that match
|
| 160 |
+
any of these patterns will not be included in the archive.
|
| 161 |
+
session_dir: Path to the Ray session files. Defaults to
|
| 162 |
+
``/tmp/ray/session_latest``
|
| 163 |
+
|
| 164 |
+
Returns:
|
| 165 |
+
Open archive object.
|
| 166 |
+
|
| 167 |
+
"""
|
| 168 |
+
if not archive.is_open:
|
| 169 |
+
archive.open()
|
| 170 |
+
|
| 171 |
+
exclude = exclude or []
|
| 172 |
+
|
| 173 |
+
session_log_dir = os.path.join(os.path.expanduser(session_log_dir), "logs")
|
| 174 |
+
|
| 175 |
+
with archive.subdir("logs", root=session_log_dir) as sd:
|
| 176 |
+
for root, dirs, files in os.walk(session_log_dir):
|
| 177 |
+
for file in files:
|
| 178 |
+
file_path = os.path.join(root, file)
|
| 179 |
+
rel_path = os.path.relpath(file_path, start=session_log_dir)
|
| 180 |
+
# Skip file if it matches any pattern in `exclude`
|
| 181 |
+
if any(re.match(pattern, rel_path) for pattern in exclude):
|
| 182 |
+
continue
|
| 183 |
+
sd.add(file_path)
|
| 184 |
+
|
| 185 |
+
return archive
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def get_local_debug_state(
|
| 189 |
+
archive: Archive, session_dir: str = "/tmp/ray/session_latest"
|
| 190 |
+
) -> Archive:
|
| 191 |
+
"""Copy local log files into an archive.
|
| 192 |
+
|
| 193 |
+
Args:
|
| 194 |
+
archive: Archive object to add log files to.
|
| 195 |
+
session_dir: Path to the Ray session files. Defaults to
|
| 196 |
+
``/tmp/ray/session_latest``
|
| 197 |
+
|
| 198 |
+
Returns:
|
| 199 |
+
Open archive object.
|
| 200 |
+
|
| 201 |
+
"""
|
| 202 |
+
if not archive.is_open:
|
| 203 |
+
archive.open()
|
| 204 |
+
|
| 205 |
+
session_dir = os.path.expanduser(session_dir)
|
| 206 |
+
debug_state_file = os.path.join(session_dir, "logs/debug_state.txt")
|
| 207 |
+
|
| 208 |
+
if not os.path.exists(debug_state_file):
|
| 209 |
+
raise LocalCommandFailed("No `debug_state.txt` file found.")
|
| 210 |
+
|
| 211 |
+
with archive.subdir("", root=session_dir) as sd:
|
| 212 |
+
sd.add(debug_state_file)
|
| 213 |
+
|
| 214 |
+
return archive
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def get_local_pip_packages(archive: Archive):
|
| 218 |
+
"""Get currently installed pip packages and write into an archive.
|
| 219 |
+
|
| 220 |
+
Args:
|
| 221 |
+
archive: Archive object to add meta files to.
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
Open archive object.
|
| 225 |
+
"""
|
| 226 |
+
if not archive.is_open:
|
| 227 |
+
archive.open()
|
| 228 |
+
|
| 229 |
+
try:
|
| 230 |
+
from pip._internal.operations import freeze
|
| 231 |
+
except ImportError: # pip < 10.0
|
| 232 |
+
from pip.operations import freeze
|
| 233 |
+
|
| 234 |
+
with tempfile.NamedTemporaryFile("wt") as fp:
|
| 235 |
+
for line in freeze.freeze():
|
| 236 |
+
fp.writelines([line, "\n"])
|
| 237 |
+
|
| 238 |
+
fp.flush()
|
| 239 |
+
with archive.subdir("") as sd:
|
| 240 |
+
sd.add(fp.name, "pip_packages.txt")
|
| 241 |
+
|
| 242 |
+
return archive
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def get_local_ray_processes(
|
| 246 |
+
archive: Archive,
|
| 247 |
+
processes: Optional[List[Tuple[str, bool]]] = None,
|
| 248 |
+
verbose: bool = False,
|
| 249 |
+
):
|
| 250 |
+
"""Get the status of all the relevant ray processes.
|
| 251 |
+
Args:
|
| 252 |
+
archive: Archive object to add process info files to.
|
| 253 |
+
processes: List of processes to get information on. The first
|
| 254 |
+
element of the tuple is a string to filter by, and the second
|
| 255 |
+
element is a boolean indicating if we should filter by command
|
| 256 |
+
name (True) or command line including parameters (False)
|
| 257 |
+
verbose: If True, show entire executable command line.
|
| 258 |
+
If False, show just the first term.
|
| 259 |
+
Returns:
|
| 260 |
+
Open archive object.
|
| 261 |
+
"""
|
| 262 |
+
if not processes:
|
| 263 |
+
# local import to avoid circular dependencies
|
| 264 |
+
from ray.autoscaler._private.constants import RAY_PROCESSES
|
| 265 |
+
|
| 266 |
+
processes = RAY_PROCESSES
|
| 267 |
+
|
| 268 |
+
process_infos = []
|
| 269 |
+
for process in psutil.process_iter(["pid", "name", "cmdline", "status"]):
|
| 270 |
+
try:
|
| 271 |
+
with process.oneshot():
|
| 272 |
+
cmdline = " ".join(process.cmdline())
|
| 273 |
+
process_infos.append(
|
| 274 |
+
(
|
| 275 |
+
{
|
| 276 |
+
"executable": cmdline
|
| 277 |
+
if verbose
|
| 278 |
+
else cmdline.split("--", 1)[0][:-1],
|
| 279 |
+
"name": process.name(),
|
| 280 |
+
"pid": process.pid,
|
| 281 |
+
"status": process.status(),
|
| 282 |
+
},
|
| 283 |
+
process.cmdline(),
|
| 284 |
+
)
|
| 285 |
+
)
|
| 286 |
+
except Exception as exc:
|
| 287 |
+
raise LocalCommandFailed(exc) from exc
|
| 288 |
+
|
| 289 |
+
relevant_processes = {}
|
| 290 |
+
for process_dict, cmdline in process_infos:
|
| 291 |
+
for keyword, filter_by_cmd in processes:
|
| 292 |
+
if filter_by_cmd:
|
| 293 |
+
corpus = process_dict["name"]
|
| 294 |
+
else:
|
| 295 |
+
corpus = subprocess.list2cmdline(cmdline)
|
| 296 |
+
if keyword in corpus and process_dict["pid"] not in relevant_processes:
|
| 297 |
+
relevant_processes[process_dict["pid"]] = process_dict
|
| 298 |
+
|
| 299 |
+
with tempfile.NamedTemporaryFile("wt") as fp:
|
| 300 |
+
for line in relevant_processes.values():
|
| 301 |
+
fp.writelines([yaml.dump(line), "\n"])
|
| 302 |
+
|
| 303 |
+
fp.flush()
|
| 304 |
+
with archive.subdir("meta") as sd:
|
| 305 |
+
sd.add(fp.name, "process_info.txt")
|
| 306 |
+
|
| 307 |
+
return archive
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def get_all_local_data(archive: Archive, parameters: GetParameters):
|
| 311 |
+
"""Get all local data.
|
| 312 |
+
|
| 313 |
+
Gets:
|
| 314 |
+
- The Ray logs of the latest session
|
| 315 |
+
- The currently installed pip packages
|
| 316 |
+
|
| 317 |
+
Args:
|
| 318 |
+
archive: Archive object to add meta files to.
|
| 319 |
+
parameters: Parameters (settings) for getting data.
|
| 320 |
+
|
| 321 |
+
Returns:
|
| 322 |
+
Open archive object.
|
| 323 |
+
"""
|
| 324 |
+
if not archive.is_open:
|
| 325 |
+
archive.open()
|
| 326 |
+
|
| 327 |
+
if parameters.logs:
|
| 328 |
+
try:
|
| 329 |
+
get_local_ray_logs(archive=archive)
|
| 330 |
+
except LocalCommandFailed as exc:
|
| 331 |
+
cli_logger.error(exc)
|
| 332 |
+
if parameters.debug_state:
|
| 333 |
+
try:
|
| 334 |
+
get_local_debug_state(archive=archive)
|
| 335 |
+
except LocalCommandFailed as exc:
|
| 336 |
+
cli_logger.error(exc)
|
| 337 |
+
if parameters.pip:
|
| 338 |
+
try:
|
| 339 |
+
get_local_pip_packages(archive=archive)
|
| 340 |
+
except LocalCommandFailed as exc:
|
| 341 |
+
cli_logger.error(exc)
|
| 342 |
+
if parameters.processes:
|
| 343 |
+
try:
|
| 344 |
+
get_local_ray_processes(
|
| 345 |
+
archive=archive,
|
| 346 |
+
processes=parameters.processes_list,
|
| 347 |
+
verbose=parameters.processes_verbose,
|
| 348 |
+
)
|
| 349 |
+
except LocalCommandFailed as exc:
|
| 350 |
+
cli_logger.error(exc)
|
| 351 |
+
|
| 352 |
+
return archive
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
###
|
| 356 |
+
# Functions to invoke remote scripts and gather data from remote nodes
|
| 357 |
+
###
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def _wrap(items: List[str], quotes="'"):
|
| 361 |
+
return f"{quotes}{' '.join(items)}{quotes}"
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def create_and_get_archive_from_remote_node(
    remote_node: Node, parameters: GetParameters, script_path: str = "ray"
) -> Optional[str]:
    """Create an archive containing logs on a remote node and transfer.

    This will call ``ray local-dump --stream`` on the remote
    node. The resulting file will be saved locally in a temporary file and
    returned.

    Args:
        remote_node: Remote node to gather archive from.
        parameters: Parameters (settings) for getting data.
        script_path: Path to this script on the remote node.

    Returns:
        Path to a temporary file containing the node's collected data.

    Raises:
        RemoteCommandFailed: If the remote dump command exits non-zero.
    """
    cmd = [
        "ssh",
        "-o StrictHostKeyChecking=no",
        "-o UserKnownHostsFile=/dev/null",
        "-o LogLevel=ERROR",
        "-i",
        remote_node.ssh_key,
        f"{remote_node.ssh_user}@{remote_node.host}",
    ]

    if remote_node.docker_container:
        # Run the dump inside the node's container rather than on the host.
        cmd += [
            "docker",
            "exec",
            remote_node.docker_container,
        ]

    collect_cmd = [script_path, "local-dump", "--stream"]
    collect_cmd += ["--logs"] if parameters.logs else ["--no-logs"]
    collect_cmd += ["--debug-state"] if parameters.debug_state else ["--no-debug-state"]
    collect_cmd += ["--pip"] if parameters.pip else ["--no-pip"]
    collect_cmd += ["--processes"] if parameters.processes else ["--no-processes"]
    if parameters.processes:
        # NOTE(review): "--no-proccesses-verbose" is misspelled, but it is
        # kept byte-identical on purpose — it must match the flag name as
        # declared by the remote `ray local-dump` CLI. Fix both together or
        # not at all.
        collect_cmd += (
            ["--processes-verbose"]
            if parameters.processes_verbose
            else ["--no-proccesses-verbose"]
        )

    cmd += ["/bin/bash", "-c", _wrap(collect_cmd, quotes='"')]

    cat = "node" if not remote_node.is_head else "head"

    cli_logger.print(f"Collecting data from remote node: {remote_node.host}")
    # Fix: `tempfile.mkstemp(...)[1]` discarded the open OS-level file
    # descriptor returned by mkstemp, leaking one fd per remote node. Close
    # it explicitly; the path is reopened for writing below.
    fd, tmp = tempfile.mkstemp(
        prefix=f"ray_{cat}_{remote_node.host}_", suffix=".tar.gz"
    )
    os.close(fd)
    with open(tmp, "wb") as fp:
        try:
            subprocess.check_call(cmd, stdout=fp, stderr=sys.stderr)
        except subprocess.CalledProcessError as exc:
            raise RemoteCommandFailed(
                f"Gathering logs from remote node failed: {' '.join(cmd)}"
            ) from exc

    return tmp
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def create_and_add_remote_data_to_local_archive(
    archive: Archive, remote_node: Node, parameters: GetParameters
):
    """Create and get data from remote node and add to local archive.

    Args:
        archive: Archive object to add remote data to.
        remote_node: Remote node to gather archive from.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    # Fetch the remote node's dump into a local temporary file first.
    remote_archive_path = create_and_get_archive_from_remote_node(
        remote_node, parameters
    )

    if not archive.is_open:
        archive.open()

    node_kind = "head" if remote_node.is_head else "node"

    with archive.subdir("", root=os.path.dirname(remote_archive_path)) as sd:
        sd.add(
            remote_archive_path,
            arcname=f"ray_{node_kind}_{remote_node.host}.tar.gz",
        )

    return archive
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def create_and_add_local_data_to_local_archive(
    archive: Archive, parameters: GetParameters
):
    """Create and get data from this node and add to archive.

    Args:
        archive: Archive object to add remote data to.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    # Collect everything local into its own nested archive first.
    with Archive() as local_data_archive:
        get_all_local_data(local_data_archive, parameters)

    if not archive.is_open:
        archive.open()

    local_file = local_data_archive.file
    with archive.subdir("", root=os.path.dirname(local_file)) as sd:
        sd.add(local_file, arcname="local_node.tar.gz")

    # The nested tarball has been copied into `archive`; drop the temp file.
    os.remove(local_file)

    return archive
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
def create_archive_for_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the remote nodes.

    This will parallelize calls to get data from remote nodes.

    Fix: the futures returned by ``executor.submit`` were previously
    discarded, so any exception raised in a worker thread (e.g.
    ``RemoteCommandFailed`` from an unreachable node) disappeared silently.
    They are now collected and logged so failures are visible.

    Args:
        archive: Archive object to add remote data to.
        remote_nodes (Sequence[Node]): Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SSH_WORKERS) as executor:
        futures = [
            executor.submit(
                create_and_add_remote_data_to_local_archive,
                archive=archive,
                remote_node=remote_node,
                parameters=parameters,
            )
            for remote_node in remote_nodes
        ]

    # The `with` block above waits for all workers to finish. Report any
    # failures instead of dropping them; logging (rather than raising)
    # preserves the original best-effort behavior for callers.
    for future in futures:
        exc = future.exception()
        if exc is not None:
            cli_logger.error(exc)

    return archive
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def create_archive_for_local_and_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the local and remote nodes.

    This will parallelize calls to get data from remote nodes.

    Args:
        archive: Archive object to add data to.
        remote_nodes (Sequence[Node]): Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    # Local collection is best-effort: a failure is reported but must not
    # prevent gathering data from the remote nodes.
    try:
        create_and_add_local_data_to_local_archive(archive, parameters)
    except CommandFailed as exc:
        cli_logger.error(exc)

    create_archive_for_remote_nodes(archive, remote_nodes, parameters)

    summary = f"Collected data from local node and {len(remote_nodes)} remote nodes."
    cli_logger.print(summary)
    return archive
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
###
|
| 544 |
+
# Ray cluster info
|
| 545 |
+
###
|
| 546 |
+
def get_info_from_ray_cluster_config(
    cluster_config: str,
) -> Tuple[List[str], str, str, Optional[str], Optional[str]]:
    """Get information from Ray cluster config.

    Return list of host IPs, ssh user, ssh key file, and optional docker
    container.

    Args:
        cluster_config: Path to ray cluster config.

    Returns:
        Tuple of list of host IPs, ssh user name, ssh key file path,
        optional docker container name, optional cluster name.
    """
    # Imported lazily; presumably avoids a circular import at module load
    # time — TODO confirm against the module layout.
    from ray.autoscaler._private.commands import _bootstrap_config

    cli_logger.print(
        f"Retrieving cluster information from ray cluster file: " f"{cluster_config}"
    )

    cluster_config = os.path.expanduser(cluster_config)

    # Fix: the previous `open(cluster_config).read()` never closed the file
    # handle; use a context manager so it is released deterministically.
    with open(cluster_config) as config_file:
        config = yaml.safe_load(config_file)
    config = _bootstrap_config(config, no_config_cache=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})
    worker_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    # Head node(s) first, then workers.
    hosts = [provider.external_ip(node) for node in head_nodes + worker_nodes]
    ssh_user = config["auth"]["ssh_user"]
    ssh_key = config["auth"]["ssh_private_key"]

    docker = None
    docker_config = config.get("docker", None)
    if docker_config:
        docker = docker_config.get("container_name", None)

    cluster_name = config.get("cluster_name", None)

    return hosts, ssh_user, ssh_key, docker, cluster_name
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
def _info_from_params(
    cluster: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
):
    """Parse command line arguments.

    Resolves target hosts and SSH credentials from an explicit ``--host``
    list, a cluster config file, or (as a last resort) the bootstrap config
    that Ray writes on cluster head nodes. Explicit CLI values take
    precedence over values derived from the cluster config.

    Note: This returns a list of hosts, not a comma separated string!

    Returns:
        Tuple of (cluster config path or None, list of hosts, ssh user,
        ssh key path or None, docker container name or None,
        cluster name or None).
    """
    # Neither a host nor a cluster file given: fall back to the bootstrap
    # config present on Ray head nodes, if it exists.
    if not host and not cluster:
        bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml")
        if os.path.exists(bootstrap_config):
            cluster = bootstrap_config
            cli_logger.warning(
                f"Detected cluster config file at {cluster}. "
                f"If this is incorrect, specify with "
                f"`ray cluster-dump <config>`"
            )
    elif cluster:
        cluster = os.path.expanduser(cluster)

    cluster_name = None

    if cluster:
        # Derive hosts/credentials from the cluster config; any explicitly
        # passed arguments win over the derived values.
        h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster)

        ssh_user = ssh_user or u
        ssh_key = ssh_key or k
        docker = docker or d
        # An explicit --host list still overrides the config-derived hosts.
        hosts = host.split(",") if host else h

        if not hosts:
            raise LocalCommandFailed(
                f"Invalid cluster file or cluster has no running nodes: " f"{cluster}"
            )
    elif host:
        hosts = host.split(",")
    else:
        raise LocalCommandFailed(
            "You need to either specify a `<cluster_config>` or `--host`."
        )

    if not ssh_user:
        # No user resolved anywhere: use the default and tell the user how
        # to override it.
        ssh_user = DEFAULT_SSH_USER
        cli_logger.warning(
            f"Using default SSH user `{ssh_user}`. "
            f"If this is incorrect, specify with `--ssh-user <user>`"
        )

    if not ssh_key:
        # Probe the conventional key locations; first existing file wins.
        # Note: ssh_key may remain None if no candidate exists.
        for cand_key in DEFAULT_SSH_KEYS:
            cand_key_file = os.path.expanduser(cand_key)
            if os.path.exists(cand_key_file):
                ssh_key = cand_key_file
                cli_logger.warning(
                    f"Auto detected SSH key file: {ssh_key}. "
                    f"If this is incorrect, specify with `--ssh-key <key>`"
                )
                break

    return cluster, hosts, ssh_user, ssh_key, docker, cluster_name
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py
ADDED
|
@@ -0,0 +1,921 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
import sys
|
| 7 |
+
import time
|
| 8 |
+
from getpass import getuser
|
| 9 |
+
from shlex import quote
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
|
| 12 |
+
import click
|
| 13 |
+
|
| 14 |
+
from ray._private.ray_constants import DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES
|
| 15 |
+
from ray.autoscaler._private.cli_logger import cf, cli_logger
|
| 16 |
+
from ray.autoscaler._private.constants import (
|
| 17 |
+
AUTOSCALER_NODE_SSH_INTERVAL_S,
|
| 18 |
+
AUTOSCALER_NODE_START_WAIT_S,
|
| 19 |
+
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
|
| 20 |
+
)
|
| 21 |
+
from ray.autoscaler._private.docker import (
|
| 22 |
+
check_bind_mounts_cmd,
|
| 23 |
+
check_docker_image,
|
| 24 |
+
check_docker_running_cmd,
|
| 25 |
+
docker_start_cmds,
|
| 26 |
+
with_docker_exec,
|
| 27 |
+
)
|
| 28 |
+
from ray.autoscaler._private.log_timer import LogTimer
|
| 29 |
+
from ray.autoscaler._private.subprocess_output_util import (
|
| 30 |
+
ProcessRunnerError,
|
| 31 |
+
is_output_redirected,
|
| 32 |
+
run_cmd_redirected,
|
| 33 |
+
)
|
| 34 |
+
from ray.autoscaler.command_runner import CommandRunnerInterface
|
| 35 |
+
|
| 36 |
+
logger = logging.getLogger(__name__)
|
| 37 |
+
|
| 38 |
+
# NOTE(review): the original comment here ("How long to wait for a node to
# start, in seconds") does not describe HASH_MAX_LENGTH; the actual startup
# wait is AUTOSCALER_NODE_START_WAIT_S, imported above. Kept as a note
# rather than deleted.
# Number of hex digits kept from the sha1 digests used to build the ssh
# ControlPath (see SSHCommandRunner.__init__).
HASH_MAX_LENGTH = 10
# Helper script used to rsync files through `kubectl exec`.
KUBECTL_RSYNC = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "_kubernetes/kubectl-rsync.sh"
)
# Retry count / delay presumably used when resolving the remote home
# directory — usage is outside this chunk, TODO confirm.
MAX_HOME_RETRIES = 3
HOME_RETRY_DELAY_S = 5

# Process-wide toggles; mutated only via set_using_login_shells() and
# set_rsync_silent() below.
_config = {"use_login_shells": True, "silent_rsync": True}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def is_rsync_silent():
    """Return True when rsync output is currently being suppressed."""
    silent_flag = _config["silent_rsync"]
    return silent_flag
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def set_rsync_silent(val):
    """Choose whether to silence rsync output.

    Most commands will want to list rsync'd files themselves rather than
    print the default rsync spew.
    """
    _config.update(silent_rsync=val)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def is_using_login_shells():
    """Return True when commands are executed through login shells."""
    login_shells = _config["use_login_shells"]
    return login_shells
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def set_using_login_shells(val: bool):
    """Choose between login and non-interactive shells.

    Non-interactive shells have the benefit of receiving less output from
    subcommands (since progress bars and TTY control codes are not printed).
    Sometimes this can be significant since e.g. `pip install` prints
    hundreds of progress bar lines when downloading.

    Login shells have the benefit of working very close to how a proper bash
    session does, regarding how scripts execute and how the environment is
    setup. This is also how all commands were ran in the past. The only reason
    to use login shells over non-interactive shells is if you need some weird
    and non-robust tool to work.

    Args:
        val: If true, login shells will be used to run all commands.
    """
    _config.update(use_login_shells=val)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _with_environment_variables(cmd: str, environment_variables: Dict[str, object]):
|
| 87 |
+
"""Prepend environment variables to a shell command.
|
| 88 |
+
|
| 89 |
+
Args:
|
| 90 |
+
cmd: The base command.
|
| 91 |
+
environment_variables (Dict[str, object]): The set of environment
|
| 92 |
+
variables. If an environment variable value is a dict, it will
|
| 93 |
+
automatically be converted to a one line yaml string.
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
+
as_strings = []
|
| 97 |
+
for key, val in environment_variables.items():
|
| 98 |
+
val = json.dumps(val, separators=(",", ":"))
|
| 99 |
+
s = "export {}={};".format(key, quote(val))
|
| 100 |
+
as_strings.append(s)
|
| 101 |
+
all_vars = "".join(as_strings)
|
| 102 |
+
return all_vars + cmd
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _with_interactive(cmd):
|
| 106 |
+
force_interactive = (
|
| 107 |
+
f"source ~/.bashrc; "
|
| 108 |
+
f"export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({cmd})"
|
| 109 |
+
)
|
| 110 |
+
return ["bash", "--login", "-c", "-i", quote(force_interactive)]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class SSHOptions:
    """Collects `-o Key=Value` ssh options plus an optional identity file."""

    def __init__(self, ssh_key, control_path=None, **kwargs):
        self.ssh_key = ssh_key
        self.arg_dict = {
            # Suppresses initial fingerprint verification.
            "StrictHostKeyChecking": "no",
            # SSH IP and fingerprint pairs no longer added to known_hosts.
            # This is to remove a "REMOTE HOST IDENTIFICATION HAS CHANGED"
            # warning if a new node has the same IP as a previously
            # deleted node, because the fingerprints will not match in
            # that case.
            "UserKnownHostsFile": os.devnull,
            # Try fewer extraneous key pairs.
            "IdentitiesOnly": "yes",
            # Abort if port forwarding fails (instead of just printing to
            # stderr).
            "ExitOnForwardFailure": "yes",
            # Quickly kill the connection if network connection breaks (as
            # opposed to hanging/blocking).
            "ServerAliveInterval": 5,
            "ServerAliveCountMax": 3,
        }
        if control_path:
            # Enable connection multiplexing through the given directory.
            self.arg_dict["ControlMaster"] = "auto"
            self.arg_dict["ControlPath"] = "{}/%C".format(control_path)
            self.arg_dict["ControlPersist"] = "10s"
        # Caller-supplied overrides/extras win over the defaults.
        self.arg_dict.update(kwargs)

    def to_ssh_options_list(self, *, timeout=60):
        """Render the options as a flat argv fragment for `ssh`/`rsync -e`.

        Options whose value is None are omitted; the identity file (if any)
        comes first.
        """
        self.arg_dict["ConnectTimeout"] = "{}s".format(timeout)
        rendered = ["-i", self.ssh_key] if self.ssh_key else []
        for key, value in self.arg_dict.items():
            if value is not None:
                rendered.append("-o")
                rendered.append("{}={}".format(key, value))
        return rendered
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class SSHCommandRunner(CommandRunnerInterface):
    """Runs commands and rsyncs files on a cluster node over SSH."""

    def __init__(
        self,
        log_prefix,
        node_id,
        provider,
        auth_config,
        cluster_name,
        process_runner,
        use_internal_ip,
    ):

        # Hash the user and cluster name into a short ControlPath directory
        # so concurrent users/clusters do not share ssh control sockets.
        ssh_control_hash = hashlib.sha1(cluster_name.encode()).hexdigest()
        ssh_user_hash = hashlib.sha1(getuser().encode()).hexdigest()
        ssh_control_path = "/tmp/ray_ssh_{}/{}".format(
            ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH]
        )

        self.cluster_name = cluster_name
        self.log_prefix = log_prefix
        self.process_runner = process_runner
        self.node_id = node_id
        self.use_internal_ip = use_internal_ip
        self.provider = provider
        self.ssh_private_key = auth_config.get("ssh_private_key")
        self.ssh_user = auth_config["ssh_user"]
        self.ssh_control_path = ssh_control_path
        # Resolved lazily on first use by _set_ssh_ip_if_required().
        self.ssh_ip = None
        self.ssh_proxy_command = auth_config.get("ssh_proxy_command", None)
        self.ssh_options = SSHOptions(
            self.ssh_private_key,
            self.ssh_control_path,
            ProxyCommand=self.ssh_proxy_command,
        )

    def _get_node_ip(self):
        """Return the node's internal or external IP per configuration."""
        if self.use_internal_ip:
            return self.provider.internal_ip(self.node_id)
        else:
            return self.provider.external_ip(self.node_id)

    def _wait_for_ip(self, deadline):
        """Poll the provider for the node IP until ``deadline``.

        Returns:
            The IP string, or None if the deadline passed or the node was
            terminated before an IP became available.
        """
        # if we have IP do not print waiting info
        ip = self._get_node_ip()
        if ip is not None:
            cli_logger.labeled_value("Fetched IP", ip)
            return ip

        interval = AUTOSCALER_NODE_SSH_INTERVAL_S
        with cli_logger.group("Waiting for IP"):
            while time.time() < deadline and not self.provider.is_terminated(
                self.node_id
            ):
                ip = self._get_node_ip()
                if ip is not None:
                    cli_logger.labeled_value("Received", ip)
                    return ip
                cli_logger.print(
                    "Not yet available, retrying in {} seconds", cf.bold(str(interval))
                )
                time.sleep(interval)

        return None

    def _set_ssh_ip_if_required(self):
        """Resolve and cache the node IP; ensure the ControlPath dir exists."""
        if self.ssh_ip is not None:
            return

        # We assume that this never changes.
        # I think that's reasonable.
        deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
        with LogTimer(self.log_prefix + "Got IP"):
            ip = self._wait_for_ip(deadline)

            cli_logger.doassert(ip is not None, "Could not get node IP.")  # todo: msg
            assert ip is not None, "Unable to find IP of node"

        self.ssh_ip = ip

        # This should run before any SSH commands and therefore ensure that
        # the ControlPath directory exists, allowing SSH to maintain
        # persistent sessions later on.
        try:
            os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
        except OSError as e:
            cli_logger.warning("{}", str(e))  # todo: msg

    def _run_helper(
        self, final_cmd, with_output=False, exit_on_fail=False, silent=False
    ):
        """Run a command that was already setup with SSH and `bash` settings.

        Args:
            final_cmd (List[str]):
                Full command to run. Should include SSH options and other
                processing that we do.
            with_output (bool):
                If `with_output` is `True`, command stdout will be captured and
                returned.
            exit_on_fail (bool):
                If `exit_on_fail` is `True`, the process will exit
                if the command fails (exits with a code other than 0).
            silent (bool):
                Suppress command output when redirecting.

        Raises:
            ProcessRunnerError if using new log style and disabled
                login shells.
            click.ClickException if using login shells.
        """
        try:
            # For now, if the output is needed we just skip the new logic.
            # In the future we could update the new logic to support
            # capturing output, but it is probably not needed.
            if not with_output:
                return run_cmd_redirected(
                    final_cmd,
                    process_runner=self.process_runner,
                    silent=silent,
                    use_login_shells=is_using_login_shells(),
                )
            else:
                return self.process_runner.check_output(final_cmd)
        except subprocess.CalledProcessError as e:
            joined_cmd = " ".join(final_cmd)
            if not is_using_login_shells():
                raise ProcessRunnerError(
                    "Command failed",
                    "ssh_command_failed",
                    code=e.returncode,
                    command=joined_cmd,
                )

            if exit_on_fail:
                raise click.ClickException(
                    "Command failed:\n\n  {}\n".format(joined_cmd)
                ) from None
            else:
                fail_msg = "SSH command failed."
                if is_output_redirected():
                    fail_msg += " See above for the output from the failure."
                raise click.ClickException(fail_msg) from None
        finally:
            # Do our best to flush output to terminal.
            # See https://github.com/ray-project/ray/pull/19473.
            sys.stdout.flush()
            sys.stderr.flush()

    def run(
        self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        environment_variables: Dict[str, object] = None,
        run_env="auto",  # Unused argument.
        ssh_options_override_ssh_key="",
        shutdown_after_run=False,
        silent=False,
    ):
        """Run ``cmd`` on the node over SSH; see CommandRunnerInterface."""
        if shutdown_after_run:
            cmd += "; sudo shutdown -h now"

        if ssh_options_override_ssh_key:
            if self.ssh_proxy_command:
                ssh_options = SSHOptions(
                    ssh_options_override_ssh_key, ProxyCommand=self.ssh_proxy_command
                )
            else:
                ssh_options = SSHOptions(ssh_options_override_ssh_key)
        else:
            ssh_options = self.ssh_options

        assert isinstance(
            ssh_options, SSHOptions
        ), "ssh_options must be of type SSHOptions, got {}".format(type(ssh_options))

        self._set_ssh_ip_if_required()

        # -tt forces a pseudo-tty, which login shells need for interactivity.
        if is_using_login_shells():
            ssh = ["ssh", "-tt"]
        else:
            ssh = ["ssh"]

        if port_forward:
            with cli_logger.group("Forwarding ports"):
                if not isinstance(port_forward, list):
                    port_forward = [port_forward]
                for local, remote in port_forward:
                    cli_logger.verbose(
                        "Forwarding port {} to port {} on localhost.",
                        cf.bold(local),
                        cf.bold(remote),
                    )  # todo: msg
                    ssh += ["-L", "{}:localhost:{}".format(local, remote)]

        final_cmd = (
            ssh
            + ssh_options.to_ssh_options_list(timeout=timeout)
            + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
        )
        if cmd:
            if environment_variables:
                cmd = _with_environment_variables(cmd, environment_variables)
            if is_using_login_shells():
                final_cmd += _with_interactive(cmd)
            else:
                final_cmd += [cmd]
        else:
            # We do this because `-o ControlMaster` causes the `-N` flag to
            # still create an interactive shell in some ssh versions.
            final_cmd.append("while true; do sleep 86400; done")

        cli_logger.verbose("Running `{}`", cf.bold(cmd))
        with cli_logger.indented():
            cli_logger.very_verbose(
                "Full command is `{}`", cf.bold(" ".join(final_cmd))
            )

        if cli_logger.verbosity > 0:
            with cli_logger.indented():
                return self._run_helper(
                    final_cmd, with_output, exit_on_fail, silent=silent
                )
        else:
            return self._run_helper(final_cmd, with_output, exit_on_fail, silent=silent)

    def _create_rsync_filter_args(self, options):
        """Translate rsync_exclude/rsync_filter options into rsync argv."""
        rsync_excludes = options.get("rsync_exclude") or []
        rsync_filters = options.get("rsync_filter") or []

        exclude_args = [
            ["--exclude", rsync_exclude] for rsync_exclude in rsync_excludes
        ]
        filter_args = [
            ["--filter", "dir-merge,- {}".format(rsync_filter)]
            for rsync_filter in rsync_filters
        ]

        # Combine and flatten the two lists
        return [arg for args_list in exclude_args + filter_args for arg in args_list]

    def run_rsync_up(self, source, target, options=None):
        """Rsync ``source`` on the local machine to ``target`` on the node."""
        self._set_ssh_ip_if_required()
        options = options or {}

        command = ["rsync"]
        command += [
            "--rsh",
            subprocess.list2cmdline(
                ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
            ),
        ]
        command += ["-avz"]
        command += self._create_rsync_filter_args(options=options)
        command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)]
        cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
        self._run_helper(command, silent=is_rsync_silent())

    def run_rsync_down(self, source, target, options=None):
        """Rsync ``source`` on the node to ``target`` on the local machine."""
        self._set_ssh_ip_if_required()
        # Fix: run_rsync_up guards against options=None but this method did
        # not, so passing no options crashed in _create_rsync_filter_args
        # (None.get). Mirror the guard for consistency.
        options = options or {}

        command = ["rsync"]
        command += [
            "--rsh",
            subprocess.list2cmdline(
                ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
            ),
        ]
        command += ["-avz"]
        command += self._create_rsync_filter_args(options=options)
        command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target]
        cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
        self._run_helper(command, silent=is_rsync_silent())

    def remote_shell_command_str(self):
        """Return a copy-pastable ssh command for reaching this node."""
        if self.ssh_private_key:
            return "ssh -o IdentitiesOnly=yes -i {} {}@{}\n".format(
                self.ssh_private_key, self.ssh_user, self.ssh_ip
            )
        else:
            return "ssh -o IdentitiesOnly=yes {}@{}\n".format(
                self.ssh_user, self.ssh_ip
            )
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
class DockerCommandRunner(CommandRunnerInterface):
    """Command runner that executes commands inside a Docker (or Podman)
    container on a remote node, delegating host-level transport to an
    ``SSHCommandRunner``.
    """

    def __init__(self, docker_config, **common_args):
        # Underlying ssh runner used for all host-level commands and rsync.
        self.ssh_command_runner = SSHCommandRunner(**common_args)
        # Name of the container commands are exec'd into.
        self.container_name = docker_config["container_name"]
        self.docker_config = docker_config
        # Container HOME directory; resolved lazily by _docker_expand_user().
        self.home_dir = None
        # Set to True once run_init() has fully prepared the container.
        self.initialized = False
        # Optionally use 'podman' instead of 'docker'
        use_podman = docker_config.get("use_podman", False)
        self.docker_cmd = "podman" if use_podman else "docker"
|
| 454 |
+
|
| 455 |
+
    def run(
        self,
        cmd,
        timeout=120,
        exit_on_fail=False,
        port_forward=None,
        with_output=False,
        environment_variables: Dict[str, object] = None,
        run_env="auto",
        ssh_options_override_ssh_key="",
        shutdown_after_run=False,
    ):
        """Run *cmd*, either on the host or wrapped in a container exec.

        With ``run_env="auto"`` the command runs on the host when it is empty
        or already starts with the container CLI name; otherwise it is
        wrapped with ``with_docker_exec`` to execute inside the container.
        All execution is delegated to ``self.ssh_command_runner.run``.
        """
        if run_env == "auto":
            # NOTE(review): with use_podman, run_env becomes "podman" here,
            # which does not match the `run_env == "docker"` check below, so
            # the exec-wrapping branch is skipped — confirm this is intended.
            run_env = (
                "host"
                if (not bool(cmd) or cmd.find(self.docker_cmd) == 0)
                else self.docker_cmd
            )

        if environment_variables:
            cmd = _with_environment_variables(cmd, environment_variables)

        if run_env == "docker":
            # Expand every "~/" to the container's HOME before exec'ing.
            cmd = self._docker_expand_user(cmd, any_char=True)
            if is_using_login_shells():
                cmd = " ".join(_with_interactive(cmd))
            cmd = with_docker_exec(
                [cmd],
                container_name=self.container_name,
                with_interactive=is_using_login_shells(),
                docker_cmd=self.docker_cmd,
            )[0]

        if shutdown_after_run:
            # sudo shutdown should run after `with_docker_exec` command above
            cmd += "; sudo shutdown -h now"
        # Do not pass shutdown_after_run argument to ssh_command_runner.run()
        # since it is handled above.
        return self.ssh_command_runner.run(
            cmd,
            timeout=timeout,
            exit_on_fail=exit_on_fail,
            port_forward=port_forward,
            with_output=with_output,
            ssh_options_override_ssh_key=ssh_options_override_ssh_key,
        )
|
| 501 |
+
|
| 502 |
+
    def run_rsync_up(self, source, target, options=None):
        """Sync local *source* into the container at *target*.

        Two-phase: first rsync to a host-side mount directory (created and
        chowned if needed), then — if the container is running and the path
        is not already bind-mounted — rsync from the host into the container
        via ``<docker_cmd> exec -i``.
        """
        options = options or {}
        # Host-side staging path derived from the per-cluster mount root.
        host_destination = os.path.join(
            self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
            target.lstrip("/"),
        )

        host_mount_location = os.path.dirname(host_destination.rstrip("/"))
        self.ssh_command_runner.run(
            f"mkdir -p {host_mount_location} && chown -R "
            f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
            silent=is_rsync_silent(),
        )

        self.ssh_command_runner.run_rsync_up(source, host_destination, options=options)
        # Skip the container copy when Docker already bind-mounts this path.
        if self._check_container_status() and not options.get(
            "docker_mount_if_possible", False
        ):
            if os.path.isdir(source):
                # Adding a "." means that docker copies the *contents*
                # Without it, docker copies the source *into* the target
                host_destination += "/."

            # This path may not exist inside the container. This ensures
            # that the path is created!
            prefix = with_docker_exec(
                [
                    "mkdir -p {}".format(
                        os.path.dirname(self._docker_expand_user(target))
                    )
                ],
                container_name=self.container_name,
                with_interactive=is_using_login_shells(),
                docker_cmd=self.docker_cmd,
            )[0]

            self.ssh_command_runner.run(
                "{} && rsync -e '{} exec -i' -avz {} {}:{}".format(
                    prefix,
                    self.docker_cmd,
                    host_destination,
                    self.container_name,
                    self._docker_expand_user(target),
                ),
                silent=is_rsync_silent(),
            )
|
| 548 |
+
|
| 549 |
+
    def run_rsync_down(self, source, target, options=None):
        """Sync *source* from inside the container down to local *target*.

        Mirrors ``run_rsync_up``: copy container -> host staging directory
        (unless the path is bind-mounted), then rsync host -> local machine.
        """
        options = options or {}
        # Host-side staging path derived from the per-cluster mount root.
        host_source = os.path.join(
            self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
            source.lstrip("/"),
        )
        host_mount_location = os.path.dirname(host_source.rstrip("/"))
        self.ssh_command_runner.run(
            f"mkdir -p {host_mount_location} && chown -R "
            f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
            silent=is_rsync_silent(),
        )
        if source[-1] == "/":
            source += "."
            # Adding a "." means that docker copies the *contents*
            # Without it, docker copies the source *into* the target
        if not options.get("docker_mount_if_possible", False):
            # NOTE: `--delete` is okay here because the container is the source
            # of truth.
            self.ssh_command_runner.run(
                "rsync -e '{} exec -i' -avz --delete {}:{} {}".format(
                    self.docker_cmd,
                    self.container_name,
                    self._docker_expand_user(source),
                    host_source,
                ),
                silent=is_rsync_silent(),
            )
        self.ssh_command_runner.run_rsync_down(host_source, target, options=options)
|
| 578 |
+
|
| 579 |
+
def remote_shell_command_str(self):
|
| 580 |
+
inner_str = (
|
| 581 |
+
self.ssh_command_runner.remote_shell_command_str()
|
| 582 |
+
.replace("ssh", "ssh -tt", 1)
|
| 583 |
+
.strip("\n")
|
| 584 |
+
)
|
| 585 |
+
return inner_str + " {} exec -it {} /bin/bash\n".format(
|
| 586 |
+
self.docker_cmd, self.container_name
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
+
    def _check_docker_installed(self):
        """Check for the container CLI on the remote host; log install help.

        Runs ``command -v`` over ssh; if the binary is missing, logs an error
        with the appropriate install commands (Docker's convenience script or
        apt-get for podman). Does not raise — only logs.
        """
        no_exist = "NoExist"
        output = self.ssh_command_runner.run(
            f"command -v {self.docker_cmd} || echo '{no_exist}'", with_output=True
        )
        cleaned_output = output.decode().strip()
        # The "docker" substring check also matches podman paths containing
        # "docker"? NOTE(review): it requires "docker" in the resolved path
        # even when docker_cmd == "podman" — confirm this is intended.
        if no_exist in cleaned_output or "docker" not in cleaned_output:
            if self.docker_cmd == "docker":
                install_commands = [
                    "curl -fsSL https://get.docker.com -o get-docker.sh",
                    "sudo sh get-docker.sh",
                    "sudo usermod -aG docker $USER",
                    "sudo systemctl restart docker -f",
                ]
            else:
                install_commands = [
                    "sudo apt-get update",
                    "sudo apt-get -y install podman",
                ]

            logger.error(
                f"{self.docker_cmd.capitalize()} not installed. You can "
                f"install {self.docker_cmd.capitalize()} by adding the "
                "following commands to 'initialization_commands':\n"
                + "\n".join(install_commands)
            )
|
| 615 |
+
|
| 616 |
+
    def _check_container_status(self):
        """Return True if the target container is currently running.

        Short-circuits to True once ``run_init`` has completed; otherwise
        queries the remote daemon via ``check_docker_running_cmd``.
        """
        if self.initialized:
            return True
        output = (
            self.ssh_command_runner.run(
                check_docker_running_cmd(self.container_name, self.docker_cmd),
                with_output=True,
            )
            .decode("utf-8")
            .strip()
        )
        # Checks for the false positive where "true" is in the container name
        return "true" in output.lower() and "no such object" not in output.lower()
|
| 629 |
+
|
| 630 |
+
def _docker_expand_user(self, string, any_char=False):
|
| 631 |
+
user_pos = string.find("~")
|
| 632 |
+
if user_pos > -1:
|
| 633 |
+
if self.home_dir is None:
|
| 634 |
+
self.home_dir = (
|
| 635 |
+
self.ssh_command_runner.run(
|
| 636 |
+
f"{self.docker_cmd} exec {self.container_name} "
|
| 637 |
+
"printenv HOME",
|
| 638 |
+
with_output=True,
|
| 639 |
+
)
|
| 640 |
+
.decode("utf-8")
|
| 641 |
+
.strip()
|
| 642 |
+
)
|
| 643 |
+
|
| 644 |
+
if any_char:
|
| 645 |
+
return string.replace("~/", self.home_dir + "/")
|
| 646 |
+
|
| 647 |
+
elif not any_char and user_pos == 0:
|
| 648 |
+
return string.replace("~", self.home_dir, 1)
|
| 649 |
+
|
| 650 |
+
return string
|
| 651 |
+
|
| 652 |
+
    def _check_if_container_restart_is_needed(
        self, image: str, cleaned_bind_mounts: Dict[str, str]
    ) -> bool:
        """Return True if the running container must be restarted.

        A restart is required when the YAML requests bind mounts that the
        running container does not have. A mismatched image is logged as an
        error but — as written — does NOT by itself trigger a restart.
        """
        re_init_required = False
        running_image = (
            self.run(
                check_docker_image(self.container_name, self.docker_cmd),
                with_output=True,
                run_env="host",
            )
            .decode("utf-8")
            .strip()
        )
        if running_image != image:
            cli_logger.error(
                "A container with name {} is running image {} instead "
                + "of {} (which was provided in the YAML)",
                self.container_name,
                running_image,
                image,
            )
        mounts = (
            self.run(
                check_bind_mounts_cmd(self.container_name, self.docker_cmd),
                with_output=True,
                run_env="host",
            )
            .decode("utf-8")
            .strip()
        )
        try:
            active_mounts = json.loads(mounts)
            active_remote_mounts = {
                mnt["Destination"].strip("/") for mnt in active_mounts
            }
            # Ignore ray bootstrap files.
            requested_remote_mounts = {
                self._docker_expand_user(remote).strip("/")
                for remote in cleaned_bind_mounts.keys()
            }
            unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
            if unfulfilled_mounts:
                re_init_required = True
                cli_logger.warning(
                    "This Docker Container is already running. "
                    "Restarting the Docker container on "
                    "this node to pick up the following file_mounts {}",
                    unfulfilled_mounts,
                )
        except json.JSONDecodeError:
            # Best-effort: if the inspect output is not parseable JSON we
            # cannot compare mounts, so fall through without forcing a restart.
            cli_logger.verbose(
                "Unable to check if file_mounts specified in the YAML "
                "differ from those on the running container."
            )
        return re_init_required
|
| 707 |
+
|
| 708 |
+
    def run_init(
        self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
    ):
        """Prepare the container on this node: pull image, (re)start, copy
        bootstrap files.

        Args:
            as_head: Whether this node is the head node (selects head/worker
                image and run options).
            file_mounts: Remote-path -> local-path mapping from the YAML.
            sync_run_yet: Whether the first file sync has already run; if not,
                container start is deferred and this returns True early.

        Returns:
            True if a new `docker run` was executed (or must still happen),
            False otherwise.
        """
        # These two files are copied in explicitly rather than bind mounted.
        BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]

        # Head/worker-specific image wins over the generic "image" key.
        specific_image = self.docker_config.get(
            f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image")
        )

        self._check_docker_installed()
        if self.docker_config.get("pull_before_run", True):
            assert specific_image, (
                "Image must be included in config if " + "pull_before_run is specified"
            )
            self.run(
                "{} pull {}".format(self.docker_cmd, specific_image), run_env="host"
            )
        else:
            # Only pull when the image is not already present locally.
            self.run(
                f"{self.docker_cmd} image inspect {specific_image} "
                "1> /dev/null 2>&1 || "
                f"{self.docker_cmd} pull {specific_image}"
            )

        # Bootstrap files cannot be bind mounted because docker opens the
        # underlying inode. When the file is switched, docker becomes outdated.
        cleaned_bind_mounts = file_mounts.copy()
        for mnt in BOOTSTRAP_MOUNTS:
            cleaned_bind_mounts.pop(mnt, None)

        docker_run_executed = False

        container_running = self._check_container_status()
        requires_re_init = False
        if container_running:
            requires_re_init = self._check_if_container_restart_is_needed(
                specific_image, cleaned_bind_mounts
            )
            if requires_re_init:
                self.run(
                    f"{self.docker_cmd} stop {self.container_name}", run_env="host"
                )

        if (not container_running) or requires_re_init:
            if not sync_run_yet:
                # Do not start the actual image as we need to run file_sync
                # first to ensure that all folders are created with the
                # correct ownership. Docker will create the folders with
                # `root` as the owner.
                return True
            # Get home directory
            image_env = (
                self.ssh_command_runner.run(
                    f"{self.docker_cmd} "
                    + "inspect -f '{{json .Config.Env}}' "
                    + specific_image,
                    with_output=True,
                )
                .decode()
                .strip()
            )
            home_directory = "/root"
            try:
                # Look for HOME=... among the image's configured env vars.
                for env_var in json.loads(image_env):
                    if env_var.startswith("HOME="):
                        home_directory = env_var.split("HOME=")[1]
                        break
            except json.JSONDecodeError as e:
                cli_logger.error(
                    "Unable to deserialize `image_env` to Python object. "
                    f"The `image_env` is:\n{image_env}"
                )
                raise e

            user_docker_run_options = self.docker_config.get(
                "run_options", []
            ) + self.docker_config.get(
                f"{'head' if as_head else 'worker'}_run_options", []
            )
            start_command = docker_start_cmds(
                self.ssh_command_runner.ssh_user,
                specific_image,
                cleaned_bind_mounts,
                self.container_name,
                # Auto-add --shm-size and --runtime=nvidia where applicable.
                self._configure_runtime(
                    self._auto_configure_shm(user_docker_run_options)
                ),
                self.ssh_command_runner.cluster_name,
                home_directory,
                self.docker_cmd,
            )
            self.run(start_command, run_env="host")
            docker_run_executed = True

        # Explicitly copy in ray bootstrap files.
        for mount in BOOTSTRAP_MOUNTS:
            if mount in file_mounts:
                if not sync_run_yet:
                    # NOTE(ilr) This rsync is needed because when starting from
                    # a stopped instance,  /tmp may be deleted and `run_init`
                    # is called before the first `file_sync` happens
                    self.run_rsync_up(file_mounts[mount], mount)
                self.ssh_command_runner.run(
                    "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format(
                        cmd=self.docker_cmd,
                        src=os.path.join(
                            self._get_docker_host_mount_location(
                                self.ssh_command_runner.cluster_name
                            ),
                            mount,
                        ),
                        container=self.container_name,
                        dst=self._docker_expand_user(mount),
                    )
                )
                try:
                    # Check if the current user has read permission.
                    # If they do not, try to change ownership!
                    self.run(
                        f"cat {mount} >/dev/null 2>&1 || "
                        f"sudo chown $(id -u):$(id -g) {mount}"
                    )
                except Exception:
                    lsl_string = (
                        self.run(f"ls -l {mount}", with_output=True)
                        .decode("utf-8")
                        .strip()
                    )
                    # The string is of format <Permission> <Links>
                    # <Owner> <Group> <Size> <Date> <Name>
                    permissions = lsl_string.split(" ")[0]
                    owner = lsl_string.split(" ")[2]
                    group = lsl_string.split(" ")[3]
                    current_user = (
                        self.run("whoami", with_output=True).decode("utf-8").strip()
                    )
                    cli_logger.warning(
                        f"File ({mount}) is owned by user:{owner} and group:"
                        f"{group} with permissions ({permissions}). The "
                        f"current user ({current_user}) does not have "
                        "permission to read these files, and Ray may not be "
                        "able to autoscale. This can be resolved by "
                        "installing `sudo` in your container, or adding a "
                        f"command like 'chown {current_user} {mount}' to "
                        "your `setup_commands`."
                    )
        self.initialized = True
        return docker_run_executed
|
| 857 |
+
|
| 858 |
+
    def _configure_runtime(self, run_options: List[str]) -> List[str]:
        """Append ``--runtime=nvidia`` when the host has usable NVIDIA GPUs.

        Probes the remote daemon's runtime list; if nvidia-container-runtime
        is present AND `nvidia-smi` succeeds, the nvidia runtime is added.
        Can be disabled via `disable_automatic_runtime_detection`.
        """
        if self.docker_config.get("disable_automatic_runtime_detection"):
            return run_options

        runtime_output = (
            self.ssh_command_runner.run(
                f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", with_output=True
            )
            .decode()
            .strip()
        )
        if "nvidia-container-runtime" in runtime_output:
            try:
                # Runtime present but GPUs may be absent; verify with nvidia-smi.
                self.ssh_command_runner.run("nvidia-smi", with_output=False)
                return run_options + ["--runtime=nvidia"]
            except Exception as e:
                logger.warning(
                    "Nvidia Container Runtime is present, but no GPUs found."
                )
                logger.debug(f"nvidia-smi error: {e}")
                return run_options

        return run_options
|
| 881 |
+
|
| 882 |
+
    def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
        """Append a ``--shm-size`` option sized from the host's free memory.

        Skipped when disabled via `disable_shm_size_detection` or when the
        user already passed any `--shm-size` run option. Any failure while
        probing/parsing /proc/meminfo is logged and leaves run_options
        unchanged (best-effort).
        """
        if self.docker_config.get("disable_shm_size_detection"):
            return run_options
        for run_opt in run_options:
            if "--shm-size" in run_opt:
                logger.info(
                    "Bypassing automatic SHM-Detection because of "
                    f"`run_option`: {run_opt}"
                )
                return run_options
        try:
            shm_output = (
                self.ssh_command_runner.run(
                    "cat /proc/meminfo || true", with_output=True
                )
                .decode()
                .strip()
            )
            # MemAvailable is reported in kB; second whitespace field is the value.
            available_memory = int(
                [ln for ln in shm_output.split("\n") if "MemAvailable" in ln][
                    0
                ].split()[1]
            )
            available_memory_bytes = available_memory * 1024
            # Overestimate SHM size by 10%
            shm_size = min(
                (available_memory_bytes * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
                DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,
            )
            # NOTE(review): shm_size is a float here, so the option becomes
            # e.g. --shm-size='123.4b' — confirm the daemon accepts this.
            return run_options + [f"--shm-size='{shm_size}b'"]
        except Exception as e:
            logger.warning(f"Received error while trying to auto-compute SHM size {e}")
            return run_options
|
| 915 |
+
|
| 916 |
+
    def _get_docker_host_mount_location(self, cluster_name: str) -> str:
        """Return the docker host mount directory location.

        Args:
            cluster_name: Name of the cluster this node belongs to.
        """
        # Imported here due to circular dependency in imports.
        from ray.autoscaler.sdk import get_docker_host_mount_location

        return get_docker_host_mount_location(cluster_name)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py
ADDED
|
@@ -0,0 +1,1631 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import datetime
|
| 3 |
+
import hashlib
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import shutil
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
import tempfile
|
| 12 |
+
import time
|
| 13 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 14 |
+
from types import ModuleType
|
| 15 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 16 |
+
|
| 17 |
+
import click
|
| 18 |
+
import yaml
|
| 19 |
+
|
| 20 |
+
import ray
|
| 21 |
+
from ray._private.usage import usage_lib
|
| 22 |
+
from ray.autoscaler._private import subprocess_output_util as cmd_output_util
|
| 23 |
+
from ray.autoscaler._private.autoscaler import AutoscalerSummary
|
| 24 |
+
from ray.autoscaler._private.cli_logger import cf, cli_logger
|
| 25 |
+
from ray.autoscaler._private.cluster_dump import (
|
| 26 |
+
Archive,
|
| 27 |
+
GetParameters,
|
| 28 |
+
Node,
|
| 29 |
+
_info_from_params,
|
| 30 |
+
create_archive_for_local_and_remote_nodes,
|
| 31 |
+
create_archive_for_remote_nodes,
|
| 32 |
+
get_all_local_data,
|
| 33 |
+
)
|
| 34 |
+
from ray.autoscaler._private.command_runner import (
|
| 35 |
+
set_rsync_silent,
|
| 36 |
+
set_using_login_shells,
|
| 37 |
+
)
|
| 38 |
+
from ray.autoscaler._private.constants import (
|
| 39 |
+
AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
|
| 40 |
+
MAX_PARALLEL_SHUTDOWN_WORKERS,
|
| 41 |
+
)
|
| 42 |
+
from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system
|
| 43 |
+
from ray.autoscaler._private.log_timer import LogTimer
|
| 44 |
+
from ray.autoscaler._private.node_provider_availability_tracker import (
|
| 45 |
+
NodeAvailabilitySummary,
|
| 46 |
+
)
|
| 47 |
+
from ray.autoscaler._private.providers import (
|
| 48 |
+
_NODE_PROVIDERS,
|
| 49 |
+
_PROVIDER_PRETTY_NAMES,
|
| 50 |
+
_get_node_provider,
|
| 51 |
+
)
|
| 52 |
+
from ray.autoscaler._private.updater import NodeUpdaterThread
|
| 53 |
+
from ray.autoscaler._private.util import (
|
| 54 |
+
LoadMetricsSummary,
|
| 55 |
+
format_info_string,
|
| 56 |
+
hash_launch_conf,
|
| 57 |
+
hash_runtime_conf,
|
| 58 |
+
prepare_config,
|
| 59 |
+
validate_config,
|
| 60 |
+
)
|
| 61 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 62 |
+
from ray.autoscaler.tags import (
|
| 63 |
+
NODE_KIND_HEAD,
|
| 64 |
+
NODE_KIND_WORKER,
|
| 65 |
+
STATUS_UNINITIALIZED,
|
| 66 |
+
STATUS_UP_TO_DATE,
|
| 67 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 68 |
+
TAG_RAY_NODE_KIND,
|
| 69 |
+
TAG_RAY_NODE_NAME,
|
| 70 |
+
TAG_RAY_NODE_STATUS,
|
| 71 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 72 |
+
)
|
| 73 |
+
from ray.experimental.internal_kv import _internal_kv_put, internal_kv_get_gcs_client
|
| 74 |
+
from ray.util.debug import log_once
|
| 75 |
+
|
| 76 |
+
try: # py3
|
| 77 |
+
from shlex import quote
|
| 78 |
+
except ImportError: # py2
|
| 79 |
+
from pipes import quote
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
logger = logging.getLogger(__name__)

# Valid values for the `run_env` option of cluster exec commands.
RUN_ENV_TYPES = ["auto", "host", "docker"]

# Seconds to sleep between node-state polls (used during teardown,
# head-node fetch, and post-stop waits).
POLL_INTERVAL = 5

# Type alias: one port-forward pair, or a list of such pairs.
# NOTE(review): which element is remote vs. local is defined by callers —
# not determinable from this file alone.
Port_forward = Union[Tuple[int, int], List[Tuple[int, int]]]
|
| 90 |
+
|
| 91 |
+
def try_logging_config(config: Dict[str, Any]) -> None:
    """Print provider-specific config details to the CLI (AWS only).

    Providers other than AWS are a no-op.
    """
    provider_type = config["provider"]["type"]
    if provider_type != "aws":
        return
    from ray.autoscaler._private.aws.config import log_to_cli

    log_to_cli(config)
|
| 98 |
+
def try_get_log_state(provider_config: Dict[str, Any]) -> Optional[dict]:
    """Snapshot provider-specific CLI log state (AWS only).

    Returns:
        The AWS log state dict, or None for providers without log state.
    """
    if provider_config["type"] != "aws":
        return None
    from ray.autoscaler._private.aws.config import get_log_state

    return get_log_state()
| 106 |
+
def try_reload_log_state(provider_config: Dict[str, Any], log_state: dict) -> None:
    """Restore provider-specific CLI log state captured earlier (AWS only).

    A falsy ``log_state`` or a non-AWS provider is a no-op.
    """
    if not log_state:
        return
    if provider_config["type"] != "aws":
        return
    from ray.autoscaler._private.aws.config import reload_log_state

    return reload_log_state(log_state)
|
| 115 |
+
def debug_status(
    status, error, verbose: bool = False, address: Optional[str] = None
) -> str:
    """
    Return a debug string for the autoscaler.

    Args:
        status: The autoscaler status string for v1 (bytes, JSON-encoded).
        error: The autoscaler error string for v1 (bytes), appended if set.
        verbose: Whether to print verbose information.
        address: The address of the cluster (gcs address).

    Returns:
        str: A debug string for the cluster's status.
    """
    from ray.autoscaler.v2.utils import is_autoscaler_v2

    # Fallback shown whenever no status has been reported yet. This message
    # was previously duplicated verbatim in two branches; keep a single copy.
    no_status_message = (
        "No cluster status. It may take a few seconds "
        "for the Ray internal services to start up."
    )

    if is_autoscaler_v2():
        from ray.autoscaler.v2.sdk import get_cluster_status
        from ray.autoscaler.v2.utils import ClusterStatusFormatter

        cluster_status = get_cluster_status(address)
        status = ClusterStatusFormatter.format(cluster_status, verbose=verbose)
    elif status:
        status = status.decode("utf-8")
        status_dict = json.loads(status)
        lm_summary_dict = status_dict.get("load_metrics_report")
        autoscaler_summary_dict = status_dict.get("autoscaler_report")
        timestamp = status_dict.get("time")
        gcs_request_time = status_dict.get("gcs_request_time")
        non_terminated_nodes_time = status_dict.get("non_terminated_nodes_time")
        if lm_summary_dict and autoscaler_summary_dict and timestamp:
            lm_summary = LoadMetricsSummary(**lm_summary_dict)
            # Pop the nested availability summary so the remaining keys can
            # be splatted directly into AutoscalerSummary.
            node_availability_summary_dict = autoscaler_summary_dict.pop(
                "node_availability_summary", {}
            )
            node_availability_summary = NodeAvailabilitySummary.from_fields(
                **node_availability_summary_dict
            )
            autoscaler_summary = AutoscalerSummary(
                node_availability_summary=node_availability_summary,
                **autoscaler_summary_dict,
            )
            report_time = datetime.datetime.fromtimestamp(timestamp)
            status = format_info_string(
                lm_summary,
                autoscaler_summary,
                time=report_time,
                gcs_request_time=gcs_request_time,
                non_terminated_nodes_time=non_terminated_nodes_time,
                verbose=verbose,
            )
        else:
            status = no_status_message
    else:
        status = no_status_message

    if error:
        status += "\n"
        status += error.decode("utf-8")

    return status
| 184 |
+
|
| 185 |
+
def request_resources(
    num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
) -> None:
    """Remotely request some CPU or GPU resources from the autoscaler.

    This function is to be called e.g. on a node before submitting a bunch of
    ray.remote calls to ensure that resources rapidly become available.

    Args:
        num_cpus: Scale the cluster to ensure this number of CPUs are
            available. This request is persistent until another call to
            request_resources() is made.
        bundles (List[ResourceDict]): Scale the cluster to ensure this set of
            resource shapes can fit. This request is persistent until another
            call to request_resources() is made.
    """
    if not ray.is_initialized():
        raise RuntimeError("Ray is not initialized yet")

    # Flatten both forms of the request into one list of resource bundles.
    to_request = []
    if num_cpus:
        to_request.extend({"CPU": 1} for _ in range(num_cpus))
    if bundles:
        to_request.extend(bundles)
    _internal_kv_put(
        AUTOSCALER_RESOURCE_REQUEST_CHANNEL, json.dumps(to_request), overwrite=True
    )

    from ray.autoscaler.v2.utils import is_autoscaler_v2

    if is_autoscaler_v2():
        from ray.autoscaler.v2.sdk import request_cluster_resources

        gcs_address = internal_kv_get_gcs_client().address
        request_cluster_resources(gcs_address, to_request)
+
|
| 221 |
+
def create_or_update_cluster(
    config_file: str,
    override_min_workers: Optional[int],
    override_max_workers: Optional[int],
    no_restart: bool,
    restart_only: bool,
    yes: bool,
    override_cluster_name: Optional[str] = None,
    no_config_cache: bool = False,
    redirect_command_output: Optional[bool] = False,
    use_login_shells: bool = True,
    no_monitor_on_head: bool = False,
) -> Dict[str, Any]:
    """Creates or updates an autoscaling Ray cluster from a config json.

    Args:
        config_file: Path to the cluster YAML config.
        override_min_workers: CLI override for ``min_workers``, if given.
        override_max_workers: CLI override for ``max_workers``, if given.
        no_restart: Do not restart the Ray runtime on an existing head.
        restart_only: Restart the Ray runtime without re-running setup.
        yes: Skip interactive confirmations.
        override_cluster_name: CLI override for ``cluster_name``, if given.
        no_config_cache: Bypass the on-disk bootstrap-config cache.
        redirect_command_output: Whether remote command output is redirected;
            None means "do not redirect".
        use_login_shells: Run remote commands in login shells; when False,
            interactive commands are disabled.
        no_monitor_on_head: Internal flag used by the Ray K8s operator
            (see below).

    Returns:
        The bootstrapped cluster config that was used.
    """
    # no_monitor_on_head is an internal flag used by the Ray K8s operator.
    # If True, prevents autoscaling config sync to the Ray head during cluster
    # creation. See https://github.com/ray-project/ray/pull/13720.
    set_using_login_shells(use_login_shells)
    if not use_login_shells:
        cmd_output_util.set_allow_interactive(False)
    if redirect_command_output is None:
        # Do not redirect by default.
        cmd_output_util.set_output_redirected(False)
    else:
        cmd_output_util.set_output_redirected(redirect_command_output)

    def handle_yaml_error(e):
        # Pretty-print YAML parse failures, then abort the CLI.
        cli_logger.error("Cluster config invalid")
        cli_logger.newline()
        cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file)
        cli_logger.newline()
        with cli_logger.verbatim_error_ctx("PyYAML error:"):
            cli_logger.error(e)
        cli_logger.abort()

    try:
        # Use a context manager so the config file handle is not leaked
        # (previously the file was opened and never closed).
        with open(config_file) as f:
            config = yaml.safe_load(f.read())
    except FileNotFoundError:
        cli_logger.abort(
            "Provided cluster configuration file ({}) does not exist",
            cf.bold(config_file),
        )
    except yaml.parser.ParserError as e:
        handle_yaml_error(e)
        raise
    except yaml.scanner.ScannerError as e:
        handle_yaml_error(e)
        raise
    global_event_system.execute_callback(
        CreateClusterEvent.up_started, {"cluster_config": config}
    )

    # todo: validate file_mounts, ssh keys, etc.

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        cli_logger.abort(
            "Unknown provider type " + cf.bold("{}") + "\n"
            "Available providers are: {}",
            config["provider"]["type"],
            cli_logger.render_list(
                [k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None]
            ),
        )

    printed_overrides = False

    def handle_cli_override(key, override):
        # Apply a CLI-provided override to the config, warning when it
        # shadows a value already present in the YAML file.
        if override is not None:
            if key in config:
                nonlocal printed_overrides
                printed_overrides = True
                cli_logger.warning(
                    "`{}` override provided on the command line.\n"
                    " Using "
                    + cf.bold("{}")
                    + cf.dimmed(" [configuration file has " + cf.bold("{}") + "]"),
                    key,
                    override,
                    config[key],
                )
            config[key] = override

    handle_cli_override("min_workers", override_min_workers)
    handle_cli_override("max_workers", override_max_workers)
    handle_cli_override("cluster_name", override_cluster_name)

    if printed_overrides:
        cli_logger.newline()

    cli_logger.labeled_value("Cluster", config["cluster_name"])

    cli_logger.newline()
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    try_logging_config(config)
    get_or_create_head_node(
        config,
        config_file,
        no_restart,
        restart_only,
        yes,
        override_cluster_name,
        no_monitor_on_head,
    )
    return config
| 328 |
+
|
| 329 |
+
# Version stamp for the on-disk bootstrap-config cache used by
# _bootstrap_config; a mismatch forces the config to be re-resolved.
CONFIG_CACHE_VERSION = 1
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def _bootstrap_config(
    config: Dict[str, Any], no_config_cache: bool = False
) -> Dict[str, Any]:
    """Resolve and validate a cluster config via the provider's bootstrap step.

    The resolved config is cached on disk, keyed by a SHA-1 of the prepared
    config, so repeated invocations can skip provider-side resolution.

    Args:
        config: Raw cluster config loaded from YAML.
        no_config_cache: If True, neither read nor write the disk cache.

    Returns:
        The fully resolved ("bootstrapped") cluster config.
    """
    config = prepare_config(config)
    # NOTE: multi-node-type autoscaler is guaranteed to be in use after this.

    hasher = hashlib.sha1()
    hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
    cache_key = os.path.join(
        tempfile.gettempdir(), "ray-config-{}".format(hasher.hexdigest())
    )

    if os.path.exists(cache_key) and not no_config_cache:
        # Use a context manager so the cache file handle is not leaked
        # (previously the file was opened and never closed).
        with open(cache_key) as f:
            config_cache = json.loads(f.read())
        if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
            # todo: is it fine to re-resolve? afaik it should be.
            # we can have migrations otherwise or something
            # but this seems overcomplicated given that resolving is
            # relatively cheap
            try_reload_log_state(
                config_cache["config"]["provider"],
                config_cache.get("provider_log_info"),
            )

            if log_once("_printed_cached_config_warning"):
                cli_logger.verbose_warning(
                    "Loaded cached provider configuration from " + cf.bold("{}"),
                    cache_key,
                )
                if cli_logger.verbosity == 0:
                    cli_logger.warning("Loaded cached provider configuration")
                cli_logger.warning(
                    "If you experience issues with "
                    "the cloud provider, try re-running "
                    "the command with {}.",
                    cf.bold("--no-config-cache"),
                )

            return config_cache["config"]
        else:
            cli_logger.warning(
                "Found cached cluster config "
                "but the version " + cf.bold("{}") + " "
                "(expected " + cf.bold("{}") + ") does not match.\n"
                "This is normal if cluster launcher was updated.\n"
                "Config will be re-resolved.",
                config_cache.get("_version", "none"),
                CONFIG_CACHE_VERSION,
            )

    importer = _NODE_PROVIDERS.get(config["provider"]["type"])
    if not importer:
        raise NotImplementedError("Unsupported provider {}".format(config["provider"]))

    provider_cls = importer(config["provider"])

    cli_logger.print(
        "Checking {} environment settings",
        _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]),
    )
    try:
        config = provider_cls.fillout_available_node_types_resources(config)
    except Exception as exc:
        # Best-effort: resource autodetection failures should not block the
        # launch; validation below reports missing resources if needed.
        if cli_logger.verbosity > 2:
            logger.exception("Failed to autodetect node resources.")
        else:
            cli_logger.warning(
                f"Failed to autodetect node resources: {str(exc)}. "
                "You can see full stack trace with higher verbosity."
            )

    try:
        # NOTE: if `resources` field is missing, validate_config for providers
        # other than AWS and Kubernetes will fail (the schema error will ask
        # the user to manually fill the resources) as we currently support
        # autofilling resources for AWS and Kubernetes only.
        validate_config(config)
    except (ModuleNotFoundError, ImportError):
        cli_logger.abort(
            "Not all Ray autoscaler dependencies were found. "
            "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
            'only be usable via `pip install "ray[default]"`. Please '
            "update your install command."
        )
    resolved_config = provider_cls.bootstrap_config(config)

    if not no_config_cache:
        with open(cache_key, "w") as f:
            config_cache = {
                "_version": CONFIG_CACHE_VERSION,
                "provider_log_info": try_get_log_state(resolved_config["provider"]),
                "config": resolved_config,
            }
            f.write(json.dumps(config_cache))
    return resolved_config
| 428 |
+
|
| 429 |
+
def teardown_cluster(
    config_file: str,
    yes: bool,
    workers_only: bool,
    override_cluster_name: Optional[str],
    keep_min_workers: bool,
) -> None:
    """Destroys all nodes of a Ray cluster described by a config json.

    Args:
        config_file: Path to the cluster YAML config.
        yes: Skip interactive confirmation.
        workers_only: Keep the head node; terminate only worker nodes.
        override_cluster_name: Optional cluster-name override.
        keep_min_workers: Leave ``min_workers`` random workers running.
    """
    # Use a context manager so the config file handle is not leaked
    # (previously the file was opened and never closed).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "Destroying cluster.", _abort=True)

    if not workers_only:
        try:
            # Best-effort: stop the Ray runtime before terminating nodes.
            exec_cluster(
                config_file,
                cmd="ray stop",
                run_env="auto",
                screen=False,
                tmux=False,
                stop=False,
                start=False,
                override_cluster_name=override_cluster_name,
                port_forward=None,
                with_output=False,
            )
        except Exception as e:
            # todo: add better exception info
            cli_logger.verbose_error("{}", str(e))
            cli_logger.warning(
                "Exception occurred when stopping the cluster Ray runtime "
                "(use -v to dump teardown exceptions)."
            )
            cli_logger.warning(
                "Ignoring the exception and "
                "attempting to shut down the cluster nodes anyway."
            )

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def remaining_nodes():
        # Returns the nodes still to terminate, honoring --keep-min-workers
        # and --workers-only.
        workers = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

        if keep_min_workers:
            min_workers = config.get("min_workers", 0)
            cli_logger.print(
                "{} random worker nodes will not be shut down. "
                + cf.dimmed("(due to {})"),
                cf.bold(min_workers),
                cf.bold("--keep-min-workers"),
            )

            workers = random.sample(workers, len(workers) - min_workers)

        # todo: it's weird to kill the head node but not all workers
        if workers_only:
            cli_logger.print(
                "The head node will not be shut down. " + cf.dimmed("(due to {})"),
                cf.bold("--workers-only"),
            )

            return workers

        head = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})

        return head + workers

    def run_docker_stop(node, container_name):
        # Stop the Ray container on one node; failures are logged but never
        # block the teardown.
        try:
            updater = NodeUpdaterThread(
                node_id=node,
                provider_config=config["provider"],
                provider=provider,
                auth_config=config["auth"],
                cluster_name=config["cluster_name"],
                file_mounts=config["file_mounts"],
                initialization_commands=[],
                setup_commands=[],
                ray_start_commands=[],
                runtime_hash="",
                file_mounts_contents_hash="",
                is_head_node=False,
                docker_config=config.get("docker"),
            )

            _exec(
                updater,
                f"docker stop {container_name}",
                with_output=False,
                run_env="host",
            )
        except Exception:
            cli_logger.warning(f"Docker stop failed on {node}")

    # Loop here to check that both the head and worker nodes are actually
    # really gone
    nodes_to_terminate = remaining_nodes()

    container_name = config.get("docker", {}).get("container_name")
    if container_name:
        # This is to ensure that the parallel SSH calls below do not mess with
        # the users terminal.
        output_redir = cmd_output_util.is_output_redirected()
        cmd_output_util.set_output_redirected(True)
        allow_interactive = cmd_output_util.does_allow_interactive()
        cmd_output_util.set_allow_interactive(False)

        with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
            for node in nodes_to_terminate:
                executor.submit(
                    run_docker_stop, node=node, container_name=container_name
                )
        cmd_output_util.set_output_redirected(output_redir)
        cmd_output_util.set_allow_interactive(allow_interactive)
    with LogTimer("teardown_cluster: done."):
        while nodes_to_terminate:
            provider.terminate_nodes(nodes_to_terminate)

            cli_logger.print(
                "Requested {} nodes to shut down.",
                cf.bold(len(nodes_to_terminate)),
                _tags=dict(interval="1s"),
            )

            time.sleep(POLL_INTERVAL)  # todo: interval should be a variable
            nodes_to_terminate = remaining_nodes()
            cli_logger.print(
                "{} nodes remaining after {} second(s).",
                cf.bold(len(nodes_to_terminate)),
                POLL_INTERVAL,
            )
        cli_logger.success("No nodes remaining.")
+
|
| 565 |
+
def kill_node(
    config_file: str, yes: bool, hard: bool, override_cluster_name: Optional[str]
) -> Optional[str]:
    """Kills a random Raylet worker.

    Args:
        config_file: Path to the cluster YAML config.
        yes: Skip interactive confirmation.
        hard: Terminate the node at the provider level instead of running
            `ray stop` on it.
        override_cluster_name: Optional cluster-name override.

    Returns:
        The killed node's IP address, or None if no workers exist.
    """
    # Use a context manager so the config file handle is not leaked
    # (previously the file was opened and never closed).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config)

    cli_logger.confirm(yes, "A random node will be killed.")

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
    if not nodes:
        cli_logger.print("No worker nodes detected.")
        return None
    node = random.choice(nodes)
    cli_logger.print("Shutdown " + cf.bold("{}"), node)
    if hard:
        provider.terminate_node(node)
    else:
        # Soft kill: run `ray stop` on the node via the updater machinery.
        updater = NodeUpdaterThread(
            node_id=node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            file_mounts_contents_hash="",
            is_head_node=False,
            docker_config=config.get("docker"),
        )

        _exec(updater, "ray stop", False, False)

    time.sleep(POLL_INTERVAL)

    if config.get("provider", {}).get("use_internal_ips", False):
        node_ip = provider.internal_ip(node)
    else:
        node_ip = provider.external_ip(node)

    return node_ip
| 614 |
+
|
| 615 |
+
def monitor_cluster(
    cluster_config_file: str, num_lines: int, override_cluster_name: Optional[str]
) -> None:
    """Tails the autoscaler logs of a Ray cluster."""
    tail_cmd = f"tail -n {num_lines} -f /tmp/ray/session_latest/logs/monitor*"
    exec_options = dict(
        cmd=tail_cmd,
        run_env="auto",
        screen=False,
        tmux=False,
        stop=False,
        start=False,
        override_cluster_name=override_cluster_name,
        port_forward=None,
    )
    exec_cluster(cluster_config_file, **exec_options)
| 632 |
+
|
| 633 |
+
def warn_about_bad_start_command(
    start_commands: List[str], no_monitor_on_head: bool = False
) -> None:
    """Warn when the configured head start commands look misconfigured.

    Checks that a `ray start` command exists and that it passes
    `--autoscaling-config` (unless the monitor is intentionally disabled).

    Args:
        start_commands: The commands configured to start Ray on the head.
        no_monitor_on_head: True when the head intentionally runs no
            monitor, suppressing the missing autoscaling-config warning.
    """
    # Comprehension instead of list(filter(lambda ...)) — same result,
    # clearer and more idiomatic.
    ray_start_cmd = [cmd for cmd in start_commands if "ray start" in cmd]
    if not ray_start_cmd:
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"),
            cf.bold("head_start_ray_commands"),
        )

    autoscaling_config_in_ray_start_cmd = any(
        "autoscaling-config" in x for x in ray_start_cmd
    )
    if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"),
            cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"),
            cf.bold("head_start_ray_commands"),
        )
|
| 659 |
+
|
| 660 |
+
def get_or_create_head_node(
    config: Dict[str, Any],
    printable_config_file: str,
    no_restart: bool,
    restart_only: bool,
    yes: bool,
    override_cluster_name: Optional[str],
    no_monitor_on_head: bool = False,
    _provider: Optional[NodeProvider] = None,
    _runner: ModuleType = subprocess,
) -> None:
    """Create the cluster head node, which in turn creates the workers.

    Args:
        config: Bootstrapped cluster config.
        printable_config_file: Config path, used only in printed CLI hints.
        no_restart: Skip restarting Ray when reusing an existing head.
        restart_only: Restart Ray without re-running setup commands.
        yes: Skip interactive confirmations.
        override_cluster_name: Cluster-name override (affects printed hints).
        no_monitor_on_head: When True, skip syncing the bootstrap config to
            the head (used when no autoscaling monitor runs there).
        _provider: Node provider override (testing hook).
        _runner: subprocess-like module used to run commands (testing hook).
    """
    global_event_system.execute_callback(CreateClusterEvent.cluster_booting_started)
    provider = _provider or _get_node_provider(
        config["provider"], config["cluster_name"]
    )

    # Deep-copy: this function mutates config (e.g. via the head-node setup
    # below) and must not affect the caller's dict.
    config = copy.deepcopy(config)
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    # Reuse an existing live head node if the provider reports one.
    nodes = provider.non_terminated_nodes(head_node_tags)
    if len(nodes) > 0:
        head_node = nodes[0]
    else:
        head_node = None

    if not head_node:
        cli_logger.confirm(
            yes, "No head node found. Launching a new cluster.", _abort=True
        )
        cli_logger.newline()
        usage_lib.show_usage_stats_prompt(cli=True)

    if head_node:
        # A head already exists: confirm the kind of update being performed.
        if restart_only:
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and "
                "restarting the cluster Ray runtime. "
                "Setup commands will not be run due to `{}`.\n",
                cf.bold("--restart-only"),
                _abort=True,
            )
            cli_logger.newline()
            usage_lib.show_usage_stats_prompt(cli=True)
        elif no_restart:
            cli_logger.print(
                "Cluster Ray runtime will not be restarted due to `{}`.",
                cf.bold("--no-restart"),
            )
            cli_logger.confirm(
                yes,
                "Updating cluster configuration and running setup commands.",
                _abort=True,
            )
        else:
            cli_logger.print("Updating cluster configuration and running full setup.")
            cli_logger.confirm(
                yes, cf.bold("Cluster Ray runtime will be restarted."), _abort=True
            )
            cli_logger.newline()
            usage_lib.show_usage_stats_prompt(cli=True)

    cli_logger.newline()
    # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
    head_node_config = copy.deepcopy(config.get("head_node", {}))
    # The above `head_node` field is deprecated in favor of per-node-type
    # node_configs. We allow it for backwards-compatibility.
    head_node_resources = None
    head_node_labels = None
    head_node_type = config.get("head_node_type")
    if head_node_type:
        head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type
        head_config = config["available_node_types"][head_node_type]
        head_node_config.update(head_config["node_config"])

        # Not necessary to keep in sync with node_launcher.py
        # Keep in sync with autoscaler.py _node_resources
        head_node_resources = head_config.get("resources")
        head_node_labels = head_config.get("labels")

    # The launch hash detects config changes that require a new head node.
    launch_hash = hash_launch_conf(head_node_config, config["auth"])
    creating_new_head = _should_create_new_head(
        head_node, launch_hash, head_node_type, provider
    )
    if creating_new_head:
        with cli_logger.group("Acquiring an up-to-date head node"):
            global_event_system.execute_callback(
                CreateClusterEvent.acquiring_new_head_node
            )
            if head_node is not None:
                # An out-of-date head exists; replace it.
                cli_logger.confirm(yes, "Relaunching the head node.", _abort=True)

                provider.terminate_node(head_node)
                cli_logger.print("Terminated head node {}", head_node)

            head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
            head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
                config["cluster_name"]
            )
            head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED
            provider.create_node(head_node_config, head_node_tags, 1)
            cli_logger.print("Launched a new head node")

            start = time.time()
            head_node = None
            # Poll until the provider reports exactly one live head node,
            # with a 50-second timeout.
            with cli_logger.group("Fetching the new head node"):
                while True:
                    if time.time() - start > 50:
                        cli_logger.abort(
                            "Head node fetch timed out. Failed to create head node."
                        )
                    nodes = provider.non_terminated_nodes(head_node_tags)
                    if len(nodes) == 1:
                        head_node = nodes[0]
                        break
                    time.sleep(POLL_INTERVAL)
            cli_logger.newline()

    global_event_system.execute_callback(CreateClusterEvent.head_node_acquired)

    with cli_logger.group(
        "Setting up head node",
        _numbered=("<>", 1, 1),
        # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
        _tags=dict(),
    ):  # add id, ARN to tags?
        # TODO(ekl) right now we always update the head node even if the
        # hash matches.
        # We could prompt the user for what they want to do here.
        # No need to pass in cluster_sync_files because we use this
        # hash to set up the head node
        (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
            config["file_mounts"], None, config
        )

        if not no_monitor_on_head:
            # Return remote_config_file to avoid prematurely closing it.
            config, remote_config_file = _set_up_config_for_head_node(
                config, provider, no_restart
            )
            cli_logger.print("Prepared bootstrap config")

        # Decide which setup/start commands to run on the head.
        if restart_only:
            # Docker may re-launch nodes, requiring setup
            # commands to be rerun.
            if config.get("docker", {}).get("container_name"):
                setup_commands = config["head_setup_commands"]
            else:
                setup_commands = []
            ray_start_commands = config["head_start_ray_commands"]
        # If user passed in --no-restart and we're not creating a new head,
        # omit start commands.
        elif no_restart and not creating_new_head:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = []
        else:
            setup_commands = config["head_setup_commands"]
            ray_start_commands = config["head_start_ray_commands"]

        if not no_restart:
            warn_about_bad_start_command(ray_start_commands, no_monitor_on_head)

        updater = NodeUpdaterThread(
            node_id=head_node,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=config["initialization_commands"],
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            process_runner=_runner,
            runtime_hash=runtime_hash,
            file_mounts_contents_hash=file_mounts_contents_hash,
            is_head_node=True,
            node_resources=head_node_resources,
            node_labels=head_node_labels,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter"),
            },
            docker_config=config.get("docker"),
            restart_only=restart_only,
        )
        updater.start()
        updater.join()

        # Refresh the node cache so we see the external ip if available
        provider.non_terminated_nodes(head_node_tags)

        if updater.exitcode != 0:
            # todo: this does not follow the mockup and is not good enough
            cli_logger.abort("Failed to setup head node.")
            sys.exit(1)

    global_event_system.execute_callback(
        CreateClusterEvent.cluster_booting_completed,
        {
            "head_node_id": head_node,
        },
    )

    monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
    if override_cluster_name:
        modifiers = " --cluster-name={}".format(quote(override_cluster_name))
    else:
        modifiers = ""

    # Print follow-up commands for the user.
    cli_logger.newline()
    with cli_logger.group("Useful commands:"):
        printable_config_file = os.path.abspath(printable_config_file)

        cli_logger.print("To terminate the cluster:")
        cli_logger.print(cf.bold(f" ray down {printable_config_file}{modifiers}"))
        cli_logger.newline()

        cli_logger.print("To retrieve the IP address of the cluster head:")
        cli_logger.print(
            cf.bold(f" ray get-head-ip {printable_config_file}{modifiers}")
        )
        cli_logger.newline()

        cli_logger.print(
            "To port-forward the cluster's Ray Dashboard to the local machine:"
        )
        cli_logger.print(cf.bold(f" ray dashboard {printable_config_file}{modifiers}"))
        cli_logger.newline()

        cli_logger.print(
            "To submit a job to the cluster, port-forward the "
            "Ray Dashboard in another terminal and run:"
        )
        cli_logger.print(
            cf.bold(
                " ray job submit --address http://localhost:<dashboard-port> "
                "--working-dir . -- python my_script.py"
            )
        )
        cli_logger.newline()

        cli_logger.print("To connect to a terminal on the cluster head for debugging:")
        cli_logger.print(cf.bold(f" ray attach {printable_config_file}{modifiers}"))
        cli_logger.newline()

        cli_logger.print("To monitor autoscaling:")
        cli_logger.print(
            cf.bold(
                f" ray exec {printable_config_file}{modifiers} {quote(monitor_str)}"
            )
        )
        cli_logger.newline()
|
| 915 |
+
|
| 916 |
+
def _should_create_new_head(
    head_node_id: Optional[str],
    new_launch_hash: str,
    new_head_node_type: str,
    provider: NodeProvider,
) -> bool:
    """Decides whether a new head node needs to be created.

    A new head is required when any of the following holds:
      (a) there is no existing head node;
      (b) the launch-config hash of the submitted head config differs from
          the hash recorded on the running head;
      (c) the submitted head node-type key differs from the running head's.

    Args:
        head_node_id: id of the existing head node, or None if there is none.
        new_launch_hash: hash of the current user-submitted head config.
        new_head_node_type: current user-submitted head node-type key.
        provider: node provider used to read the running head's tags.

    Returns:
        bool: True if a new Ray head node should be launched, False otherwise.
    """
    if not head_node_id:
        # Nothing is running yet -- a head must be created.
        return True

    # Tags recorded on the currently running head node.
    existing_tags = provider.node_tags(head_node_id)
    running_hash = existing_tags.get(TAG_RAY_LAUNCH_CONFIG)
    running_type = existing_tags.get(TAG_RAY_USER_NODE_TYPE)

    hash_changed = running_hash != new_launch_hash
    type_changed = running_type != new_head_node_type

    if not (hash_changed or type_changed):
        return False

    # Tell the user exactly why the running head is considered stale.
    with cli_logger.group(
        "Currently running head node is out-of-date with cluster configuration"
    ):
        if hash_changed:
            cli_logger.print(
                "Current hash is {}, expected {}",
                cf.bold(running_hash),
                cf.bold(new_launch_hash),
            )
        if type_changed:
            cli_logger.print(
                "Current head node type is {}, expected {}",
                cf.bold(running_type),
                cf.bold(new_head_node_type),
            )

    return True
|
| 974 |
+
|
| 975 |
+
|
| 976 |
+
def _set_up_config_for_head_node(
    config: Dict[str, Any], provider: NodeProvider, no_restart: bool
) -> Tuple[Dict[str, Any], Any]:
    """Prepares the autoscaling config (and, if configured, the ssh key) to
    be mounted onto the Ray head node for use by the autoscaler.

    Returns:
        The modified config and the temporary file object holding the
        rewritten config that will be mounted onto the head node.
    """
    # The head node updates the workers itself, so it gets its own copy of
    # the config with the auth section rewritten for that purpose.
    head_config = copy.deepcopy(config)

    # A proxy is only needed from the user's machine; from the head node it
    # would prevent connecting to the workers, so drop it.
    head_config["auth"].pop("ssh_proxy_command", None)

    # "head_node" is technically not a valid config field, but it may have
    # been introduced after validation (see _bootstrap_config() call to
    # provider_cls.bootstrap_config(config)). The head node never launches
    # another head node, so it doesn't need these defaults.
    head_config.pop("head_node", None)

    uses_ssh_key = "ssh_private_key" in config["auth"]
    if uses_ssh_key:
        remote_key_path = "~/ray_bootstrap_key.pem"
        head_config["auth"]["ssh_private_key"] = remote_key_path

    # On the head node every file mount already lives at its remote path.
    head_config["file_mounts"] = {path: path for path in config["file_mounts"]}
    head_config["no_restart"] = no_restart

    head_config = provider.prepare_for_head_node(head_config)

    # Serialize the rewritten config into a temp file and arrange for it
    # (plus the ssh key, when present) to be mounted onto the head node.
    remote_config_file = tempfile.NamedTemporaryFile("w", prefix="ray-bootstrap-")
    remote_config_file.write(json.dumps(head_config))
    remote_config_file.flush()
    config["file_mounts"].update(
        {"~/ray_bootstrap_config.yaml": remote_config_file.name}
    )

    if uses_ssh_key:
        config["file_mounts"].update(
            {remote_key_path: config["auth"]["ssh_private_key"]}
        )

    return config, remote_config_file
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
def attach_cluster(
    config_file: str,
    start: bool,
    use_screen: bool,
    use_tmux: bool,
    override_cluster_name: Optional[str],
    no_config_cache: bool = False,
    new: bool = False,
    port_forward: Optional[Port_forward] = None,
) -> None:
    """Attaches to a screen/tmux session (or plain shell) on the cluster head.

    Arguments:
        config_file: path to the cluster yaml
        start: whether to start the cluster if it isn't up
        use_screen: whether to use screen as multiplexer
        use_tmux: whether to use tmux as multiplexer
        override_cluster_name: set the name of the cluster
        no_config_cache: whether to bypass the cached bootstrapped config
        new: whether to force a new screen/tmux session
        port_forward ( (int,int) or list[(int,int)] ): port(s) to forward
    """
    # `--new` is only meaningful when a multiplexer is requested.
    if new and not (use_tmux or use_screen):
        raise ValueError("--new only makes sense if passing --screen or --tmux")

    if use_tmux:
        cmd = "tmux new" if new else "tmux attach || tmux new"
    elif use_screen:
        cmd = "screen -L" if new else "screen -L -xRR"
    else:
        cmd = "$SHELL"

    exec_cluster(
        config_file,
        cmd=cmd,
        run_env="auto",
        screen=False,
        tmux=False,
        stop=False,
        start=start,
        override_cluster_name=override_cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward,
        # Attaching is a debugging aid, so tolerate a not-up-to-date head.
        _allow_uninitialized_state=True,
    )
|
| 1081 |
+
|
| 1082 |
+
|
| 1083 |
+
def exec_cluster(
    config_file: str,
    *,
    cmd: Optional[str] = None,
    run_env: str = "auto",
    screen: bool = False,
    tmux: bool = False,
    stop: bool = False,
    start: bool = False,
    override_cluster_name: Optional[str] = None,
    no_config_cache: bool = False,
    port_forward: Optional[Port_forward] = None,
    with_output: bool = False,
    _allow_uninitialized_state: bool = False,
    extra_screen_args: Optional[str] = None,
) -> str:
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        extra_screen_args: optional custom additional args to screen command
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        no_config_cache: whether to bypass the cached bootstrapped config
        port_forward ( (int, int) or list[(int, int)] ): port(s) to forward
        with_output: whether to capture and return the command's output
        _allow_uninitialized_state: whether to execute on an uninitialized head
            node.

    Returns:
        The result of running the command via the head node's command runner
        (see `_exec`).
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # Find (or, when `start` is set, create) a head node to run on.
    head_node = _get_running_head_node(
        config,
        config_file,
        override_cluster_name,
        create_if_needed=start,
        _allow_uninitialized_state=_allow_uninitialized_state,
    )

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    # The updater is used purely as a command runner here: no init/setup
    # commands and no file syncing, hence the empty lists and empty hashes.
    updater = NodeUpdaterThread(
        node_id=head_node,
        provider_config=config["provider"],
        provider=provider,
        auth_config=config["auth"],
        cluster_name=config["cluster_name"],
        file_mounts=config["file_mounts"],
        initialization_commands=[],
        setup_commands=[],
        ray_start_commands=[],
        runtime_hash="",
        file_mounts_contents_hash="",
        is_head_node=True,
        rsync_options={
            "rsync_exclude": config.get("rsync_exclude"),
            "rsync_filter": config.get("rsync_filter"),
        },
        docker_config=config.get("docker"),
    )
    if cmd and stop:
        # Append a teardown sequence: stop Ray, terminate the workers, then
        # power off the head node itself.
        cmd = "; ".join(
            [
                cmd,
                "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only",
                "sudo shutdown -h now",
            ]
        )

    result = _exec(
        updater,
        cmd,
        screen,
        tmux,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        # Shutdown (if requested) is already folded into `cmd` above.
        shutdown_after_run=False,
        extra_screen_args=extra_screen_args,
    )
    if tmux or screen:
        # The command runs detached; show the user how to re-attach to it.
        attach_command_parts = ["ray attach", config_file]
        if override_cluster_name is not None:
            attach_command_parts.append(
                "--cluster-name={}".format(override_cluster_name)
            )
        if tmux:
            attach_command_parts.append("--tmux")
        elif screen:
            attach_command_parts.append("--screen")

        attach_command = " ".join(attach_command_parts)
        cli_logger.print("Run `{}` to check command status.", cf.bold(attach_command))
    return result
|
| 1191 |
+
|
| 1192 |
+
|
| 1193 |
+
def _exec(
    updater: NodeUpdaterThread,
    cmd: Optional[str] = None,
    screen: bool = False,
    tmux: bool = False,
    port_forward: Optional[Port_forward] = None,
    with_output: bool = False,
    run_env: str = "auto",
    shutdown_after_run: bool = False,
    extra_screen_args: Optional[str] = None,
) -> str:
    """Runs `cmd` on the updater's node, optionally detached in screen/tmux.

    When `screen` or `tmux` is set, the command is wrapped so it runs in a
    detached session that stays alive (via `exec bash`) after it finishes.
    Returns whatever the node's command runner returns.
    """
    if cmd and (screen or tmux):
        # Keep the session alive after the command completes.
        session_cmd = quote(cmd + "; exec bash")
        if screen:
            parts = ["screen", "-L", "-dm"]
            if extra_screen_args:
                parts.append(extra_screen_args)
            parts += ["bash", "-c", session_cmd]
        else:
            # TODO: Consider providing named session functionality
            parts = ["tmux", "new", "-d", "bash", "-c", session_cmd]
        cmd = " ".join(parts)

    return updater.cmd_runner.run(
        cmd,
        exit_on_fail=True,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        shutdown_after_run=shutdown_after_run,
    )
|
| 1240 |
+
|
| 1241 |
+
|
| 1242 |
+
def rsync(
    config_file: str,
    source: Optional[str],
    target: Optional[str],
    override_cluster_name: Optional[str],
    down: bool,
    ip_address: Optional[str] = None,
    use_internal_ip: bool = False,
    no_config_cache: bool = False,
    all_nodes: bool = False,
    should_bootstrap: bool = True,
    _runner: ModuleType = subprocess,
) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address: Address of node. Raise Exception
            if both ip_address and 'all_nodes' are provided.
        use_internal_ip: Whether the provided ip_address is
            public or private.
        no_config_cache: whether to bypass the cached bootstrapped config
        all_nodes: whether to sync worker nodes in addition to the head node
        should_bootstrap: whether to bootstrap cluster config before syncing
        _runner: [for testing] process-runner module used to spawn rsync
    """
    # Source and target must be given together (or both omitted, meaning
    # "sync the configured file mounts").
    if bool(source) != bool(target):
        cli_logger.abort("Expected either both a source and a target, or neither.")

    assert bool(source) == bool(
        target
    ), "Must either provide both or neither source and target."

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    if should_bootstrap:
        config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # Does the relevant remote path fall under a configured file mount?
    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        # Sync a single node, either an explicit source/target pair or the
        # full set of configured file mounts.
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            use_internal_ip=use_internal_ip,
            process_runner=_runner,
            file_mounts_contents_hash="",
            is_head_node=is_head_node,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter"),
            },
            docker_config=config.get("docker"),
        )
        # NOTE: this local name shadows the enclosing `rsync` function.
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            if cli_logger.verbosity > 0:
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)
            rsync(source, target, is_file_mount)
        else:
            updater.sync_file_mounts(rsync)

    nodes = []
    head_node = _get_running_head_node(
        config, config_file, override_cluster_name, create_if_needed=False
    )
    if ip_address:
        nodes = [provider.get_node_id(ip_address, use_internal_ip=use_internal_ip)]
    else:
        nodes = [head_node]
        if all_nodes:
            nodes.extend(_get_worker_nodes(config, override_cluster_name))

    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=(node_id == head_node))
|
| 1344 |
+
|
| 1345 |
+
|
| 1346 |
+
def get_head_node_ip(
    config_file: str, override_cluster_name: Optional[str] = None
) -> str:
    """Returns head node IP for given configuration file if exists."""

    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_node = _get_running_head_node(config, config_file, override_cluster_name)

    # Report the internal IP only when the cluster uses internal IPs and
    # no external head IP has been explicitly requested.
    provider_cfg = config.get("provider", {})
    use_internal = provider_cfg.get("use_internal_ips", False) and not provider_cfg.get(
        "use_external_head_ip", False
    )
    return (
        provider.internal_ip(head_node)
        if use_internal
        else provider.external_ip(head_node)
    )
|
| 1368 |
+
|
| 1369 |
+
|
| 1370 |
+
def get_worker_node_ips(
    config_file: str, override_cluster_name: Optional[str] = None
) -> List[str]:
    """Returns worker node IPs for given configuration file."""

    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    workers = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    # Pick the IP resolver that matches the cluster's networking mode.
    resolve_ip = (
        provider.internal_ip
        if config.get("provider", {}).get("use_internal_ips", False)
        else provider.external_ip
    )
    return [resolve_ip(node) for node in workers]
|
| 1386 |
+
|
| 1387 |
+
|
| 1388 |
+
def _get_worker_nodes(
    config: Dict[str, Any], override_cluster_name: Optional[str]
) -> List[str]:
    """Returns the ids of all non-terminated worker nodes.

    NOTE: mutates `config["cluster_name"]` in place when an override is given.
    """
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    worker_filter = {TAG_RAY_NODE_KIND: NODE_KIND_WORKER}
    return provider.non_terminated_nodes(worker_filter)
|
| 1398 |
+
|
| 1399 |
+
|
| 1400 |
+
def _get_running_head_node(
    config: Dict[str, Any],
    printable_config_file: str,
    override_cluster_name: Optional[str],
    create_if_needed: bool = False,
    _provider: Optional[NodeProvider] = None,
    _allow_uninitialized_state: bool = False,
) -> str:
    """Get a valid, running head node.
    Args:
        config (Dict[str, Any]): Cluster Config dictionary
        printable_config_file: Used for printing formatted CLI commands.
        override_cluster_name: Passed to `get_or_create_head_node` to
            override the cluster name present in `config`.
        create_if_needed: Create a head node if one is not present.
        _provider: [For testing], a Node Provider to use.
        _allow_uninitialized_state: Whether to return a head node that
            is not 'UP TO DATE'. This is used to allow `ray attach` and
            `ray exec` to debug a cluster in a bad state.

    Returns:
        The node id of a head node (up-to-date when one exists; otherwise a
        not-up-to-date node only when `_allow_uninitialized_state` is set).

    Raises:
        RuntimeError: if no acceptable head node is found.
    """
    provider = _provider or _get_node_provider(
        config["provider"], config["cluster_name"]
    )
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    # Prefer an up-to-date head; remember any other head as a fallback.
    head_node = None
    _backup_head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            _backup_head_node = node
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name,
        )
        # NOTE: `_allow_uninitialized_state` is forced to False if
        # `create_if_needed` is set to True. This is to ensure that the
        # commands executed after creation occur on an actually running
        # cluster.
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False,
            _allow_uninitialized_state=False,
        )
    else:
        if _allow_uninitialized_state and _backup_head_node is not None:
            # Debugging escape hatch: hand back a stale head with a warning.
            cli_logger.warning(
                f"The head node being returned: {_backup_head_node} is not "
                "`up-to-date`. If you are not debugging a startup issue "
                "it is recommended to restart this head node with: {}",
                cf.bold(f" ray down {printable_config_file}"),
            )

            return _backup_head_node
        raise RuntimeError(
            "Head node of cluster ({}) not found!".format(config["cluster_name"])
        )
|
| 1473 |
+
|
| 1474 |
+
|
| 1475 |
+
def get_local_dump_archive(
    stream: bool = False,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Collects local Ray debug data into an archive.

    Streams the archive to stdout (and returns None) when `stream` is set;
    otherwise moves it to `output` (or the CWD) and returns that path.
    NOTE(review): the `tempfile` parameter shadows the stdlib module name --
    kept for interface compatibility.
    """
    if stream and output:
        raise ValueError(
            "You can only use either `--output` or `--stream`, but not both."
        )

    collect_params = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        get_all_local_data(archive, collect_params)

    archive_path = archive.file

    if stream:
        # Dump the archive bytes to stdout (fd 1) and clean up.
        with open(archive_path, "rb") as fp:
            os.write(1, fp.read())
        os.remove(archive_path)
        return None

    dest = output or os.path.join(os.getcwd(), os.path.basename(archive_path))
    shutil.move(archive_path, dest)
    cli_logger.print(f"Created local data archive at {dest}")
    return dest
|
| 1514 |
+
|
| 1515 |
+
|
| 1516 |
+
def get_cluster_dump_archive(
    cluster_config_file: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
    local: Optional[bool] = None,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Collects debug data from cluster nodes into a local .tar.gz archive.

    Returns the path of the created archive, or None when no nodes could be
    resolved from the given parameters.
    """
    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += (
            " - The logfiles of your Ray session\n"
            " This usually includes Python outputs (stdout/stderr)\n"
        )

    if debug_state:
        content_str += (
            " - Debug state information on your Ray cluster \n"
            " e.g. number of workers, drivers, objects, etc.\n"
        )

    if pip:
        content_str += " - Your installed Python packages (`pip freeze`)\n"

    if processes:
        content_str += (
            " - Information on your running Ray processes\n"
            " This includes command line arguments\n"
        )

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone."
    )

    # Parse arguments (e.g. fetch info from cluster config)
    (
        cluster_config_file,
        hosts,
        ssh_user,
        ssh_key,
        docker,
        cluster_name,
    ) = _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker)

    nodes = [
        Node(host=h, ssh_user=ssh_user, ssh_key=ssh_key, docker_container=docker)
        for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            "No nodes found. Specify with `--host` or by passing a ray "
            "cluster config to `--cluster`."
        )
        return None

    # With a cluster config, the first resolved host is the head node.
    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )

    # Default output name: timestamped, prefixed with the cluster name when
    # one is known.
    if not output:
        if cluster_name:
            filename = (
                f"{cluster_name}_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        else:
            filename = (
                f"collected_logs_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    shutil.move(archive.file, output)
    return output
|
| 1628 |
+
|
| 1629 |
+
|
| 1630 |
+
def confirm(msg: str, yes: bool) -> Optional[bool]:
    """Prompts the user to confirm `msg` unless `yes` pre-approves it.

    Returns None when `yes` is set (no prompt); otherwise delegates to
    click.confirm with abort=True.
    """
    if yes:
        return None
    return click.confirm(msg, abort=True)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
from ray._private.ray_constants import ( # noqa F401
|
| 5 |
+
AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
|
| 6 |
+
DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
|
| 7 |
+
LABELS_ENVIRONMENT_VARIABLE,
|
| 8 |
+
LOGGER_FORMAT,
|
| 9 |
+
RESOURCES_ENVIRONMENT_VARIABLE,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def env_integer(key, default):
    """Read an integer configuration value from environment variable *key*.

    Returns *default* when the variable is unset. The special string
    "inf" maps to ``sys.maxsize``; any other value is parsed with ``int``.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    return sys.maxsize if raw == "inf" else int(raw)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Whether autoscaler cluster status logging is enabled. Set to 0 disable.
AUTOSCALER_STATUS_LOG = env_integer("RAY_ENABLE_CLUSTER_STATUS_LOG", 1)

# The name of the environment variable for plugging in a utilization scorer.
AUTOSCALER_UTILIZATION_SCORER_KEY = "RAY_AUTOSCALER_UTILIZATION_SCORER"

# Whether to avoid launching GPU nodes for CPU only tasks.
AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)

# How long to wait for a node to start and terminate, in seconds.
AUTOSCALER_NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
AUTOSCALER_NODE_TERMINATE_WAIT_S = env_integer("AUTOSCALER_NODE_TERMINATE_WAIT_S", 900)

# Interval at which to check if node SSH became available.
AUTOSCALER_NODE_SSH_INTERVAL_S = env_integer("AUTOSCALER_NODE_SSH_INTERVAL_S", 5)

# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)

# The maximum number of nodes to launch in a single request.
# Multiple requests may be made for this batch size, up to
# the limit of AUTOSCALER_MAX_CONCURRENT_LAUNCHES.
AUTOSCALER_MAX_LAUNCH_BATCH = env_integer("AUTOSCALER_MAX_LAUNCH_BATCH", 5)

# Max number of nodes to launch at a time.
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
    "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10
)

# Default upscaling speed for the autoscaler. This specifies how many nodes
# to request at a time, where the desired number to upscale is
# min(1, upscaling_speed * current_num_nodes)
# e.g. 1.0 means to request enough nodes to double
# the cluster size in each round of requests.
# When the upscaling speed is 0.0, the autoscaler will request 1 node.
DEFAULT_UPSCALING_SPEED = 0.0

# Interval at which to perform autoscaling updates.
AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)

# The autoscaler will attempt to restart Ray on nodes it hasn't heard from
# in more than this interval.
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30)
# The maximum number of nodes (including failed nodes) that the autoscaler will
# track for logging purposes.
AUTOSCALER_MAX_NODES_TRACKED = 1500

# Cap on the number of node failures included in status output, to keep
# logs readable.
AUTOSCALER_MAX_FAILURES_DISPLAYED = 20

# Node availability records older than this (seconds) are considered stale.
AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S = env_integer(
    "AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S", 30 * 60
)

# Whether to include per-node status details in autoscaler reports.
AUTOSCALER_REPORT_PER_NODE_STATUS = (
    env_integer("AUTOSCALER_REPORT_PER_NODE_STATUS", 1) == 1
)

# The maximum allowed resource demand vector size to guarantee the resource
# demand scheduler bin packing algorithm takes a reasonable amount of time
# to run.
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000

# Port that autoscaler prometheus metrics will be exported to
AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217)

# Max number of retries to AWS (default is 5, time increases exponentially)
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
# Max number of retries to create an EC2 node (retry different subnet)
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)

# ray home path in the container image
RAY_HOME = "/home/ray"

# The order of this list matters! `scripts.py` kills the ray processes in order of this
# list. Think twice when you add to this list.
# Invariants:
# RAYLET must be the first in the list.
# GCS SERVER must be the last in the list.
RAY_PROCESSES = [
    # The first element is the substring to filter.
    # The second element, if True, is to filter ps results by command name
    # (only the first 15 characters of the executable name on Linux);
    # if False, is to filter ps results by command with all its arguments.
    # See STANDARD FORMAT SPECIFIERS section of
    # http://man7.org/linux/man-pages/man1/ps.1.html
    # about comm and args. This can help avoid killing non-ray processes.
    # Format:
    # Keyword to filter, filter by command (True)/filter by args (False)
    ["raylet", True],
    ["plasma_store", True],
    ["monitor.py", False],
    ["ray.util.client.server", False],
    ["default_worker.py", False],  # Python worker.
    ["setup_worker.py", False],  # Python environment setup worker.
    # For mac osx, setproctitle doesn't change the process name returned
    # by psutil but only cmdline.
    [
        "ray::",
        sys.platform != "darwin",
    ],  # Python worker. TODO(mehrdadn): Fix for Windows
    ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
    ["log_monitor.py", False],
    ["reporter.py", False],
    [os.path.join("dashboard", "agent.py"), False],
    [os.path.join("dashboard", "dashboard.py"), False],
    [os.path.join("runtime_env", "agent", "main.py"), False],
    ["ray_process_reaper.py", False],
    ["gcs_server", True],
]

# Max Concurrent SSH Calls to stop Docker
MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS", 50)

# Provider-config keys used to toggle autoscaler behaviors.
DISABLE_NODE_UPDATERS_KEY = "disable_node_updaters"
DISABLE_LAUNCH_CONFIG_CHECK_KEY = "disable_launch_config_check"
FOREGROUND_NODE_LAUNCH_KEY = "foreground_node_launch"
WORKER_LIVENESS_CHECK_KEY = "worker_liveness_check"
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Any, Dict
|
| 3 |
+
|
| 4 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 5 |
+
|
| 6 |
+
try: # py3
|
| 7 |
+
from shlex import quote
|
| 8 |
+
except ImportError: # py2
|
| 9 |
+
from pipes import quote
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _check_docker_file_mounts(file_mounts: Dict[str, str]) -> None:
|
| 13 |
+
"""Checks if files are passed as file_mounts. This is a problem for Docker
|
| 14 |
+
based clusters because when a file is bind-mounted in Docker, updates to
|
| 15 |
+
the file on the host do not always propagate to the container. Using
|
| 16 |
+
directories is recommended.
|
| 17 |
+
"""
|
| 18 |
+
for remote, local in file_mounts.items():
|
| 19 |
+
if Path(local).is_file():
|
| 20 |
+
cli_logger.warning(
|
| 21 |
+
f"File Mount: ({remote}:{local}) refers to a file.\n To ensure"
|
| 22 |
+
" this mount updates properly, please use a directory."
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def validate_docker_config(config: Dict[str, Any]) -> None:
    """Checks whether the Docker configuration is valid.

    A config without a "docker" section is trivially valid. Otherwise, if
    either a container name or an image is given, both must be present
    (an image counts as present when a default image is set, or when both
    head and worker images are set).
    """
    if "docker" not in config:
        return

    _check_docker_file_mounts(config.get("file_mounts", {}))

    docker_section = config["docker"]
    default_image = docker_section.get("image")
    container = docker_section.get("container_name")

    head_image = docker_section.get("head_image", default_image)
    worker_image = docker_section.get("worker_image", default_image)

    image_present = default_image or (head_image and worker_image)
    if container or image_present:
        assert container and image_present, "Must provide a container & image name"

    return None
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def with_docker_exec(
    cmds, container_name, docker_cmd, env_vars=None, with_interactive=False
):
    """Wrap each shell command in *cmds* in a ``docker exec`` invocation.

    Env vars are forwarded as ``-e NAME=$NAME`` flags so the host value is
    expanded by the shell at execution time. Returns one wrapped command
    string per input command.
    """
    assert docker_cmd, "Must provide docker command"
    env_flags = []
    if env_vars:
        env_flags = ["-e {0}=${0}".format(name) for name in env_vars]
    env_str = " ".join(env_flags)

    wrapped = []
    for command in cmds:
        wrapped.append(
            "docker exec {interactive} {env} {container} /bin/bash -c {cmd} ".format(
                interactive="-it" if with_interactive else "",
                env=env_str,
                container=container_name,
                cmd=quote(command),
            )
        )
    return wrapped
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _check_helper(cname, template, docker_cmd):
|
| 68 |
+
return " ".join(
|
| 69 |
+
[docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true"]
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def check_docker_running_cmd(cname, docker_cmd):
|
| 74 |
+
return _check_helper(cname, ".State.Running", docker_cmd)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def check_bind_mounts_cmd(cname, docker_cmd):
|
| 78 |
+
return _check_helper(cname, "json .Mounts", docker_cmd)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def check_docker_image(cname, docker_cmd):
|
| 82 |
+
return _check_helper(cname, ".Config.Image", docker_cmd)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def docker_start_cmds(
    user,
    image,
    mount_dict,
    container_name,
    user_options,
    cluster_name,
    home_directory,
    docker_cmd,
):
    """Build the shell command that starts the Ray docker container.

    Host-side mount sources live under the per-cluster mount prefix; the
    destination paths have ``~/`` rewritten to the container home
    directory. The container runs detached (``-d -it``) on the host
    network and idles in ``bash``.
    """
    # Imported here due to circular dependency.
    from ray.autoscaler.sdk import get_docker_host_mount_location

    host_prefix = get_docker_host_mount_location(cluster_name)
    mounts = {f"{host_prefix}/{dst}": dst for dst in mount_dict}

    mount_flags = " ".join(
        "-v {src}:{dest}".format(
            src=host_path,
            dest=container_path.replace("~/", home_directory + "/"),
        )
        for host_path, container_path in mounts.items()
    )

    # for click, used in ray cli
    env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
    env_flags = " ".join(
        "-e {name}={val}".format(name=name, val=val) for name, val in env_vars.items()
    )

    pieces = [
        docker_cmd,
        "run",
        "--rm",
        "--name {}".format(container_name),
        "-d",
        "-it",
        mount_flags,
        env_flags,
        " ".join(user_options),
        "--net=host",
        image,
        "bash",
    ]
    return " ".join(pieces)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from threading import RLock
|
| 3 |
+
from typing import Any, Callable, Dict, List
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class EventSummarizer:
    """Utility that aggregates related log messages to reduce log spam."""

    def __init__(self):
        # Aggregated quantity keyed by message template.
        self.events_by_key: Dict[str, int] = {}
        # Messages to send in next summary batch.
        self.messages_to_send: List[str] = []
        # Tracks TTL of messages. A message will not be re-sent once it is
        # added here, until its TTL expires.
        self.throttled_messages: Dict[str, float] = {}

        # Guards all state above: the summarizer is shared between the
        # main thread and node launcher child threads.
        self.lock = RLock()

    def add(
        self, template: str, *, quantity: Any, aggregate: Callable[[Any, Any], Any]
    ) -> None:
        """Add a log message, which will be combined by template.

        Args:
            template: Format string with one placeholder for quantity.
            quantity: Quantity to aggregate.
            aggregate: Aggregation function used to combine the
                quantities. The result is inserted into the template to
                produce the final log message.
        """
        with self.lock:
            # Enforce proper sentence structure.
            if not template.endswith("."):
                template = template + "."
            if template not in self.events_by_key:
                self.events_by_key[template] = quantity
            else:
                previous = self.events_by_key[template]
                self.events_by_key[template] = aggregate(previous, quantity)

    def add_once_per_interval(self, message: str, key: str, interval_s: int):
        """Add a log message, which is throttled once per interval by a key.

        Args:
            message: The message to log.
            key: The key to use to deduplicate the message.
            interval_s: Throttling interval in seconds.
        """
        with self.lock:
            if key in self.throttled_messages:
                return
            self.throttled_messages[key] = time.time() + interval_s
            self.messages_to_send.append(message)

    def summary(self) -> List[str]:
        """Generate the aggregated log summary of all added events."""
        with self.lock:
            rendered = [
                template.format(quantity)
                for template, quantity in self.events_by_key.items()
            ]
            return rendered + list(self.messages_to_send)

    def clear(self) -> None:
        """Clear the events added."""
        with self.lock:
            self.events_by_key.clear()
            self.messages_to_send.clear()
            # Expire any messages that have reached their TTL. This allows
            # them to be sent again.
            expired = [
                key
                for key, deadline in self.throttled_messages.items()
                if time.time() > deadline
            ]
            for key in expired:
                del self.throttled_messages[key]
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum, auto
|
| 2 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
| 3 |
+
|
| 4 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class CreateClusterEvent(Enum):
    """Events to track in ray.autoscaler.sdk.create_or_update_cluster.

    Attributes:
        up_started : Invoked at the beginning of create_or_update_cluster.
        ssh_keypair_downloaded : Invoked when the ssh keypair is downloaded.
        cluster_booting_started : Invoked when the cluster booting starts.
        acquiring_new_head_node : Invoked before the head node is acquired.
        head_node_acquired : Invoked after the head node is acquired.
        ssh_control_acquired : Invoked when the node is being updated.
        run_initialization_cmd : Invoked before all initialization
            commands are called and again before each initialization command.
        run_setup_cmd : Invoked before all setup commands are
            called and again before each setup command.
        start_ray_runtime : Invoked before ray start commands are run.
        start_ray_runtime_completed : Invoked after ray start commands
            are run.
        cluster_booting_completed : Invoked after cluster booting
            is completed.
    """

    # NOTE: member values come from auto(); ordering reflects the cluster
    # bring-up sequence.
    up_started = auto()
    ssh_keypair_downloaded = auto()
    cluster_booting_started = auto()
    acquiring_new_head_node = auto()
    head_node_acquired = auto()
    ssh_control_acquired = auto()
    run_initialization_cmd = auto()
    run_setup_cmd = auto()
    start_ray_runtime = auto()
    start_ray_runtime_completed = auto()
    cluster_booting_completed = auto()
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class _EventSystem:
    """Event system that handles storing and calling callbacks for events.

    Attributes:
        callback_map (Dict[str, List[Callable]]) : Stores list of callbacks
            for events when registered.
    """

    def __init__(self):
        self.callback_map = {}

    def add_callback_handler(
        self,
        event: str,
        callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
    ):
        """Stores callback handler for event.

        Args:
            event: Event that callback should be called on. See
                CreateClusterEvent for details on the events available to be
                registered against.
            callback (Callable[[Dict], None]): Callable object that is invoked
                when specified event occurs.
        """
        if event not in CreateClusterEvent.__members__.values():
            cli_logger.warning(
                f"{event} is not currently tracked, and this"
                " callback will not be invoked."
            )

        # Accept either a single callable or a list of callables.
        if type(callback) is list:
            new_callbacks = callback
        else:
            new_callbacks = [callback]
        self.callback_map.setdefault(event, []).extend(new_callbacks)

    def execute_callback(
        self, event: CreateClusterEvent, event_data: Optional[Dict[str, Any]] = None
    ):
        """Executes all callbacks for event.

        Args:
            event: Event that is invoked. See CreateClusterEvent
                for details on the available events.
            event_data (Dict[str, Any]): Argument that is passed to each
                callable object stored for this particular event.
        """
        if event_data is None:
            event_data = {}

        # Every callback receives the event identity alongside its data.
        event_data["event_name"] = event
        for registered_callback in self.callback_map.get(event, []):
            registered_callback(event_data)

    def clear_callbacks_for_event(self, event: str):
        """Clears stored callable objects for event.

        Args:
            event: Event that has callable objects stored in map.
                See CreateClusterEvent for details on the available events.
        """
        self.callback_map.pop(event, None)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
global_event_system = _EventSystem()
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler._private.docker import with_docker_exec
|
| 6 |
+
from ray.autoscaler.command_runner import CommandRunnerInterface
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FakeDockerCommandRunner(CommandRunnerInterface):
    """Command runner for the fake docker multinode cluster.

    This command runner uses ``docker exec`` and ``docker cp`` to
    run commands and copy files, respectively.

    The regular ``DockerCommandRunner`` is made for use in SSH settings
    where Docker runs on a remote host. In contrast, this command runner
    does not wrap the docker commands in ssh calls.
    """

    def __init__(self, docker_config, **common_args):
        self.container_name = docker_config["container_name"]
        self.docker_config = docker_config
        # Home directory inside the container; resolved lazily by
        # _docker_expand_user on first use.
        self.home_dir = None
        self.initialized = False
        # Optionally use 'podman' instead of 'docker'
        use_podman = docker_config.get("use_podman", False)
        self.docker_cmd = "podman" if use_podman else "docker"

    def _run_shell(self, cmd: str, timeout: int = 120) -> str:
        """Run *cmd* through the local shell and return its stdout as text."""
        return subprocess.check_output(
            cmd, shell=True, timeout=timeout, encoding="utf-8"
        )

    def run(
        self,
        cmd: str = None,
        timeout: int = 120,
        exit_on_fail: bool = False,
        port_forward: List[Tuple[int, int]] = None,
        with_output: bool = False,
        environment_variables: Dict[str, object] = None,
        run_env: str = "auto",
        ssh_options_override_ssh_key: str = "",
        shutdown_after_run: bool = False,
    ) -> str:
        """Run *cmd* inside the container via ``docker exec``.

        Most parameters exist to satisfy CommandRunnerInterface and are
        ignored by this local (non-SSH) implementation.
        """
        prefix = with_docker_exec(
            [cmd],
            container_name=self.container_name,
            with_interactive=False,
            docker_cmd=self.docker_cmd,
        )[0]
        return self._run_shell(prefix)

    def run_init(
        self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
    ):
        # Nothing to initialize: containers are managed externally.
        pass

    def remote_shell_command_str(self):
        """Return the command a user can run to get an interactive shell."""
        return "{} exec -it {} bash".format(self.docker_cmd, self.container_name)

    def run_rsync_down(self, source, target, options=None):
        """Copy *source* from the container down to the local *target*."""
        docker_dir = os.path.dirname(self._docker_expand_user(source))

        # Fix: use the configured runtime (docker or podman) rather than a
        # hardcoded `docker`, matching the rest of this class.
        self._run_shell(
            f"{self.docker_cmd} cp {self.container_name}:{docker_dir} {target}"
        )

    def run_rsync_up(self, source, target, options=None):
        """Copy local *source* up into the container at *target*."""
        docker_dir = os.path.dirname(self._docker_expand_user(target))
        self.run(cmd=f"mkdir -p {docker_dir}")

        # Fix: use the configured runtime (docker or podman) rather than a
        # hardcoded `docker`, matching the rest of this class.
        self._run_shell(
            f"{self.docker_cmd} cp {source} {self.container_name}:{docker_dir}"
        )

    def _docker_expand_user(self, string, any_char=False):
        """Expand ``~`` in *string* to the container user's home directory.

        With ``any_char=True``, every ``~/`` occurrence is replaced;
        otherwise only a leading ``~`` is expanded. The home directory is
        looked up once inside the container and cached.
        """
        user_pos = string.find("~")
        if user_pos > -1:
            if self.home_dir is None:
                self.home_dir = self._run_shell(
                    with_docker_exec(
                        ["printenv HOME"],
                        container_name=self.container_name,
                        docker_cmd=self.docker_cmd,
                    )
                ).strip()

            if any_char:
                return string.replace("~/", self.home_dir + "/")

            elif not any_char and user_pos == 0:
                return string.replace("~", self.home_dir, 1)

        return string
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fake multinode docker monitoring script.
|
| 2 |
+
|
| 3 |
+
This script is the "docker compose server" for the fake_multinode
|
| 4 |
+
provider using Docker compose. It should be started before running
|
| 5 |
+
`RAY_FAKE_CLUSTER=1 ray up <cluster_config>`.
|
| 6 |
+
|
| 7 |
+
This script reads the volume directory from a supplied fake multinode
|
| 8 |
+
docker cluster config file.
|
| 9 |
+
It then waits until a docker-compose.yaml file is created in the same
|
| 10 |
+
directory, which is done by the `ray up` command.
|
| 11 |
+
|
| 12 |
+
It then watches for changes in the docker-compose.yaml file and runs
|
| 13 |
+
`docker compose up` whenever changes are detected. This will start docker
|
| 14 |
+
containers as requested by the autoscaler.
|
| 15 |
+
|
| 16 |
+
Generally, the docker-compose.yaml will be mounted in the head node of the
|
| 17 |
+
cluster, which will then continue to change it according to the autoscaler
|
| 18 |
+
requirements.
|
| 19 |
+
|
| 20 |
+
Additionally, this script monitors the docker container status using
|
| 21 |
+
`docker status` and writes it into a `status.json`. This information is
|
| 22 |
+
again used by the autoscaler to determine if any nodes have died.
|
| 23 |
+
"""
|
| 24 |
+
import argparse
|
| 25 |
+
import json
|
| 26 |
+
import os
|
| 27 |
+
import shutil
|
| 28 |
+
import subprocess
|
| 29 |
+
import time
|
| 30 |
+
from typing import Any, Dict, List, Optional
|
| 31 |
+
|
| 32 |
+
import yaml
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _read_yaml(path: str):
    """Parse the YAML file at *path* and return its content."""
    with open(path, "rt") as stream:
        return yaml.safe_load(stream)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _update_docker_compose(
    docker_compose_path: str, project_name: str, status: Optional[Dict[str, Any]]
) -> bool:
    """Apply the current docker-compose config by running `docker compose`.

    Runs `up -d` (with `--no-recreate` when containers already exist per
    *status*), or `down` when the config lists no services. Subprocess
    failures are printed and otherwise ignored (best-effort).

    Returns:
        True when the cluster was shut down (`down` was chosen), else False.
    """
    docker_compose_config = _read_yaml(docker_compose_path)

    if not docker_compose_config:
        print("Docker compose currently empty")
        return False

    cmd = ["up", "-d"]
    if status and len(status) > 0:
        # Containers already running: don't recreate them on config change.
        cmd += ["--no-recreate"]

    shutdown = False
    if not docker_compose_config["services"]:
        # If no more nodes, run `down` instead of `up`
        print("Shutting down nodes")
        cmd = ["down"]
        shutdown = True
    try:
        subprocess.check_call(
            ["docker", "compose", "-f", docker_compose_path, "-p", project_name]
            + cmd
            + [
                "--remove-orphans",
            ]
        )
    except Exception as e:
        print(f"Ran into error when updating docker compose: {e}")
        # Ignore error

    return shutdown
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _get_ip(
    project_name: str,
    container_name: str,
    override_network: Optional[str] = None,
    retry_times: int = 3,
) -> Optional[str]:
    """Look up a container's IP on the compose network via `docker inspect`.

    Retries up to *retry_times* times with a 1 second pause between
    attempts (the container may not be fully up yet).

    Returns:
        The IP address string, or None if all attempts failed.
    """
    network = override_network or f"{project_name}_ray_local"

    cmd = [
        "docker",
        "inspect",
        "-f",
        '"{{ .NetworkSettings.Networks' f".{network}.IPAddress" ' }}"',
        f"{container_name}",
    ]
    for i in range(retry_times):
        try:
            ip_address = subprocess.check_output(cmd, encoding="utf-8")
        except Exception:
            time.sleep(1)
        else:
            # Strip whitespace and the quoting added by the inspect format.
            return ip_address.strip().strip('"').strip('\\"')
    return None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _update_docker_status(
    docker_compose_path: str, project_name: str, docker_status_path: str
):
    """Collect per-container status via `docker compose ps` and persist it.

    Parses the line-delimited JSON output, annotates each running
    container with its network IP, writes the mapping (service name ->
    container record) to *docker_status_path*, and returns it.

    Returns:
        The status dict, or None if fetching/parsing failed.
    """
    data_str = ""
    try:
        data_str = (
            subprocess.check_output(
                [
                    "docker",
                    "compose",
                    "-f",
                    docker_compose_path,
                    "-p",
                    project_name,
                    "ps",
                    "--format",
                    "json",
                ]
            )
            .decode("utf-8")
            .strip()
            .split("\n")
        )
        # `ps --format json` emits one JSON object per line.
        data: List[Dict[str, str]] = []
        for line in data_str:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    except Exception as e:
        print(f"Ran into error when fetching status: {e}")
        print(f"docker compose ps output: {data_str}")
        return None

    status = {}
    for container in data:
        node_id = container["Service"]
        container_name = container["Name"]
        if container["State"] == "running":
            ip = _get_ip(project_name, container_name)
        else:
            # Stopped containers have no reachable IP.
            ip = ""
        container["IP"] = ip
        status[node_id] = container

    # Persist so the autoscaler (inside the head container) can read it.
    with open(docker_status_path, "wt") as f:
        json.dump(status, f)

    return status
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def monitor_docker(
    docker_compose_path: str,
    status_path: str,
    project_name: str,
    update_interval: float = 1.0,
):
    """Main monitoring loop for the fake multinode docker cluster.

    Blocks until the compose config file exists (created by ``ray up``),
    then loops: applies cluster updates whenever the compose config
    changes, and refreshes the docker status file every
    ``update_interval`` seconds. Exits once the cluster update reports
    shutdown.
    """
    while not os.path.exists(docker_compose_path):
        # Wait until cluster is created
        time.sleep(0.5)

    print("Docker compose config detected, starting status monitoring")

    # Make sure this is always writeable from inside the containers
    os.chmod(docker_compose_path, 0o777)

    # Sentinel value that never equals a real config, so the first loop
    # iteration always triggers an update.
    docker_config = {"force_update": True}

    # Force update
    next_update = time.monotonic() - 1.0
    shutdown = False
    status = None

    # Loop:
    # If the config changed, update cluster.
    # Every `update_interval` seconds, update docker status.
    while not shutdown:
        new_docker_config = _read_yaml(docker_compose_path)
        if new_docker_config != docker_config:
            # Update cluster; returns True once the cluster was torn down.
            shutdown = _update_docker_compose(docker_compose_path, project_name, status)

            # Force status update
            next_update = time.monotonic() - 1.0

        if time.monotonic() > next_update:
            # Update docker status
            status = _update_docker_status(
                docker_compose_path, project_name, status_path
            )
            next_update = time.monotonic() + update_interval

        docker_config = new_docker_config
        time.sleep(0.1)

    print("Cluster shut down, terminating monitoring script.")
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def start_monitor(config_file: str):
    """Prepare the shared volume directory and run the monitor loop.

    Validates that ``config_file`` uses the ``fake_multinode_docker``
    provider, copies it into the shared volume as the bootstrap config,
    resets stale state files from previous runs, and then blocks in
    ``monitor_docker()``.
    """
    cluster_config = _read_yaml(config_file)

    provider_config = cluster_config["provider"]
    assert provider_config["type"] == "fake_multinode_docker", (
        f"The docker monitor only works with providers of type "
        f"`fake_multinode_docker`, got `{provider_config['type']}`"
    )

    project_name = provider_config["project_name"]

    volume_dir = provider_config["shared_volume_dir"]
    os.makedirs(volume_dir, mode=0o755, exist_ok=True)

    # Create bootstrap config
    bootstrap_config_path = os.path.join(volume_dir, "bootstrap_config.yaml")
    shutil.copy(config_file, bootstrap_config_path)

    # These two files usually don't exist, yet
    docker_compose_config_path = os.path.join(volume_dir, "docker-compose.yaml")

    docker_status_path = os.path.join(volume_dir, "status.json")

    if os.path.exists(docker_compose_config_path):
        # We wait until this file exists, so remove it if it exists
        # from a previous run.
        os.remove(docker_compose_config_path)

    if os.path.exists(docker_status_path):
        os.remove(docker_status_path)
    # Create empty file so it can be mounted
    with open(docker_status_path, "wt") as f:
        f.write("{}")

    print(
        f"Starting monitor process. Please start Ray cluster with:\n"
        f"    RAY_FAKE_CLUSTER=1 ray up {config_file}"
    )
    monitor_docker(docker_compose_config_path, docker_status_path, project_name)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
if __name__ == "__main__":
    # CLI entrypoint: ``python docker_monitor.py <cluster_config.yaml>``.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "config_file",
        help="Path to cluster config file containing a fake docker "
        "cluster configuration.",
    )
    args = parser.parse_args()

    start_monitor(args.config_file)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import random
|
| 5 |
+
import shutil
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import tempfile
|
| 9 |
+
import threading
|
| 10 |
+
import time
|
| 11 |
+
from typing import Any, Dict, Optional
|
| 12 |
+
|
| 13 |
+
import yaml
|
| 14 |
+
|
| 15 |
+
import ray
|
| 16 |
+
from ray._private.dict import deep_update
|
| 17 |
+
from ray.autoscaler._private.fake_multi_node.node_provider import (
|
| 18 |
+
FAKE_DOCKER_DEFAULT_CLIENT_PORT,
|
| 19 |
+
FAKE_DOCKER_DEFAULT_GCS_PORT,
|
| 20 |
+
)
|
| 21 |
+
from ray.util.queue import Empty, Queue
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
DEFAULT_DOCKER_IMAGE = "rayproject/ray:nightly-py{major}{minor}-cpu"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class ResourcesNotReadyError(RuntimeError):
    """Raised when the cluster's Ray resources are not yet available."""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        # Base YAML config that ships next to this module; test-supplied
        # partial configs are deep-merged on top of it in update_config().
        self._base_config_file = os.path.join(
            os.path.dirname(__file__), "example_docker.yaml"
        )
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(
            os.path.dirname(__file__), "docker_monitor.py"
        )
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        """Path to the rendered cluster YAML (set by setup())."""
        return self._config_file

    @property
    def cluster_config(self):
        """The fully rendered cluster config dict."""
        return self._cluster_config

    @property
    def cluster_dir(self):
        """Temporary shared-volume directory for this cluster."""
        return self._tempdir

    @property
    def gcs_port(self):
        """Host GCS port, falling back to the provider default."""
        return self._cluster_config.get("provider", {}).get(
            "host_gcs_port", FAKE_DOCKER_DEFAULT_GCS_PORT
        )

    @property
    def client_port(self):
        """Host Ray client port, falling back to the provider default."""
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT
        )

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.
        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                # Cluster not fully up yet; retry until the deadline.
                time.sleep(1)
                continue
            else:
                break

        # Final sanity check: if we never connected, this raises.
        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            # Poll the queue with a short timeout so the thread stays
            # responsive to the stop event set in stop().
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue

                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()

        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available.

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        Raises:
            ResourcesNotReadyError: If the resources did not become
                available within the timeout.
        """
        timeout = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > timeout:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}"
                )
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current
                config with.
        """
        assert self._tempdir, "Call setup() first"

        config = config or {}

        if config:
            self._partial_config = config

        if not config.get("provider", {}).get("image"):
            # No image specified, trying to parse from buildkite
            docker_image = os.environ.get("RAY_DOCKER_IMAGE", None)

            if not docker_image:
                # If still no docker image, use one according to Python version
                mj = sys.version_info.major
                mi = sys.version_info.minor

                docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi)

            self._docker_image = docker_image

        with open(self._base_config_file, "rt") as f:
            cluster_config = yaml.safe_load(f)

        if self._partial_config:
            deep_update(cluster_config, self._partial_config, new_keys_allowed=True)

        if self._docker_image:
            cluster_config["provider"]["image"] = self._docker_image

        cluster_config["provider"]["shared_volume_dir"] = self._tempdir

        self._cluster_config = cluster_config

        with open(self._config_file, "wt") as f:
            yaml.safe_dump(self._cluster_config, f)

        logging.info(f"Updated cluster config to: {self._cluster_config}")

    def maybe_pull_image(self):
        """Pull the configured docker image if it is not available locally."""
        if self._docker_image:
            try:
                images_str = subprocess.check_output(
                    f"docker image inspect {self._docker_image}", shell=True
                )
                images = json.loads(images_str)
            except Exception as e:
                logger.error(f"Error inspecting image {self._docker_image}: {e}")
                return

            if not images:
                try:
                    subprocess.check_call(
                        f"docker pull {self._docker_image}", shell=True
                    )
                except Exception as e:
                    logger.error(f"Error pulling image {self._docker_image}: {e}")

    def setup(self):
        """Setup docker compose cluster environment.

        Creates the temporary directory, writes the initial config file,
        and pulls the docker image, if required.
        """
        self._tempdir = tempfile.mkdtemp(dir=os.environ.get("RAY_TEMPDIR", None))
        # World-writable so containers can write into the shared volume.
        os.chmod(self._tempdir, 0o777)
        self._config_file = os.path.join(self._tempdir, "cluster.yaml")
        self._nodes_file = os.path.join(self._tempdir, "nodes.json")
        self._status_file = os.path.join(self._tempdir, "status.json")
        self.update_config()
        self.maybe_pull_image()

    def teardown(self, keep_dir: bool = False):
        """Tear down docker compose cluster environment.

        Args:
            keep_dir: If True, cluster directory
                will not be removed after termination.
        """
        if not keep_dir:
            shutil.rmtree(self._tempdir)
        self._tempdir = None
        self._config_file = None

    def _start_monitor(self):
        # Launch docker_monitor.py as a subprocess; it creates/refreshes
        # the status file the node provider reads.
        self._monitor_process = subprocess.Popen(
            [sys.executable, self._monitor_script, self.config_file]
        )
        time.sleep(2)

    def _stop_monitor(self):
        if self._monitor_process:
            # FIX: Popen.wait() raises TimeoutExpired on timeout rather
            # than returning, so the original's post-wait poll()/terminate()
            # branch was unreachable and a hung monitor process crashed the
            # caller instead of being terminated.
            try:
                self._monitor_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                self._monitor_process.terminate()
            self._monitor_process = None

    def start(self):
        """Start docker compose cluster.

        Starts the monitor process and runs ``ray up``.
        """
        self._start_monitor()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True
        )

    def stop(self):
        """Stop docker compose cluster.

        Runs ``ray down`` and stops the monitor process.
        """
        # FIX: the original tested the function object `ray.is_initialized`
        # (always truthy) instead of calling it, so shutdown() ran even when
        # Ray was never initialized.
        if ray.is_initialized():
            ray.shutdown()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True
        )

        self._stop_monitor()
        self._execution_event.set()

    def _update_nodes(self):
        # Refresh the node map from the shared nodes file.
        with open(self._nodes_file, "rt") as f:
            self._nodes = json.load(f)

    def _update_status(self):
        # Refresh the container status map from the shared status file.
        with open(self._status_file, "rt") as f:
            self._status = json.load(f)

    def _get_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ) -> str:
        """Resolve a node id from exactly one of the three selectors."""
        self._update_nodes()
        if node_id:
            assert (
                not num and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
        elif num:
            assert (
                not node_id and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
            # Fake node ids are a fixed prefix plus a zero-padded number.
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (
                not node_id and not num
            ), "Only provide either `node_id`, `num`, or `random`."
            assert rand in [
                "worker",
                "any",
            ], "`random` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                # Exclude the head node (node number 0).
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000"
                )
            # Else: any
            node_id = random.choice(choices)

        assert node_id in self._nodes, f"Node with ID {node_id} is not in active nodes."
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        """Return the docker container name for a node, if known."""
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None

        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct node_id from this number, and kill
        that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a random
        node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_call(f"docker kill {container}", shell=True)
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
class RemoteAPI:
    """Remote API to control cluster state from within cluster tasks.

    This API uses a Ray queue to interact with an execution thread on the
    host machine that will execute commands passed to the queue.

    Instances of this class can be serialized and passed to Ray remote actors
    to interact with cluster state (but they can also be used outside actors).

    The API subset is limited to specific commands.

    Args:
        queue: Ray queue to push command instructions to.
    """

    def __init__(self, queue: Queue):
        self._queue = queue

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Request that the host kill a node (see DockerCluster.kill_node)."""
        # Commands are (name, kwargs) tuples consumed by the host-side
        # execution thread.
        payload = dict(node_id=node_id, num=num, rand=rand)
        self._queue.put(("kill_node", payload))
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS_LEGACY
|
| 4 |
+
from ray.experimental.internal_kv import _internal_kv_initialized, _internal_kv_put
|
| 5 |
+
|
| 6 |
+
"""This file provides legacy support for the old info string in order to
|
| 7 |
+
ensure the dashboard's `api/cluster_status` does not break backwards
|
| 8 |
+
compatibilty.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def legacy_log_info_string(autoscaler, nodes):
    """Build the legacy cluster status string, store it, and log it.

    Concatenates the node-count summary, the load metrics info string and
    the resource demand scheduler debug string; writes the result to the
    internal KV store (consumed by the dashboard's legacy status endpoint)
    when the KV store is initialized, and logs it at debug level.
    """
    tmp = "Cluster status: "
    tmp += info_string(autoscaler, nodes)
    tmp += "\n"
    tmp += autoscaler.load_metrics.info_string()
    tmp += "\n"
    tmp += autoscaler.resource_demand_scheduler.debug_string(
        nodes,
        autoscaler.pending_launches.breakdown(),
        autoscaler.load_metrics.get_resource_utilization(),
    )
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS_LEGACY, tmp, overwrite=True)
    logger.debug(tmp)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def info_string(autoscaler, nodes):
    """Summarize node count plus in-progress and failed update counts."""
    details = []
    if autoscaler.updaters:
        details.append("({} updating)".format(len(autoscaler.updaters)))
    if autoscaler.num_failed_updates:
        details.append(
            "({} failed to update)".format(len(autoscaler.num_failed_updates))
        )
    suffix = "".join(" " + part for part in details)
    return "{} nodes{}".format(len(nodes), suffix)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
from collections import Counter
|
| 4 |
+
from functools import reduce
|
| 5 |
+
from typing import Dict, List
|
| 6 |
+
|
| 7 |
+
from ray._private.gcs_utils import PlacementGroupTableData
|
| 8 |
+
from ray.autoscaler._private.constants import (
|
| 9 |
+
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
|
| 10 |
+
AUTOSCALER_REPORT_PER_NODE_STATUS,
|
| 11 |
+
)
|
| 12 |
+
from ray.autoscaler._private.util import (
|
| 13 |
+
DictCount,
|
| 14 |
+
LoadMetricsSummary,
|
| 15 |
+
NodeIP,
|
| 16 |
+
ResourceDict,
|
| 17 |
+
)
|
| 18 |
+
from ray.core.generated.common_pb2 import PlacementStrategy
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def add_resources(dict1: Dict[str, float], dict2: Dict[str, float]) -> Dict[str, float]:
    """Add the values in two dictionaries.

    Returns:
        dict: A new dictionary (inputs remain unmodified).
    """
    merged = dict(dict1)
    for key, amount in dict2.items():
        merged[key] = merged.get(key, 0) + amount
    return merged
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def freq_of_dicts(dicts: List[Dict], serializer=None, deserializer=dict) -> "DictCount":
    """Count a list of dictionaries (or unhashable types).

    This is somewhat annoying because mutable data structures aren't hashable,
    and set/dict keys must be hashable.

    Args:
        dicts (List[D]): A list of dictionaries to be counted.
        serializer (D -> S): A custom serialization function. The output type S
            must be hashable. The default serializer converts a dictionary into
            a frozenset of KV pairs.
        deserializer (S -> U): A custom deserialization function. See the
            serializer for information about type S. For dictionaries U := D.

    Returns:
        List[Tuple[U, int]]: Returns a list of tuples. Each entry in the list
            is a tuple containing a unique entry from `dicts` and its
            corresponding frequency count.
    """
    if serializer is None:
        # Default: a dict becomes a hashable frozenset of its KV pairs.
        def serializer(d):
            return frozenset(d.items())

    counts = Counter(map(serializer, dicts))
    # Counter preserves first-insertion order, matching input order.
    return [(deserializer(key), n) for key, n in counts.items()]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class LoadMetrics:
|
| 65 |
+
"""Container for cluster load metrics.
|
| 66 |
+
|
| 67 |
+
Metrics here are updated from raylet heartbeats. The autoscaler
|
| 68 |
+
queries these metrics to determine when to scale up, and which nodes
|
| 69 |
+
can be removed.
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
def __init__(self):
    # Per-node-IP bookkeeping, refreshed by update() from resource
    # messages (see prune_active_ips for how stale IPs are removed).
    self.last_heartbeat_time_by_ip = {}
    self.static_resources_by_ip = {}
    self.dynamic_resources_by_ip = {}
    self.raylet_id_by_ip = {}
    # Cluster-wide demand state; replaced wholesale on each update().
    self.waiting_bundles = []
    self.infeasible_bundles = []
    self.pending_placement_groups = []
    self.resource_requests = []
    self.cluster_full_of_actors_detected = False
    # ip -> timestamp of when the node was last considered "used"
    # (back-dated by the node's reported idle duration).
    self.ray_nodes_last_used_time_by_ip = {}
|
| 83 |
+
|
| 84 |
+
def __bool__(self):
    """A load metrics instance is Falsey iff the autoscaler process
    has not received a resource message from the GCS.
    """
    # raylet_id_by_ip only gains entries in update(), so a non-empty
    # mapping implies at least one resource message arrived.
    return len(self.raylet_id_by_ip) > 0
|
| 89 |
+
|
| 90 |
+
def update(
    self,
    ip: str,
    raylet_id: bytes,
    static_resources: Dict[str, Dict],
    dynamic_resources: Dict[str, Dict],
    node_idle_duration_s: float,
    waiting_bundles: List[Dict[str, float]] = None,
    infeasible_bundles: List[Dict[str, float]] = None,
    pending_placement_groups: List[PlacementGroupTableData] = None,
    cluster_full_of_actors_detected: bool = False,
):
    """Record a resource/load report for the node at *ip*.

    Args:
        ip: IP of the node the report belongs to.
        raylet_id: ID of the raylet on that node.
        static_resources: Total resource capacities of the node.
        dynamic_resources: Currently available resources of the node.
        node_idle_duration_s: How long the node has been idle; used to
            back-date the node's "last used" timestamp.
        waiting_bundles: Resource demands waiting to be scheduled.
        infeasible_bundles: Resource demands that cannot be fulfilled.
        pending_placement_groups: Placement groups awaiting scheduling.
        cluster_full_of_actors_detected: Whether the cluster is detected
            to be full of actors.
    """
    self.static_resources_by_ip[ip] = static_resources
    self.raylet_id_by_ip[ip] = raylet_id
    self.cluster_full_of_actors_detected = cluster_full_of_actors_detected

    # Defaults are None (not []) to avoid shared mutable defaults;
    # normalize here.
    if not waiting_bundles:
        waiting_bundles = []
    if not infeasible_bundles:
        infeasible_bundles = []
    if not pending_placement_groups:
        pending_placement_groups = []

    # We are not guaranteed to have a corresponding dynamic resource
    # for every static resource because dynamic resources are based on
    # the available resources in the heartbeat, which does not exist
    # if it is zero. Thus, we have to update dynamic resources here.
    dynamic_resources_update = dynamic_resources.copy()
    for resource_name, capacity in self.static_resources_by_ip[ip].items():
        if resource_name not in dynamic_resources_update:
            dynamic_resources_update[resource_name] = 0.0
    self.dynamic_resources_by_ip[ip] = dynamic_resources_update

    now = time.time()
    # A node idle for d seconds was last used at (now - d).
    self.ray_nodes_last_used_time_by_ip[ip] = now - node_idle_duration_s
    self.last_heartbeat_time_by_ip[ip] = now
    self.waiting_bundles = waiting_bundles
    self.infeasible_bundles = infeasible_bundles
    self.pending_placement_groups = pending_placement_groups
|
| 129 |
+
|
| 130 |
+
def mark_active(self, ip):
    """Record a heartbeat for *ip* as of now (e.g. right after setup)."""
    assert ip is not None, "IP should be known at this time"
    logger.debug(f"Node {ip} is newly setup, treating as active")
    self.last_heartbeat_time_by_ip[ip] = time.time()
|
| 134 |
+
|
| 135 |
+
def is_active(self, ip):
    """Whether a heartbeat has been recorded for *ip*."""
    heartbeats = self.last_heartbeat_time_by_ip
    return ip in heartbeats
|
| 137 |
+
|
| 138 |
+
def prune_active_ips(self, active_ips: List[str]):
    """The Raylet ips stored by LoadMetrics are obtained by polling
    the GCS in Monitor.update_load_metrics().

    On the other hand, the autoscaler gets a list of node ips from
    its NodeProvider.

    This method removes from LoadMetrics the ips unknown to the autoscaler.

    Args:
        active_ips (List[str]): The node ips known to the autoscaler.
    """
    active_ips = set(active_ips)

    def prune(mapping, should_log):
        # Drop every key not in active_ips; optionally log removals.
        unwanted_ips = set(mapping) - active_ips
        for unwanted_ip in unwanted_ips:
            if should_log:
                logger.info("LoadMetrics: " f"Removed ip: {unwanted_ip}.")
            del mapping[unwanted_ip]
        if unwanted_ips and should_log:
            logger.info(
                "LoadMetrics: "
                "Removed {} stale ip mappings: {} not in {}".format(
                    len(unwanted_ips), unwanted_ips, active_ips
                )
            )
        # Sanity check: no stale ip survived the deletion.
        assert not (unwanted_ips & set(mapping))

    # Only the first prune logs, to avoid repeating the same removals
    # for every per-ip mapping.
    prune(self.ray_nodes_last_used_time_by_ip, should_log=True)
    prune(self.static_resources_by_ip, should_log=False)
    prune(self.raylet_id_by_ip, should_log=False)
    prune(self.dynamic_resources_by_ip, should_log=False)
    prune(self.last_heartbeat_time_by_ip, should_log=False)
|
| 172 |
+
|
| 173 |
+
def get_node_resources(self):
|
| 174 |
+
"""Return a list of node resources (static resource sizes).
|
| 175 |
+
|
| 176 |
+
Example:
|
| 177 |
+
>>> from ray.autoscaler._private.load_metrics import LoadMetrics
|
| 178 |
+
>>> metrics = LoadMetrics(...) # doctest: +SKIP
|
| 179 |
+
>>> metrics.get_node_resources() # doctest: +SKIP
|
| 180 |
+
[{"CPU": 1}, {"CPU": 4, "GPU": 8}] # for two different nodes
|
| 181 |
+
"""
|
| 182 |
+
return self.static_resources_by_ip.values()
|
| 183 |
+
|
| 184 |
+
def get_static_node_resources_by_ip(self) -> Dict[NodeIP, ResourceDict]:
|
| 185 |
+
"""Return a dict of node resources for every node ip.
|
| 186 |
+
|
| 187 |
+
Example:
|
| 188 |
+
>>> from ray.autoscaler._private.load_metrics import LoadMetrics
|
| 189 |
+
>>> metrics = LoadMetrics(...) # doctest: +SKIP
|
| 190 |
+
>>> metrics.get_static_node_resources_by_ip() # doctest: +SKIP
|
| 191 |
+
{127.0.0.1: {"CPU": 1}, 127.0.0.2: {"CPU": 4, "GPU": 8}}
|
| 192 |
+
"""
|
| 193 |
+
return self.static_resources_by_ip
|
| 194 |
+
|
| 195 |
+
def get_resource_utilization(self):
|
| 196 |
+
return self.dynamic_resources_by_ip
|
| 197 |
+
|
| 198 |
+
def _get_resource_usage(self):
|
| 199 |
+
resources_used = {}
|
| 200 |
+
resources_total = {}
|
| 201 |
+
for ip, max_resources in self.static_resources_by_ip.items():
|
| 202 |
+
avail_resources = self.dynamic_resources_by_ip[ip]
|
| 203 |
+
for resource_id, amount in max_resources.items():
|
| 204 |
+
used = amount - avail_resources[resource_id]
|
| 205 |
+
if resource_id not in resources_used:
|
| 206 |
+
resources_used[resource_id] = 0.0
|
| 207 |
+
resources_total[resource_id] = 0.0
|
| 208 |
+
resources_used[resource_id] += used
|
| 209 |
+
resources_total[resource_id] += amount
|
| 210 |
+
used = max(0, used)
|
| 211 |
+
|
| 212 |
+
return resources_used, resources_total
|
| 213 |
+
|
| 214 |
+
def get_resource_demand_vector(self, clip=True):
|
| 215 |
+
if clip:
|
| 216 |
+
# Bound the total number of bundles to
|
| 217 |
+
# 2xMAX_RESOURCE_DEMAND_VECTOR_SIZE. This guarantees the resource
|
| 218 |
+
# demand scheduler bin packing algorithm takes a reasonable amount
|
| 219 |
+
# of time to run.
|
| 220 |
+
return (
|
| 221 |
+
self.waiting_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
|
| 222 |
+
+ self.infeasible_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
|
| 223 |
+
)
|
| 224 |
+
else:
|
| 225 |
+
return self.waiting_bundles + self.infeasible_bundles
|
| 226 |
+
|
| 227 |
+
def get_resource_requests(self):
|
| 228 |
+
return self.resource_requests
|
| 229 |
+
|
| 230 |
+
def get_pending_placement_groups(self):
|
| 231 |
+
return self.pending_placement_groups
|
| 232 |
+
|
| 233 |
+
def resources_avail_summary(self) -> str:
|
| 234 |
+
"""Return a concise string of cluster size to report to event logs.
|
| 235 |
+
|
| 236 |
+
For example, "3 CPUs, 4 GPUs".
|
| 237 |
+
"""
|
| 238 |
+
total_resources = (
|
| 239 |
+
reduce(add_resources, self.static_resources_by_ip.values())
|
| 240 |
+
if self.static_resources_by_ip
|
| 241 |
+
else {}
|
| 242 |
+
)
|
| 243 |
+
out = "{} CPUs".format(int(total_resources.get("CPU", 0)))
|
| 244 |
+
if "GPU" in total_resources:
|
| 245 |
+
out += ", {} GPUs".format(int(total_resources["GPU"]))
|
| 246 |
+
if "TPU" in total_resources:
|
| 247 |
+
out += ", {} TPUs".format(int(total_resources["TPU"]))
|
| 248 |
+
return out
|
| 249 |
+
|
| 250 |
+
def summary(self):
|
| 251 |
+
available_resources = (
|
| 252 |
+
reduce(add_resources, self.dynamic_resources_by_ip.values())
|
| 253 |
+
if self.dynamic_resources_by_ip
|
| 254 |
+
else {}
|
| 255 |
+
)
|
| 256 |
+
total_resources = (
|
| 257 |
+
reduce(add_resources, self.static_resources_by_ip.values())
|
| 258 |
+
if self.static_resources_by_ip
|
| 259 |
+
else {}
|
| 260 |
+
)
|
| 261 |
+
usage_dict = {}
|
| 262 |
+
for key in total_resources:
|
| 263 |
+
if key in ["memory", "object_store_memory"]:
|
| 264 |
+
total = total_resources[key]
|
| 265 |
+
available = available_resources[key]
|
| 266 |
+
usage_dict[key] = (total - available, total)
|
| 267 |
+
else:
|
| 268 |
+
total = total_resources[key]
|
| 269 |
+
usage_dict[key] = (total - available_resources[key], total)
|
| 270 |
+
|
| 271 |
+
summarized_demand_vector = freq_of_dicts(
|
| 272 |
+
self.get_resource_demand_vector(clip=False)
|
| 273 |
+
)
|
| 274 |
+
summarized_resource_requests = freq_of_dicts(self.get_resource_requests())
|
| 275 |
+
|
| 276 |
+
def placement_group_serializer(pg):
|
| 277 |
+
bundles = tuple(
|
| 278 |
+
frozenset(bundle.unit_resources.items()) for bundle in pg.bundles
|
| 279 |
+
)
|
| 280 |
+
return (bundles, pg.strategy)
|
| 281 |
+
|
| 282 |
+
def placement_group_deserializer(pg_tuple):
|
| 283 |
+
# We marshal this as a dictionary so that we can easily json.dumps
|
| 284 |
+
# it later.
|
| 285 |
+
# TODO (Alex): Would there be a benefit to properly
|
| 286 |
+
# marshalling this (into a protobuf)?
|
| 287 |
+
bundles = list(map(dict, pg_tuple[0]))
|
| 288 |
+
return {
|
| 289 |
+
"bundles": freq_of_dicts(bundles),
|
| 290 |
+
"strategy": PlacementStrategy.Name(pg_tuple[1]),
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
summarized_placement_groups = freq_of_dicts(
|
| 294 |
+
self.get_pending_placement_groups(),
|
| 295 |
+
serializer=placement_group_serializer,
|
| 296 |
+
deserializer=placement_group_deserializer,
|
| 297 |
+
)
|
| 298 |
+
nodes_summary = freq_of_dicts(self.static_resources_by_ip.values())
|
| 299 |
+
|
| 300 |
+
usage_by_node = None
|
| 301 |
+
if AUTOSCALER_REPORT_PER_NODE_STATUS:
|
| 302 |
+
usage_by_node = {}
|
| 303 |
+
for ip, totals in self.static_resources_by_ip.items():
|
| 304 |
+
available = self.dynamic_resources_by_ip.get(ip, {})
|
| 305 |
+
usage_by_node[ip] = {}
|
| 306 |
+
for resource, total in totals.items():
|
| 307 |
+
usage_by_node[ip][resource] = (
|
| 308 |
+
total - available.get(resource, 0),
|
| 309 |
+
total,
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
return LoadMetricsSummary(
|
| 313 |
+
usage=usage_dict,
|
| 314 |
+
resource_demand=summarized_demand_vector,
|
| 315 |
+
pg_demand=summarized_placement_groups,
|
| 316 |
+
request_demand=summarized_resource_requests,
|
| 317 |
+
node_types=nodes_summary,
|
| 318 |
+
usage_by_node=usage_by_node,
|
| 319 |
+
)
|
| 320 |
+
|
| 321 |
+
def set_resource_requests(self, requested_resources):
|
| 322 |
+
if requested_resources is not None:
|
| 323 |
+
assert isinstance(requested_resources, list), requested_resources
|
| 324 |
+
self.resource_requests = [
|
| 325 |
+
request for request in requested_resources if len(request) > 0
|
| 326 |
+
]
|
| 327 |
+
|
| 328 |
+
def info_string(self):
|
| 329 |
+
return " - " + "\n - ".join(
|
| 330 |
+
["{}: {}".format(k, v) for k, v in sorted(self._info().items())]
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
    def _info(self):
        """Collect debug statistics for info_string().

        Returns a dict of display strings covering aggregate resource
        usage, node idle times, and heartbeat staleness.
        """
        resources_used, resources_total = self._get_resource_usage()

        now = time.time()
        # Seconds since each node last did work / last heartbeated.
        idle_times = [now - t for t in self.ray_nodes_last_used_time_by_ip.values()]
        heartbeat_times = [now - t for t in self.last_heartbeat_time_by_ip.values()]
        # The 5 nodes with the oldest heartbeat timestamps (most stale).
        most_delayed_heartbeats = sorted(
            self.last_heartbeat_time_by_ip.items(), key=lambda pair: pair[1]
        )[:5]
        most_delayed_heartbeats = {ip: (now - t) for ip, t in most_delayed_heartbeats}

        def format_resource(key, value):
            # Byte-denominated resources are shown in GiB; others are
            # simply rounded to 2 decimal places.
            if key in ["object_store_memory", "memory"]:
                return "{} GiB".format(round(value / (1024 * 1024 * 1024), 2))
            else:
                return round(value, 2)

        return {
            # "node:<ip>" pseudo-resources are omitted from the usage line.
            "ResourceUsage": ", ".join(
                [
                    "{}/{} {}".format(
                        format_resource(rid, resources_used[rid]),
                        format_resource(rid, resources_total[rid]),
                        rid,
                    )
                    for rid in sorted(resources_used)
                    if not rid.startswith("node:")
                ]
            ),
            # -1 below is a sentinel meaning "no data" (no nodes reporting).
            "NodeIdleSeconds": "Min={} Mean={} Max={}".format(
                int(min(idle_times)) if idle_times else -1,
                int(float(sum(idle_times)) / len(idle_times)) if idle_times else -1,
                int(max(idle_times)) if idle_times else -1,
            ),
            "TimeSinceLastHeartbeat": "Min={} Mean={} Max={}".format(
                int(min(heartbeat_times)) if heartbeat_times else -1,
                int(float(sum(heartbeat_times)) / len(heartbeat_times))
                if heartbeat_times
                else -1,
                int(max(heartbeat_times)) if heartbeat_times else -1,
            ),
            "MostDelayedHeartbeats": most_delayed_heartbeats,
        }
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def load_function_or_class(path):
    """Load a function or class at runtime given a full path.

    Example of the path: mypkg.mysubpkg.myclass
    """
    module_name, dot, attr_name = path.rpartition(".")
    if not dot:
        # No dot at all: can't split into module + attribute.
        raise ValueError("You need to pass a valid path like mymodule.provider_class")
    target_module = importlib.import_module(module_name)
    return getattr(target_module, attr_name)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (5.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc
ADDED
|
Binary file (5.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc
ADDED
|
Binary file (17.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
+
|
| 5 |
+
from ray._private.utils import get_ray_temp_dir
|
| 6 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 7 |
+
|
| 8 |
+
unsupported_field_message = "The field {} is not supported for on-premise clusters."
|
| 9 |
+
|
| 10 |
+
LOCAL_CLUSTER_NODE_TYPE = "local.cluster.node"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def prepare_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Prepare local cluster config for ingestion by cluster launcher and
    autoscaler.
    """
    config = copy.deepcopy(config)
    # These cloud-oriented fields are rejected for on-premise clusters.
    for unsupported in ("head_node", "worker_nodes", "available_node_types"):
        if config.get(unsupported):
            cli_logger.abort(unsupported_field_message.format(unsupported))
    # We use a config with a single node type for on-prem clusters.
    # Resources internally detected by Ray are not overridden by the autoscaler
    # (see NodeProvider.do_update)
    config["available_node_types"] = {
        LOCAL_CLUSTER_NODE_TYPE: {"node_config": {}, "resources": {}}
    }
    config["head_node_type"] = LOCAL_CLUSTER_NODE_TYPE
    # A coordinator address selects automatic management; otherwise the
    # user must list node ips explicitly (manual mode).
    if "coordinator_address" in config["provider"]:
        return prepare_coordinator(config)
    return prepare_manual(config)
def prepare_coordinator(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize a coordinator-managed on-prem cluster config."""
    config = copy.deepcopy(config)
    # User should explicitly set the max number of workers for the coordinator
    # to allocate.
    if "max_workers" not in config:
        cli_logger.abort(
            "The field `max_workers` is required when using an "
            "automatically managed on-premise cluster."
        )
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # The autoscaler no longer uses global `min_workers`; carry the value
    # (default 0) onto the single node type instead.
    node_type["min_workers"] = config.pop("min_workers", 0)
    node_type["max_workers"] = config["max_workers"]
    return config
def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validates and sets defaults for configs of manually managed on-prem
    clusters.

    - Checks for presence of required `worker_ips` and `head_ips` fields.
    - Defaults min and max workers to the number of `worker_ips`.
    - Caps min and max workers at the number of `worker_ips`.
    - Writes min and max worker info into the single worker node type.
    """
    config = copy.deepcopy(config)
    # Both a head ip and a worker ip list are mandatory in manual mode;
    # cli_logger.abort terminates with an error message.
    if ("worker_ips" not in config["provider"]) or (
        "head_ip" not in config["provider"]
    ):
        cli_logger.abort(
            "Please supply a `head_ip` and list of `worker_ips`. "
            "Alternatively, supply a `coordinator_address`."
        )
    num_ips = len(config["provider"]["worker_ips"])
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # Default to keeping all provided ips in the cluster.
    config.setdefault("max_workers", num_ips)

    # The autoscaler no longer uses global `min_workers`.
    # We will move `min_workers` to the node_type config.
    min_workers = config.pop("min_workers", num_ips)
    max_workers = config["max_workers"]

    # Cap min_workers at the number of physically available worker ips.
    if min_workers > num_ips:
        cli_logger.warning(
            f"The value of `min_workers` supplied ({min_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `min_workers={num_ips}`."
        )
        node_type["min_workers"] = num_ips
    else:
        node_type["min_workers"] = min_workers

    # Likewise cap max_workers (both on the node type and globally).
    if max_workers > num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `max_workers={num_ips}`."
        )
        node_type["max_workers"] = num_ips
        config["max_workers"] = num_ips
    else:
        node_type["max_workers"] = max_workers

        # Only reachable when max_workers <= num_ips: warn that some of the
        # supplied ips will go unused.
        if max_workers < num_ips:
            cli_logger.warning(
                f"The value of `max_workers` supplied ({max_workers}) is less"
                f" than the number of available worker ips ({num_ips})."
                f" At most {max_workers} Ray worker nodes will connect to the cluster."
            )

    return config
def get_lock_path(cluster_name: str) -> str:
    """Path of the inter-process lock file guarding this cluster's state."""
    filename = "cluster-{}.lock".format(cluster_name)
    return os.path.join(get_ray_temp_dir(), filename)
def get_state_path(cluster_name: str) -> str:
    """Path of the JSON file persisting this cluster's node states."""
    filename = "cluster-{}.state".format(cluster_name)
    return os.path.join(get_ray_temp_dir(), filename)
def bootstrap_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """No-op bootstrap hook: local clusters need no extra provider setup."""
    return config
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
from http.client import RemoteDisconnected
|
| 4 |
+
|
| 5 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 6 |
+
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CoordinatorSenderNodeProvider(NodeProvider):
    """NodeProvider for automatically managed private/local clusters.

    The cluster management is handled by a remote coordinating server.
    The server listens on <coordinator_address>, therefore, the address
    should be provided in the provider section in the cluster config.
    The server receives HTTP requests from this class and uses
    LocalNodeProvider to get their responses.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # host:port of the coordinator server all requests are sent to.
        self.coordinator_address = provider_config["coordinator_address"]

    def _get_http_response(self, request):
        """Send `request` (a JSON-serializable dict) to the coordinator
        and return its decoded JSON response.

        Raises on connection failure or if `requests` is not installed.
        """
        headers = {
            "Content-Type": "application/json",
        }
        request_message = json.dumps(request).encode()
        http_coordinator_address = "http://" + self.coordinator_address

        try:
            import requests  # `requests` is not part of stdlib.
            from requests.exceptions import ConnectionError

            # NOTE(review): this is a GET carrying a request body, which is
            # unusual but matches what the coordinator server expects.
            r = requests.get(
                http_coordinator_address,
                data=request_message,
                headers=headers,
                timeout=None,
            )
        except (RemoteDisconnected, ConnectionError):
            logger.exception(
                "Could not connect to: "
                + http_coordinator_address
                + ". Did you run python coordinator_server.py"
                + " --ips <list_of_node_ips> --port <PORT>?"
            )
            raise
        except ImportError:
            logger.exception(
                "Not all Ray Autoscaler dependencies were found. "
                "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
                'only be usable via `pip install "ray[default]"`. Please '
                "update your install command."
            )
            raise

        response = r.json()
        return response

    def non_terminated_nodes(self, tag_filters):
        """Return ids of live nodes in this cluster matching `tag_filters`."""
        # Only get the non terminated nodes associated with this cluster name.
        tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {"type": "non_terminated_nodes", "args": (tag_filters,)}
        return self._get_http_response(request)

    def is_running(self, node_id):
        """Ask the coordinator whether `node_id` is running."""
        request = {"type": "is_running", "args": (node_id,)}
        return self._get_http_response(request)

    def is_terminated(self, node_id):
        """Ask the coordinator whether `node_id` is terminated."""
        request = {"type": "is_terminated", "args": (node_id,)}
        return self._get_http_response(request)

    def node_tags(self, node_id):
        """Fetch the tag dict for `node_id` from the coordinator."""
        request = {"type": "node_tags", "args": (node_id,)}
        return self._get_http_response(request)

    def external_ip(self, node_id):
        """Fetch the externally reachable ip for `node_id`."""
        request = {"type": "external_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def internal_ip(self, node_id):
        """Fetch the cluster-internal ip for `node_id`."""
        request = {"type": "internal_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def create_node(self, node_config, tags, count):
        """Request `count` nodes with the given config and tags."""
        # Tag the newly created node with this cluster name. Helps to get
        # the right nodes when calling non_terminated_nodes.
        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {
            "type": "create_node",
            "args": (node_config, tags, count),
        }
        self._get_http_response(request)

    def set_node_tags(self, node_id, tags):
        """Merge `tags` into `node_id`'s tags on the coordinator."""
        request = {"type": "set_node_tags", "args": (node_id, tags)}
        self._get_http_response(request)

    def terminate_node(self, node_id):
        """Terminate a single node."""
        request = {"type": "terminate_node", "args": (node_id,)}
        self._get_http_response(request)

    def terminate_nodes(self, node_ids):
        """Terminate a batch of nodes in one request."""
        request = {"type": "terminate_nodes", "args": (node_ids,)}
        self._get_http_response(request)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import socket
|
| 5 |
+
from threading import RLock
|
| 6 |
+
|
| 7 |
+
from filelock import FileLock
|
| 8 |
+
|
| 9 |
+
from ray.autoscaler._private.local.config import (
|
| 10 |
+
LOCAL_CLUSTER_NODE_TYPE,
|
| 11 |
+
bootstrap_local,
|
| 12 |
+
get_lock_path,
|
| 13 |
+
get_state_path,
|
| 14 |
+
)
|
| 15 |
+
from ray.autoscaler.node_provider import NodeProvider
|
| 16 |
+
from ray.autoscaler.tags import (
|
| 17 |
+
NODE_KIND_HEAD,
|
| 18 |
+
NODE_KIND_WORKER,
|
| 19 |
+
STATUS_UP_TO_DATE,
|
| 20 |
+
TAG_RAY_NODE_KIND,
|
| 21 |
+
TAG_RAY_NODE_NAME,
|
| 22 |
+
TAG_RAY_NODE_STATUS,
|
| 23 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
filelock_logger = logging.getLogger("filelock")
|
| 29 |
+
filelock_logger.setLevel(logging.WARNING)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class ClusterState:
    """File-backed record of node states for one manually managed cluster.

    The state is a JSON dict mapping node ip -> {"tags": ..., "state": ...},
    persisted at `save_path` and guarded by both an in-process RLock and a
    cross-process FileLock so multiple CLI invocations stay consistent.
    """

    def __init__(self, lock_path, save_path, provider_config):
        self.lock = RLock()
        os.makedirs(os.path.dirname(lock_path), exist_ok=True)
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                if os.path.exists(self.save_path):
                    # NOTE(review): file handle from open() is not closed
                    # explicitly; relies on refcounting.
                    workers = json.loads(open(self.save_path).read())
                    head_config = workers.get(provider_config["head_ip"])
                    # If the configured head ip is missing or no longer
                    # tagged as the head, the saved state is stale: reset.
                    if (
                        not head_config
                        or head_config.get("tags", {}).get(TAG_RAY_NODE_KIND)
                        != NODE_KIND_HEAD
                    ):
                        workers = {}
                        logger.info("Head IP changed - recreating cluster.")
                else:
                    workers = {}
                logger.info(
                    "ClusterState: Loaded cluster state: {}".format(list(workers))
                )
                # Ensure every configured worker ip has an entry, defaulting
                # to "terminated"; existing entries must already be workers.
                for worker_ip in provider_config["worker_ips"]:
                    if worker_ip not in workers:
                        workers[worker_ip] = {
                            "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
                            "state": "terminated",
                        }
                    else:
                        assert (
                            workers[worker_ip]["tags"][TAG_RAY_NODE_KIND]
                            == NODE_KIND_WORKER
                        )
                # Same for the head node entry.
                if provider_config["head_ip"] not in workers:
                    workers[provider_config["head_ip"]] = {
                        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD},
                        "state": "terminated",
                    }
                else:
                    assert (
                        workers[provider_config["head_ip"]]["tags"][TAG_RAY_NODE_KIND]
                        == NODE_KIND_HEAD
                    )
                # Relevant when a user reduces the number of workers
                # without changing the headnode.
                list_of_node_ips = list(provider_config["worker_ips"])
                list_of_node_ips.append(provider_config["head_ip"])
                for worker_ip in list(workers):
                    if worker_ip not in list_of_node_ips:
                        del workers[worker_ip]

                # Set external head ip, if provided by user.
                # Necessary if calling `ray up` from outside the network.
                # Refer to LocalNodeProvider.external_ip function.
                external_head_ip = provider_config.get("external_head_ip")
                if external_head_ip:
                    head = workers[provider_config["head_ip"]]
                    head["external_ip"] = external_head_ip

                # State now covers exactly the configured nodes: all workers
                # plus the head.
                assert len(workers) == len(provider_config["worker_ips"]) + 1
                with open(self.save_path, "w") as f:
                    logger.debug(
                        "ClusterState: Writing cluster state: {}".format(workers)
                    )
                    f.write(json.dumps(workers))

    def get(self):
        """Read and return the full node-state dict under both locks."""
        with self.lock:
            with self.file_lock:
                workers = json.loads(open(self.save_path).read())
                return workers

    def put(self, worker_id, info):
        """Persist `info` (must contain "tags" and "state") for one node."""
        assert "tags" in info
        assert "state" in info
        with self.lock:
            with self.file_lock:
                workers = self.get()
                workers[worker_id] = info
                with open(self.save_path, "w") as f:
                    logger.info(
                        "ClusterState: "
                        "Writing cluster state: {}".format(list(workers))
                    )
                    f.write(json.dumps(workers))
|
| 120 |
+
|
| 121 |
+
class OnPremCoordinatorState(ClusterState):
    """Generates & updates the state file of CoordinatorSenderNodeProvider.

    Unlike ClusterState, which generates a cluster specific file with
    predefined head and worker ips, OnPremCoordinatorState overwrites
    ClusterState's __init__ function to generate and manage a unified
    file of the status of all the nodes for multiple clusters.
    """

    def __init__(self, lock_path, save_path, list_of_node_ips):
        self.lock = RLock()
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                # Load prior state if present; otherwise start empty.
                if os.path.exists(self.save_path):
                    nodes = json.loads(open(self.save_path).read())
                else:
                    nodes = {}
                logger.info(
                    "OnPremCoordinatorState: "
                    "Loaded on prem coordinator state: {}".format(nodes)
                )

                # Filter removed node ips.
                for node_ip in list(nodes):
                    if node_ip not in list_of_node_ips:
                        del nodes[node_ip]

                # Add entries for newly listed ips; unlike ClusterState,
                # tags start empty (kind is assigned at create_node time).
                for node_ip in list_of_node_ips:
                    if node_ip not in nodes:
                        nodes[node_ip] = {
                            "tags": {},
                            "state": "terminated",
                        }
                # State now covers exactly the listed node ips.
                assert len(nodes) == len(list_of_node_ips)
                with open(self.save_path, "w") as f:
                    logger.info(
                        "OnPremCoordinatorState: "
                        "Writing on prem coordinator state: {}".format(nodes)
                    )
                    f.write(json.dumps(nodes))
|
| 166 |
+
class LocalNodeProvider(NodeProvider):
|
| 167 |
+
"""NodeProvider for private/local clusters.
|
| 168 |
+
|
| 169 |
+
`node_id` is overloaded to also be `node_ip` in this class.
|
| 170 |
+
|
| 171 |
+
When `cluster_name` is provided, it manages a single cluster in a cluster
|
| 172 |
+
specific state file. But when `cluster_name` is None, it manages multiple
|
| 173 |
+
clusters in a unified state file that requires each node to be tagged with
|
| 174 |
+
TAG_RAY_CLUSTER_NAME in create and non_terminated_nodes function calls to
|
| 175 |
+
associate each node with the right cluster.
|
| 176 |
+
|
| 177 |
+
The current use case of managing multiple clusters is by
|
| 178 |
+
OnPremCoordinatorServer which receives node provider HTTP requests
|
| 179 |
+
from CoordinatorSenderNodeProvider and uses LocalNodeProvider to get
|
| 180 |
+
the responses.
|
| 181 |
+
"""
|
| 182 |
+
|
| 183 |
+
    def __init__(self, provider_config, cluster_name):
        """Select the state backend based on whether a cluster name is given.

        With a `cluster_name`, a cluster-specific ClusterState file is used
        (manual mode). Without one, this provider is being driven by the
        on-prem coordinator server and shares a unified state file across
        clusters.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)

        if cluster_name:
            lock_path = get_lock_path(cluster_name)
            state_path = get_state_path(cluster_name)
            self.state = ClusterState(
                lock_path,
                state_path,
                provider_config,
            )
            self.use_coordinator = False
        else:
            # LocalNodeProvider with a coordinator server.
            self.state = OnPremCoordinatorState(
                "/tmp/coordinator.lock",
                "/tmp/coordinator.state",
                provider_config["list_of_node_ips"],
            )
            self.use_coordinator = True
def non_terminated_nodes(self, tag_filters):
|
| 205 |
+
workers = self.state.get()
|
| 206 |
+
matching_ips = []
|
| 207 |
+
for worker_ip, info in workers.items():
|
| 208 |
+
if info["state"] == "terminated":
|
| 209 |
+
continue
|
| 210 |
+
ok = True
|
| 211 |
+
for k, v in tag_filters.items():
|
| 212 |
+
if info["tags"].get(k) != v:
|
| 213 |
+
ok = False
|
| 214 |
+
break
|
| 215 |
+
if ok:
|
| 216 |
+
matching_ips.append(worker_ip)
|
| 217 |
+
return matching_ips
|
| 218 |
+
|
| 219 |
+
def is_running(self, node_id):
|
| 220 |
+
return self.state.get()[node_id]["state"] == "running"
|
| 221 |
+
|
| 222 |
+
def is_terminated(self, node_id):
|
| 223 |
+
return not self.is_running(node_id)
|
| 224 |
+
|
| 225 |
+
def node_tags(self, node_id):
|
| 226 |
+
return self.state.get()[node_id]["tags"]
|
| 227 |
+
|
| 228 |
+
def external_ip(self, node_id):
|
| 229 |
+
"""Returns an external ip if the user has supplied one.
|
| 230 |
+
Otherwise, use the same logic as internal_ip below.
|
| 231 |
+
|
| 232 |
+
This can be used to call ray up from outside the network, for example
|
| 233 |
+
if the Ray cluster exists in an AWS VPC and we're interacting with
|
| 234 |
+
the cluster from a laptop (where using an internal_ip will not work).
|
| 235 |
+
|
| 236 |
+
Useful for debugging the local node provider with cloud VMs."""
|
| 237 |
+
|
| 238 |
+
node_state = self.state.get()[node_id]
|
| 239 |
+
ext_ip = node_state.get("external_ip")
|
| 240 |
+
if ext_ip:
|
| 241 |
+
return ext_ip
|
| 242 |
+
else:
|
| 243 |
+
return socket.gethostbyname(node_id)
|
| 244 |
+
|
| 245 |
+
def internal_ip(self, node_id):
|
| 246 |
+
return socket.gethostbyname(node_id)
|
| 247 |
+
|
| 248 |
+
def set_node_tags(self, node_id, tags):
|
| 249 |
+
with self.state.file_lock:
|
| 250 |
+
info = self.state.get()[node_id]
|
| 251 |
+
info["tags"].update(tags)
|
| 252 |
+
self.state.put(node_id, info)
|
| 253 |
+
|
| 254 |
+
def create_node(self, node_config, tags, count):
|
| 255 |
+
"""Creates min(count, currently available) nodes."""
|
| 256 |
+
node_type = tags[TAG_RAY_NODE_KIND]
|
| 257 |
+
with self.state.file_lock:
|
| 258 |
+
workers = self.state.get()
|
| 259 |
+
for node_id, info in workers.items():
|
| 260 |
+
if info["state"] == "terminated" and (
|
| 261 |
+
self.use_coordinator or info["tags"][TAG_RAY_NODE_KIND] == node_type
|
| 262 |
+
):
|
| 263 |
+
info["tags"] = tags
|
| 264 |
+
info["state"] = "running"
|
| 265 |
+
self.state.put(node_id, info)
|
| 266 |
+
count = count - 1
|
| 267 |
+
if count == 0:
|
| 268 |
+
return
|
| 269 |
+
|
| 270 |
+
def terminate_node(self, node_id):
|
| 271 |
+
workers = self.state.get()
|
| 272 |
+
info = workers[node_id]
|
| 273 |
+
info["state"] = "terminated"
|
| 274 |
+
self.state.put(node_id, info)
|
| 275 |
+
|
| 276 |
+
@staticmethod
|
| 277 |
+
def bootstrap_config(cluster_config):
|
| 278 |
+
return bootstrap_local(cluster_config)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def record_local_head_state_if_needed(local_provider: LocalNodeProvider) -> None:
|
| 282 |
+
"""This function is called on the Ray head from StandardAutoscaler.reset
|
| 283 |
+
to record the head node's own existence in the cluster state file.
|
| 284 |
+
|
| 285 |
+
This is necessary because `provider.create_node` in
|
| 286 |
+
`commands.get_or_create_head_node` records the head state on the
|
| 287 |
+
cluster-launching machine but not on the head.
|
| 288 |
+
"""
|
| 289 |
+
head_ip = local_provider.provider_config["head_ip"]
|
| 290 |
+
cluster_name = local_provider.cluster_name
|
| 291 |
+
# If the head node is not marked as created in the cluster state file,
|
| 292 |
+
if head_ip not in local_provider.non_terminated_nodes({}):
|
| 293 |
+
# These tags are based on the ones in commands.get_or_create_head_node;
|
| 294 |
+
# keep in sync.
|
| 295 |
+
head_tags = {
|
| 296 |
+
TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
|
| 297 |
+
TAG_RAY_USER_NODE_TYPE: LOCAL_CLUSTER_NODE_TYPE,
|
| 298 |
+
TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name),
|
| 299 |
+
TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
|
| 300 |
+
}
|
| 301 |
+
# Mark the head node as created in the cluster state file.
|
| 302 |
+
local_provider.create_node(node_config={}, tags=head_tags, count=1)
|
| 303 |
+
|
| 304 |
+
assert head_ip in local_provider.non_terminated_nodes({})
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class LogTimer:
|
| 10 |
+
def __init__(self, message, show_status=False):
|
| 11 |
+
self._message = message
|
| 12 |
+
self._show_status = show_status
|
| 13 |
+
|
| 14 |
+
def __enter__(self):
|
| 15 |
+
self._start_time = datetime.datetime.utcnow()
|
| 16 |
+
|
| 17 |
+
def __exit__(self, *error_vals):
|
| 18 |
+
if cli_logger.log_style != "record":
|
| 19 |
+
return
|
| 20 |
+
|
| 21 |
+
td = datetime.datetime.utcnow() - self._start_time
|
| 22 |
+
status = ""
|
| 23 |
+
if self._show_status:
|
| 24 |
+
status = "failed" if any(error_vals) else "succeeded"
|
| 25 |
+
cli_logger.print(
|
| 26 |
+
" ".join(
|
| 27 |
+
[
|
| 28 |
+
self._message,
|
| 29 |
+
status,
|
| 30 |
+
"[LogTimer={:.0f}ms]".format(td.total_seconds() * 1000),
|
| 31 |
+
]
|
| 32 |
+
)
|
| 33 |
+
)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py
ADDED
|
@@ -0,0 +1,719 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Autoscaler monitoring loop daemon."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import signal
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
import traceback
|
| 11 |
+
from collections import Counter
|
| 12 |
+
from dataclasses import asdict
|
| 13 |
+
from typing import Any, Callable, Dict, Optional, Union
|
| 14 |
+
|
| 15 |
+
import ray
|
| 16 |
+
import ray._private.ray_constants as ray_constants
|
| 17 |
+
import ray._private.utils
|
| 18 |
+
from ray._private.event.event_logger import get_event_logger
|
| 19 |
+
from ray._private.ray_logging import setup_component_logger
|
| 20 |
+
from ray._raylet import GcsClient
|
| 21 |
+
from ray.autoscaler._private.autoscaler import StandardAutoscaler
|
| 22 |
+
from ray.autoscaler._private.commands import teardown_cluster
|
| 23 |
+
from ray.autoscaler._private.constants import (
|
| 24 |
+
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
|
| 25 |
+
AUTOSCALER_METRIC_PORT,
|
| 26 |
+
AUTOSCALER_UPDATE_INTERVAL_S,
|
| 27 |
+
DISABLE_LAUNCH_CONFIG_CHECK_KEY,
|
| 28 |
+
)
|
| 29 |
+
from ray.autoscaler._private.event_summarizer import EventSummarizer
|
| 30 |
+
from ray.autoscaler._private.load_metrics import LoadMetrics
|
| 31 |
+
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
|
| 32 |
+
from ray.autoscaler._private.util import format_readonly_node_type
|
| 33 |
+
from ray.autoscaler.v2.sdk import get_cluster_resource_state
|
| 34 |
+
from ray.core.generated import gcs_pb2
|
| 35 |
+
from ray.core.generated.event_pb2 import Event as RayEvent
|
| 36 |
+
from ray.experimental.internal_kv import (
|
| 37 |
+
_initialize_internal_kv,
|
| 38 |
+
_internal_kv_del,
|
| 39 |
+
_internal_kv_get,
|
| 40 |
+
_internal_kv_initialized,
|
| 41 |
+
_internal_kv_put,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
import prometheus_client
|
| 46 |
+
except ImportError:
|
| 47 |
+
prometheus_client = None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
logger = logging.getLogger(__name__)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def parse_resource_demands(resource_load_by_shape):
|
| 54 |
+
"""Handle the message.resource_load_by_shape protobuf for the demand
|
| 55 |
+
based autoscaling. Catch and log all exceptions so this doesn't
|
| 56 |
+
interfere with the utilization based autoscaler until we're confident
|
| 57 |
+
this is stable. Worker queue backlogs are added to the appropriate
|
| 58 |
+
resource demand vector.
|
| 59 |
+
|
| 60 |
+
Args:
|
| 61 |
+
resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
|
| 62 |
+
in protobuf form or None.
|
| 63 |
+
|
| 64 |
+
Returns:
|
| 65 |
+
List[ResourceDict]: Waiting bundles (ready and feasible).
|
| 66 |
+
List[ResourceDict]: Infeasible bundles.
|
| 67 |
+
"""
|
| 68 |
+
waiting_bundles, infeasible_bundles = [], []
|
| 69 |
+
try:
|
| 70 |
+
for resource_demand_pb in list(resource_load_by_shape.resource_demands):
|
| 71 |
+
request_shape = dict(resource_demand_pb.shape)
|
| 72 |
+
for _ in range(resource_demand_pb.num_ready_requests_queued):
|
| 73 |
+
waiting_bundles.append(request_shape)
|
| 74 |
+
for _ in range(resource_demand_pb.num_infeasible_requests_queued):
|
| 75 |
+
infeasible_bundles.append(request_shape)
|
| 76 |
+
|
| 77 |
+
# Infeasible and ready states for tasks are (logically)
|
| 78 |
+
# mutually exclusive.
|
| 79 |
+
if resource_demand_pb.num_infeasible_requests_queued > 0:
|
| 80 |
+
backlog_queue = infeasible_bundles
|
| 81 |
+
else:
|
| 82 |
+
backlog_queue = waiting_bundles
|
| 83 |
+
for _ in range(resource_demand_pb.backlog_size):
|
| 84 |
+
backlog_queue.append(request_shape)
|
| 85 |
+
if (
|
| 86 |
+
len(waiting_bundles + infeasible_bundles)
|
| 87 |
+
> AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
|
| 88 |
+
):
|
| 89 |
+
break
|
| 90 |
+
except Exception:
|
| 91 |
+
logger.exception("Failed to parse resource demands.")
|
| 92 |
+
|
| 93 |
+
return waiting_bundles, infeasible_bundles
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Readonly provider config (e.g., for laptop mode, manually setup clusters).
|
| 97 |
+
BASE_READONLY_CONFIG = {
|
| 98 |
+
"cluster_name": "default",
|
| 99 |
+
"max_workers": 0,
|
| 100 |
+
"upscaling_speed": 1.0,
|
| 101 |
+
"docker": {},
|
| 102 |
+
"idle_timeout_minutes": 0,
|
| 103 |
+
"provider": {
|
| 104 |
+
"type": "readonly",
|
| 105 |
+
"use_node_id_as_ip": True, # For emulated multi-node on laptop.
|
| 106 |
+
DISABLE_LAUNCH_CONFIG_CHECK_KEY: True, # No launch check.
|
| 107 |
+
},
|
| 108 |
+
"auth": {},
|
| 109 |
+
"available_node_types": {
|
| 110 |
+
"ray.head.default": {"resources": {}, "node_config": {}, "max_workers": 0}
|
| 111 |
+
},
|
| 112 |
+
"head_node_type": "ray.head.default",
|
| 113 |
+
"file_mounts": {},
|
| 114 |
+
"cluster_synced_files": [],
|
| 115 |
+
"file_mounts_sync_continuously": False,
|
| 116 |
+
"rsync_exclude": [],
|
| 117 |
+
"rsync_filter": [],
|
| 118 |
+
"initialization_commands": [],
|
| 119 |
+
"setup_commands": [],
|
| 120 |
+
"head_setup_commands": [],
|
| 121 |
+
"worker_setup_commands": [],
|
| 122 |
+
"head_start_ray_commands": [],
|
| 123 |
+
"worker_start_ray_commands": [],
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class Monitor:
|
| 128 |
+
"""Autoscaling monitor.
|
| 129 |
+
|
| 130 |
+
This process periodically collects stats from the GCS and triggers
|
| 131 |
+
autoscaler updates.
|
| 132 |
+
"""
|
| 133 |
+
|
| 134 |
+
def __init__(
|
| 135 |
+
self,
|
| 136 |
+
address: str,
|
| 137 |
+
autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
|
| 138 |
+
log_dir: str = None,
|
| 139 |
+
prefix_cluster_info: bool = False,
|
| 140 |
+
monitor_ip: Optional[str] = None,
|
| 141 |
+
retry_on_failure: bool = True,
|
| 142 |
+
):
|
| 143 |
+
self.gcs_address = address
|
| 144 |
+
worker = ray._private.worker.global_worker
|
| 145 |
+
# TODO: eventually plumb ClusterID through to here
|
| 146 |
+
self.gcs_client = GcsClient(address=self.gcs_address)
|
| 147 |
+
|
| 148 |
+
if monitor_ip:
|
| 149 |
+
monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
|
| 150 |
+
self.gcs_client.internal_kv_put(
|
| 151 |
+
b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
|
| 152 |
+
)
|
| 153 |
+
_initialize_internal_kv(self.gcs_client)
|
| 154 |
+
if monitor_ip:
|
| 155 |
+
monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
|
| 156 |
+
self.gcs_client.internal_kv_put(
|
| 157 |
+
b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
|
| 158 |
+
)
|
| 159 |
+
self._session_name = self.get_session_name(self.gcs_client)
|
| 160 |
+
logger.info(f"session_name: {self._session_name}")
|
| 161 |
+
worker.mode = 0
|
| 162 |
+
head_node_ip = self.gcs_address.split(":")[0]
|
| 163 |
+
|
| 164 |
+
self.load_metrics = LoadMetrics()
|
| 165 |
+
self.last_avail_resources = None
|
| 166 |
+
self.event_summarizer = EventSummarizer()
|
| 167 |
+
self.prefix_cluster_info = prefix_cluster_info
|
| 168 |
+
self.retry_on_failure = retry_on_failure
|
| 169 |
+
self.autoscaling_config = autoscaling_config
|
| 170 |
+
self.autoscaler = None
|
| 171 |
+
# If set, we are in a manually created cluster (non-autoscaling) and
|
| 172 |
+
# simply mirroring what the GCS tells us the cluster node types are.
|
| 173 |
+
self.readonly_config = None
|
| 174 |
+
|
| 175 |
+
if log_dir:
|
| 176 |
+
try:
|
| 177 |
+
self.event_logger = get_event_logger(
|
| 178 |
+
RayEvent.SourceType.AUTOSCALER, log_dir
|
| 179 |
+
)
|
| 180 |
+
except Exception:
|
| 181 |
+
self.event_logger = None
|
| 182 |
+
else:
|
| 183 |
+
self.event_logger = None
|
| 184 |
+
|
| 185 |
+
self.prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
|
| 186 |
+
|
| 187 |
+
if monitor_ip and prometheus_client:
|
| 188 |
+
# If monitor_ip wasn't passed in, then don't attempt to start the
|
| 189 |
+
# metric server to keep behavior identical to before metrics were
|
| 190 |
+
# introduced
|
| 191 |
+
try:
|
| 192 |
+
logger.info(
|
| 193 |
+
"Starting autoscaler metrics server on port {}".format(
|
| 194 |
+
AUTOSCALER_METRIC_PORT
|
| 195 |
+
)
|
| 196 |
+
)
|
| 197 |
+
kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
|
| 198 |
+
prometheus_client.start_http_server(
|
| 199 |
+
port=AUTOSCALER_METRIC_PORT,
|
| 200 |
+
registry=self.prom_metrics.registry,
|
| 201 |
+
**kwargs,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Reset some gauges, since we don't know which labels have
|
| 205 |
+
# leaked if the autoscaler was restarted.
|
| 206 |
+
self.prom_metrics.pending_nodes.clear()
|
| 207 |
+
self.prom_metrics.active_nodes.clear()
|
| 208 |
+
except Exception:
|
| 209 |
+
logger.exception(
|
| 210 |
+
"An exception occurred while starting the metrics server."
|
| 211 |
+
)
|
| 212 |
+
elif not prometheus_client:
|
| 213 |
+
logger.warning(
|
| 214 |
+
"`prometheus_client` not found, so metrics will not be exported."
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
logger.info("Monitor: Started")
|
| 218 |
+
|
| 219 |
+
def _initialize_autoscaler(self):
|
| 220 |
+
if self.autoscaling_config:
|
| 221 |
+
autoscaling_config = self.autoscaling_config
|
| 222 |
+
else:
|
| 223 |
+
# This config mirrors the current setup of the manually created
|
| 224 |
+
# cluster. Each node gets its own unique node type.
|
| 225 |
+
self.readonly_config = BASE_READONLY_CONFIG
|
| 226 |
+
|
| 227 |
+
# Note that the "available_node_types" of the config can change.
|
| 228 |
+
def get_latest_readonly_config():
|
| 229 |
+
return self.readonly_config
|
| 230 |
+
|
| 231 |
+
autoscaling_config = get_latest_readonly_config
|
| 232 |
+
self.autoscaler = StandardAutoscaler(
|
| 233 |
+
autoscaling_config,
|
| 234 |
+
self.load_metrics,
|
| 235 |
+
self.gcs_client,
|
| 236 |
+
self._session_name,
|
| 237 |
+
prefix_cluster_info=self.prefix_cluster_info,
|
| 238 |
+
event_summarizer=self.event_summarizer,
|
| 239 |
+
prom_metrics=self.prom_metrics,
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
def update_load_metrics(self):
|
| 243 |
+
"""Fetches resource usage data from GCS and updates load metrics."""
|
| 244 |
+
|
| 245 |
+
response = self.gcs_client.get_all_resource_usage(timeout=60)
|
| 246 |
+
resources_batch_data = response.resource_usage_data
|
| 247 |
+
log_resource_batch_data_if_desired(resources_batch_data)
|
| 248 |
+
|
| 249 |
+
# This is a workaround to get correct idle_duration_ms
|
| 250 |
+
# from "get_cluster_resource_state"
|
| 251 |
+
# ref: https://github.com/ray-project/ray/pull/48519#issuecomment-2481659346
|
| 252 |
+
cluster_resource_state = get_cluster_resource_state(self.gcs_client)
|
| 253 |
+
ray_node_states = cluster_resource_state.node_states
|
| 254 |
+
ray_nodes_idle_duration_ms_by_id = {
|
| 255 |
+
node.node_id: node.idle_duration_ms for node in ray_node_states
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Tell the readonly node provider what nodes to report.
|
| 259 |
+
if self.readonly_config:
|
| 260 |
+
new_nodes = []
|
| 261 |
+
for msg in list(resources_batch_data.batch):
|
| 262 |
+
node_id = msg.node_id.hex()
|
| 263 |
+
new_nodes.append((node_id, msg.node_manager_address))
|
| 264 |
+
self.autoscaler.provider._set_nodes(new_nodes)
|
| 265 |
+
|
| 266 |
+
mirror_node_types = {}
|
| 267 |
+
cluster_full = False
|
| 268 |
+
if (
|
| 269 |
+
hasattr(response, "cluster_full_of_actors_detected_by_gcs")
|
| 270 |
+
and response.cluster_full_of_actors_detected_by_gcs
|
| 271 |
+
):
|
| 272 |
+
# GCS has detected the cluster full of actors.
|
| 273 |
+
cluster_full = True
|
| 274 |
+
for resource_message in resources_batch_data.batch:
|
| 275 |
+
node_id = resource_message.node_id
|
| 276 |
+
# Generate node type config based on GCS reported node list.
|
| 277 |
+
if self.readonly_config:
|
| 278 |
+
# Keep prefix in sync with ReadonlyNodeProvider.
|
| 279 |
+
node_type = format_readonly_node_type(node_id.hex())
|
| 280 |
+
resources = {}
|
| 281 |
+
for k, v in resource_message.resources_total.items():
|
| 282 |
+
resources[k] = v
|
| 283 |
+
mirror_node_types[node_type] = {
|
| 284 |
+
"resources": resources,
|
| 285 |
+
"node_config": {},
|
| 286 |
+
"max_workers": 1,
|
| 287 |
+
}
|
| 288 |
+
if (
|
| 289 |
+
hasattr(resource_message, "cluster_full_of_actors_detected")
|
| 290 |
+
and resource_message.cluster_full_of_actors_detected
|
| 291 |
+
):
|
| 292 |
+
# A worker node has detected the cluster full of actors.
|
| 293 |
+
cluster_full = True
|
| 294 |
+
total_resources = dict(resource_message.resources_total)
|
| 295 |
+
available_resources = dict(resource_message.resources_available)
|
| 296 |
+
|
| 297 |
+
waiting_bundles, infeasible_bundles = parse_resource_demands(
|
| 298 |
+
resources_batch_data.resource_load_by_shape
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
pending_placement_groups = list(
|
| 302 |
+
resources_batch_data.placement_group_load.placement_group_data
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
use_node_id_as_ip = self.autoscaler is not None and self.autoscaler.config[
|
| 306 |
+
"provider"
|
| 307 |
+
].get("use_node_id_as_ip", False)
|
| 308 |
+
|
| 309 |
+
# "use_node_id_as_ip" is a hack meant to address situations in
|
| 310 |
+
# which there's more than one Ray node residing at a given ip.
|
| 311 |
+
# TODO (Dmitri): Stop using ips as node identifiers.
|
| 312 |
+
# https://github.com/ray-project/ray/issues/19086
|
| 313 |
+
if use_node_id_as_ip:
|
| 314 |
+
peloton_id = total_resources.get("NODE_ID_AS_RESOURCE")
|
| 315 |
+
# Legacy support https://github.com/ray-project/ray/pull/17312
|
| 316 |
+
if peloton_id is not None:
|
| 317 |
+
ip = str(int(peloton_id))
|
| 318 |
+
else:
|
| 319 |
+
ip = node_id.hex()
|
| 320 |
+
else:
|
| 321 |
+
ip = resource_message.node_manager_address
|
| 322 |
+
|
| 323 |
+
idle_duration_s = 0.0
|
| 324 |
+
if node_id in ray_nodes_idle_duration_ms_by_id:
|
| 325 |
+
idle_duration_s = ray_nodes_idle_duration_ms_by_id[node_id] / 1000
|
| 326 |
+
else:
|
| 327 |
+
logger.warning(
|
| 328 |
+
f"node_id {node_id} not found in ray_nodes_idle_duration_ms_by_id"
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
self.load_metrics.update(
|
| 332 |
+
ip,
|
| 333 |
+
node_id,
|
| 334 |
+
total_resources,
|
| 335 |
+
available_resources,
|
| 336 |
+
idle_duration_s,
|
| 337 |
+
waiting_bundles,
|
| 338 |
+
infeasible_bundles,
|
| 339 |
+
pending_placement_groups,
|
| 340 |
+
cluster_full,
|
| 341 |
+
)
|
| 342 |
+
if self.readonly_config:
|
| 343 |
+
self.readonly_config["available_node_types"].update(mirror_node_types)
|
| 344 |
+
|
| 345 |
+
def get_session_name(self, gcs_client: GcsClient) -> Optional[str]:
|
| 346 |
+
"""Obtain the session name from the GCS.
|
| 347 |
+
|
| 348 |
+
If the GCS doesn't respond, session name is considered None.
|
| 349 |
+
In this case, the metrics reported from the monitor won't have
|
| 350 |
+
the correct session name.
|
| 351 |
+
"""
|
| 352 |
+
if not _internal_kv_initialized():
|
| 353 |
+
return None
|
| 354 |
+
|
| 355 |
+
session_name = gcs_client.internal_kv_get(
|
| 356 |
+
b"session_name",
|
| 357 |
+
ray_constants.KV_NAMESPACE_SESSION,
|
| 358 |
+
timeout=10,
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
if session_name:
|
| 362 |
+
session_name = session_name.decode()
|
| 363 |
+
|
| 364 |
+
return session_name
|
| 365 |
+
|
| 366 |
+
def update_resource_requests(self):
|
| 367 |
+
"""Fetches resource requests from the internal KV and updates load."""
|
| 368 |
+
if not _internal_kv_initialized():
|
| 369 |
+
return
|
| 370 |
+
data = _internal_kv_get(
|
| 371 |
+
ray._private.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL
|
| 372 |
+
)
|
| 373 |
+
if data:
|
| 374 |
+
try:
|
| 375 |
+
resource_request = json.loads(data)
|
| 376 |
+
self.load_metrics.set_resource_requests(resource_request)
|
| 377 |
+
except Exception:
|
| 378 |
+
logger.exception("Error parsing resource requests")
|
| 379 |
+
|
| 380 |
+
def _run(self):
|
| 381 |
+
"""Run the monitor loop."""
|
| 382 |
+
|
| 383 |
+
while True:
|
| 384 |
+
try:
|
| 385 |
+
gcs_request_start_time = time.time()
|
| 386 |
+
self.update_load_metrics()
|
| 387 |
+
gcs_request_time = time.time() - gcs_request_start_time
|
| 388 |
+
self.update_resource_requests()
|
| 389 |
+
self.update_event_summary()
|
| 390 |
+
load_metrics_summary = self.load_metrics.summary()
|
| 391 |
+
status = {
|
| 392 |
+
"gcs_request_time": gcs_request_time,
|
| 393 |
+
"time": time.time(),
|
| 394 |
+
"monitor_pid": os.getpid(),
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
if self.autoscaler and not self.load_metrics:
|
| 398 |
+
# load_metrics is Falsey iff we haven't collected any
|
| 399 |
+
# resource messages from the GCS, which can happen at startup if
|
| 400 |
+
# the GCS hasn't yet received data from the Raylets.
|
| 401 |
+
# In this case, do not do an autoscaler update.
|
| 402 |
+
# Wait to get load metrics.
|
| 403 |
+
logger.info(
|
| 404 |
+
"Autoscaler has not yet received load metrics. Waiting."
|
| 405 |
+
)
|
| 406 |
+
elif self.autoscaler:
|
| 407 |
+
# Process autoscaling actions
|
| 408 |
+
update_start_time = time.time()
|
| 409 |
+
self.autoscaler.update()
|
| 410 |
+
status["autoscaler_update_time"] = time.time() - update_start_time
|
| 411 |
+
autoscaler_summary = self.autoscaler.summary()
|
| 412 |
+
try:
|
| 413 |
+
self.emit_metrics(
|
| 414 |
+
load_metrics_summary,
|
| 415 |
+
autoscaler_summary,
|
| 416 |
+
self.autoscaler.all_node_types,
|
| 417 |
+
)
|
| 418 |
+
except Exception:
|
| 419 |
+
logger.exception("Error emitting metrics")
|
| 420 |
+
|
| 421 |
+
if autoscaler_summary:
|
| 422 |
+
status["autoscaler_report"] = asdict(autoscaler_summary)
|
| 423 |
+
status[
|
| 424 |
+
"non_terminated_nodes_time"
|
| 425 |
+
] = (
|
| 426 |
+
self.autoscaler.non_terminated_nodes.non_terminated_nodes_time # noqa: E501
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
for msg in self.event_summarizer.summary():
|
| 430 |
+
# Need to prefix each line of the message for the lines to
|
| 431 |
+
# get pushed to the driver logs.
|
| 432 |
+
for line in msg.split("\n"):
|
| 433 |
+
logger.info(
|
| 434 |
+
"{}{}".format(
|
| 435 |
+
ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
|
| 436 |
+
)
|
| 437 |
+
)
|
| 438 |
+
if self.event_logger:
|
| 439 |
+
self.event_logger.info(line)
|
| 440 |
+
|
| 441 |
+
self.event_summarizer.clear()
|
| 442 |
+
|
| 443 |
+
status["load_metrics_report"] = asdict(load_metrics_summary)
|
| 444 |
+
as_json = json.dumps(status)
|
| 445 |
+
if _internal_kv_initialized():
|
| 446 |
+
_internal_kv_put(
|
| 447 |
+
ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
|
| 448 |
+
)
|
| 449 |
+
except Exception:
|
| 450 |
+
# By default, do not exit the monitor on failure.
|
| 451 |
+
if self.retry_on_failure:
|
| 452 |
+
logger.exception("Monitor: Execution exception. Trying again...")
|
| 453 |
+
else:
|
| 454 |
+
raise
|
| 455 |
+
|
| 456 |
+
# Wait for a autoscaler update interval before processing the next
|
| 457 |
+
# round of messages.
|
| 458 |
+
time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
|
| 459 |
+
|
| 460 |
+
def emit_metrics(self, load_metrics_summary, autoscaler_summary, node_types):
|
| 461 |
+
if autoscaler_summary is None:
|
| 462 |
+
return None
|
| 463 |
+
|
| 464 |
+
for resource_name in ["CPU", "GPU", "TPU"]:
|
| 465 |
+
_, total = load_metrics_summary.usage.get(resource_name, (0, 0))
|
| 466 |
+
pending = autoscaler_summary.pending_resources.get(resource_name, 0)
|
| 467 |
+
self.prom_metrics.cluster_resources.labels(
|
| 468 |
+
resource=resource_name,
|
| 469 |
+
SessionName=self.prom_metrics.session_name,
|
| 470 |
+
).set(total)
|
| 471 |
+
self.prom_metrics.pending_resources.labels(
|
| 472 |
+
resource=resource_name,
|
| 473 |
+
SessionName=self.prom_metrics.session_name,
|
| 474 |
+
).set(pending)
|
| 475 |
+
|
| 476 |
+
pending_node_count = Counter()
|
| 477 |
+
for _, node_type, _ in autoscaler_summary.pending_nodes:
|
| 478 |
+
pending_node_count[node_type] += 1
|
| 479 |
+
|
| 480 |
+
for node_type, count in autoscaler_summary.pending_launches.items():
|
| 481 |
+
pending_node_count[node_type] += count
|
| 482 |
+
|
| 483 |
+
for node_type in node_types:
|
| 484 |
+
count = pending_node_count[node_type]
|
| 485 |
+
self.prom_metrics.pending_nodes.labels(
|
| 486 |
+
SessionName=self.prom_metrics.session_name,
|
| 487 |
+
NodeType=node_type,
|
| 488 |
+
).set(count)
|
| 489 |
+
|
| 490 |
+
for node_type in node_types:
|
| 491 |
+
count = autoscaler_summary.active_nodes.get(node_type, 0)
|
| 492 |
+
self.prom_metrics.active_nodes.labels(
|
| 493 |
+
SessionName=self.prom_metrics.session_name,
|
| 494 |
+
NodeType=node_type,
|
| 495 |
+
).set(count)
|
| 496 |
+
|
| 497 |
+
failed_node_counts = Counter()
|
| 498 |
+
for _, node_type in autoscaler_summary.failed_nodes:
|
| 499 |
+
failed_node_counts[node_type] += 1
|
| 500 |
+
|
| 501 |
+
# NOTE: This metric isn't reset with monitor resets. This means it will
|
| 502 |
+
# only be updated when the autoscaler' node tracker remembers failed
|
| 503 |
+
# nodes. If the node type failure is evicted from the autoscaler, the
|
| 504 |
+
# metric may not update for a while.
|
| 505 |
+
for node_type, count in failed_node_counts.items():
|
| 506 |
+
self.prom_metrics.recently_failed_nodes.labels(
|
| 507 |
+
SessionName=self.prom_metrics.session_name,
|
| 508 |
+
NodeType=node_type,
|
| 509 |
+
).set(count)
|
| 510 |
+
|
| 511 |
+
def update_event_summary(self):
|
| 512 |
+
"""Report the current size of the cluster.
|
| 513 |
+
|
| 514 |
+
To avoid log spam, only cluster size changes (CPU, GPU or TPU count change)
|
| 515 |
+
are reported to the event summarizer. The event summarizer will report
|
| 516 |
+
only the latest cluster size per batch.
|
| 517 |
+
"""
|
| 518 |
+
avail_resources = self.load_metrics.resources_avail_summary()
|
| 519 |
+
if not self.readonly_config and avail_resources != self.last_avail_resources:
|
| 520 |
+
self.event_summarizer.add(
|
| 521 |
+
"Resized to {}.", # e.g., Resized to 100 CPUs, 4 GPUs, 4 TPUs.
|
| 522 |
+
quantity=avail_resources,
|
| 523 |
+
aggregate=lambda old, new: new,
|
| 524 |
+
)
|
| 525 |
+
self.last_avail_resources = avail_resources
|
| 526 |
+
|
| 527 |
+
def destroy_autoscaler_workers(self):
|
| 528 |
+
"""Cleanup the autoscaler, in case of an exception in the run() method.
|
| 529 |
+
|
| 530 |
+
We kill the worker nodes, but retain the head node in order to keep
|
| 531 |
+
logs around, keeping costs minimal. This monitor process runs on the
|
| 532 |
+
head node anyway, so this is more reliable."""
|
| 533 |
+
|
| 534 |
+
if self.autoscaler is None:
|
| 535 |
+
return # Nothing to clean up.
|
| 536 |
+
|
| 537 |
+
if self.autoscaling_config is None:
|
| 538 |
+
# This is a logic error in the program. Can't do anything.
|
| 539 |
+
logger.error("Monitor: Cleanup failed due to lack of autoscaler config.")
|
| 540 |
+
return
|
| 541 |
+
|
| 542 |
+
logger.info("Monitor: Exception caught. Taking down workers...")
|
| 543 |
+
clean = False
|
| 544 |
+
while not clean:
|
| 545 |
+
try:
|
| 546 |
+
teardown_cluster(
|
| 547 |
+
config_file=self.autoscaling_config,
|
| 548 |
+
yes=True, # Non-interactive.
|
| 549 |
+
workers_only=True, # Retain head node for logs.
|
| 550 |
+
override_cluster_name=None,
|
| 551 |
+
keep_min_workers=True, # Retain minimal amount of workers.
|
| 552 |
+
)
|
| 553 |
+
clean = True
|
| 554 |
+
logger.info("Monitor: Workers taken down.")
|
| 555 |
+
except Exception:
|
| 556 |
+
logger.error("Monitor: Cleanup exception. Trying again...")
|
| 557 |
+
time.sleep(2)
|
| 558 |
+
|
| 559 |
+
def _handle_failure(self, error):
|
| 560 |
+
if (
|
| 561 |
+
self.autoscaler is not None
|
| 562 |
+
and os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
|
| 563 |
+
):
|
| 564 |
+
self.autoscaler.kill_workers()
|
| 565 |
+
# Take down autoscaler workers if necessary.
|
| 566 |
+
self.destroy_autoscaler_workers()
|
| 567 |
+
|
| 568 |
+
# Something went wrong, so push an error to all current and future
|
| 569 |
+
# drivers.
|
| 570 |
+
message = f"The autoscaler failed with the following error:\n{error}"
|
| 571 |
+
if _internal_kv_initialized():
|
| 572 |
+
_internal_kv_put(
|
| 573 |
+
ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True
|
| 574 |
+
)
|
| 575 |
+
gcs_publisher = ray._raylet.GcsPublisher(address=self.gcs_address)
|
| 576 |
+
from ray._private.utils import publish_error_to_driver
|
| 577 |
+
|
| 578 |
+
publish_error_to_driver(
|
| 579 |
+
ray_constants.MONITOR_DIED_ERROR,
|
| 580 |
+
message,
|
| 581 |
+
gcs_publisher=gcs_publisher,
|
| 582 |
+
)
|
| 583 |
+
|
| 584 |
+
def _signal_handler(self, sig, frame):
|
| 585 |
+
try:
|
| 586 |
+
self._handle_failure(
|
| 587 |
+
f"Terminated with signal {sig}\n"
|
| 588 |
+
+ "".join(traceback.format_stack(frame))
|
| 589 |
+
)
|
| 590 |
+
except Exception:
|
| 591 |
+
logger.exception("Monitor: Failure in signal handler.")
|
| 592 |
+
sys.exit(sig + 128)
|
| 593 |
+
|
| 594 |
+
def run(self):
|
| 595 |
+
# Register signal handlers for autoscaler termination.
|
| 596 |
+
# Signals will not be received on windows
|
| 597 |
+
signal.signal(signal.SIGINT, self._signal_handler)
|
| 598 |
+
signal.signal(signal.SIGTERM, self._signal_handler)
|
| 599 |
+
try:
|
| 600 |
+
if _internal_kv_initialized():
|
| 601 |
+
# Delete any previous autoscaling errors.
|
| 602 |
+
_internal_kv_del(ray_constants.DEBUG_AUTOSCALING_ERROR)
|
| 603 |
+
self._initialize_autoscaler()
|
| 604 |
+
self._run()
|
| 605 |
+
except Exception:
|
| 606 |
+
logger.exception("Error in monitor loop")
|
| 607 |
+
self._handle_failure(traceback.format_exc())
|
| 608 |
+
raise
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
def log_resource_batch_data_if_desired(
    resources_batch_data: gcs_pb2.ResourceUsageBatchData,
) -> None:
    """Dump the raw resource-usage message pulled from GCS.

    Opt-in via the AUTOSCALER_LOG_RESOURCE_BATCH_DATA=1 environment variable;
    otherwise this is a no-op.
    """
    if os.getenv("AUTOSCALER_LOG_RESOURCE_BATCH_DATA") != "1":
        return
    logger.info("Logging raw resource message pulled from GCS.")
    logger.info(resources_batch_data)
    logger.info("Done logging raw resource message.")
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
if __name__ == "__main__":
    # CLI entry point: this module is launched as a standalone monitor
    # process on the head node.
    parser = argparse.ArgumentParser(
        description=("Parse GCS server for the monitor to connect to.")
    )
    parser.add_argument(
        "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
    )
    parser.add_argument(
        "--autoscaling-config",
        required=False,
        type=str,
        help="the path to the autoscaling config file",
    )
    # Logging configuration: level, format, filename, and rotation policy.
    parser.add_argument(
        "--logging-level",
        required=False,
        type=str,
        default=ray_constants.LOGGER_LEVEL,
        choices=ray_constants.LOGGER_LEVEL_CHOICES,
        help=ray_constants.LOGGER_LEVEL_HELP,
    )
    parser.add_argument(
        "--logging-format",
        required=False,
        type=str,
        default=ray_constants.LOGGER_FORMAT,
        help=ray_constants.LOGGER_FORMAT_HELP,
    )
    parser.add_argument(
        "--logging-filename",
        required=False,
        type=str,
        default=ray_constants.MONITOR_LOG_FILE_NAME,
        help="Specify the name of log file, "
        "log to stdout if set empty, default is "
        f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
    )
    parser.add_argument(
        "--logs-dir",
        required=True,
        type=str,
        help="Specify the path of the temporary directory used by Ray processes.",
    )
    parser.add_argument(
        "--logging-rotate-bytes",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BYTES,
        help="Specify the max bytes for rotating "
        "log file, default is "
        f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
    )
    parser.add_argument(
        "--logging-rotate-backup-count",
        required=False,
        type=int,
        default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
        help="Specify the backup count of rotated log file, default is "
        f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
    )
    parser.add_argument(
        "--monitor-ip",
        required=False,
        type=str,
        default=None,
        help="The IP address of the machine hosting the monitor process.",
    )

    args = parser.parse_args()
    # Configure the component logger (rotating file handler) before any
    # further logging happens.
    setup_component_logger(
        logging_level=args.logging_level,
        logging_format=args.logging_format,
        log_dir=args.logs_dir,
        filename=args.logging_filename,
        max_bytes=args.logging_rotate_bytes,
        backup_count=args.logging_rotate_backup_count,
    )

    # Record provenance info useful when debugging version mismatches.
    logger.info(f"Starting monitor using ray installation: {ray.__file__}")
    logger.info(f"Ray version: {ray.__version__}")
    logger.info(f"Ray commit: {ray.__commit__}")
    logger.info(f"Monitor started with command: {sys.argv}")

    # The autoscaling config is optional (e.g. absent for fixed-size or
    # externally-managed clusters).
    if args.autoscaling_config:
        autoscaling_config = os.path.expanduser(args.autoscaling_config)
    else:
        autoscaling_config = None

    # The GCS address is the only mandatory connection parameter.
    bootstrap_address = args.gcs_address
    if bootstrap_address is None:
        raise ValueError("--gcs-address must be set!")

    monitor = Monitor(
        bootstrap_address,
        autoscaling_config,
        log_dir=args.logs_dir,
        monitor_ip=args.monitor_ip,
    )

    # Blocks until the monitor exits (or raises on fatal error).
    monitor.run()
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import logging
|
| 3 |
+
import operator
|
| 4 |
+
import threading
|
| 5 |
+
import time
|
| 6 |
+
import traceback
|
| 7 |
+
from typing import Any, Dict, Optional
|
| 8 |
+
|
| 9 |
+
from ray.autoscaler._private.node_provider_availability_tracker import (
|
| 10 |
+
NodeProviderAvailabilityTracker,
|
| 11 |
+
)
|
| 12 |
+
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
|
| 13 |
+
from ray.autoscaler._private.util import hash_launch_conf
|
| 14 |
+
from ray.autoscaler.node_launch_exception import NodeLaunchException
|
| 15 |
+
from ray.autoscaler.tags import (
|
| 16 |
+
NODE_KIND_WORKER,
|
| 17 |
+
STATUS_UNINITIALIZED,
|
| 18 |
+
TAG_RAY_LAUNCH_CONFIG,
|
| 19 |
+
TAG_RAY_NODE_KIND,
|
| 20 |
+
TAG_RAY_NODE_NAME,
|
| 21 |
+
TAG_RAY_NODE_STATUS,
|
| 22 |
+
TAG_RAY_USER_NODE_TYPE,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class BaseNodeLauncher:
    """Launches Ray nodes in the main thread using
    `BaseNodeLauncher.launch_node()`.

    This is a superclass of NodeLauncher, which launches nodes asynchronously
    in the background.

    By default, the subclass NodeLauncher is used to launch nodes in
    subthreads. That behavior can be flagged off in the provider config by
    setting `foreground_node_launch: True`; the autoscaler will then make
    blocking calls to BaseNodeLauncher.launch_node() in the main thread.
    """

    def __init__(
        self,
        provider,
        pending,
        event_summarizer,
        node_provider_availability_tracker: NodeProviderAvailabilityTracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *args,
        **kwargs,
    ):
        self.pending = pending
        self.event_summarizer = event_summarizer
        self.node_provider_availability_tracker = node_provider_availability_tracker
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )
        self.provider = provider
        self.node_types = node_types
        # Stringified index used only to disambiguate log prefixes between
        # launcher instances ("" when no index is given).
        self.index = str(index) if index is not None else ""

    def launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Launch `count` nodes of `node_type`, then decrement the pending
        counter for that type.

        Returns:
            The provider's mapping of created nodes (empty on failure).
        """
        self.log("Got {} nodes to launch.".format(count))
        created_nodes = self._launch_node(config, count, node_type)
        self.pending.dec(node_type, count)
        return created_nodes

    def _launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Build the launch config/tags and ask the provider to create nodes,
        recording availability info and metrics for success or failure."""
        if self.node_types:
            assert node_type, node_type

        # The `worker_nodes` field is deprecated in favor of per-node-type
        # node_configs. We allow it for backwards-compatibility.
        launch_config = copy.deepcopy(config.get("worker_nodes", {}))
        # BUGFIX: default resources/labels so the legacy path (falsy
        # node_type) doesn't reference undefined names when they are passed
        # to the provider below.
        resources = {}
        labels = {}
        if node_type:
            launch_config.update(
                config["available_node_types"][node_type]["node_config"]
            )
            resources = copy.deepcopy(
                config["available_node_types"][node_type]["resources"]
            )
            labels = copy.deepcopy(
                config["available_node_types"][node_type].get("labels", {})
            )
        launch_hash = hash_launch_conf(launch_config, config["auth"])
        node_config = copy.deepcopy(config.get("worker_nodes", {}))
        node_tags = {
            TAG_RAY_NODE_NAME: "ray-{}-worker".format(config["cluster_name"]),
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
            TAG_RAY_LAUNCH_CONFIG: launch_hash,
        }
        # A custom node type is specified; set the tag in this case, and also
        # merge the configs. We merge the configs instead of overriding, so
        # that the bootstrapped per-cloud properties are preserved.
        # TODO(ekl) this logic is duplicated in commands.py (keep in sync)
        if node_type:
            node_tags[TAG_RAY_USER_NODE_TYPE] = node_type
            node_config.update(launch_config)

        node_launch_start_time = time.time()

        error_msg = None
        full_exception = None
        created_nodes = {}
        try:
            created_nodes = self.provider.create_node_with_resources_and_labels(
                node_config, node_tags, count, resources, labels
            )
        except NodeLaunchException as node_launch_exception:
            # Structured failure from the provider: record it so the
            # autoscaler can report per-node-type availability.
            self.node_provider_availability_tracker.update_node_availability(
                node_type, int(node_launch_start_time), node_launch_exception
            )

            if node_launch_exception.src_exc_info is not None:
                full_exception = "\n".join(
                    traceback.format_exception(*node_launch_exception.src_exc_info)
                )

            # `{}` placeholder is filled with the aggregated count by the
            # event summarizer.
            error_msg = (
                f"Failed to launch {{}} node(s) of type {node_type}. "
                f"({node_launch_exception.category}): "
                f"{node_launch_exception.description}"
            )
        except Exception:
            error_msg = f"Failed to launch {{}} node(s) of type {node_type}."
            full_exception = traceback.format_exc()
        else:
            # Record some metrics/observability information when a node is launched.
            launch_time = time.time() - node_launch_start_time
            for _ in range(count):
                # Note: when launching multiple nodes we observe the time it
                # took all nodes to launch for each node. For example, if 4
                # nodes were created in 25 seconds, we would observe the 25
                # second create time 4 times.
                self.prom_metrics.worker_create_node_time.observe(launch_time)
            self.prom_metrics.started_nodes.inc(count)
            self.node_provider_availability_tracker.update_node_availability(
                node_type=node_type,
                timestamp=int(node_launch_start_time),
                node_launch_exception=None,
            )

        if error_msg is not None:
            self.event_summarizer.add(
                error_msg,
                quantity=count,
                aggregate=operator.add,
            )
            self.log(error_msg)
            self.prom_metrics.node_launch_exceptions.inc()
            self.prom_metrics.failed_create_nodes.inc(count)
        else:
            self.log("Launching {} nodes, type {}.".format(count, node_type))
            self.event_summarizer.add(
                "Adding {} node(s) of type " + str(node_type) + ".",
                quantity=count,
                aggregate=operator.add,
            )

        if full_exception is not None:
            self.log(full_exception)

        return created_nodes

    def log(self, statement):
        """Log `statement` with a per-instance prefix.

        launcher_class is "BaseNodeLauncher", or "NodeLauncher" if called
        from that subclass.
        """
        launcher_class: str = type(self).__name__
        prefix = "{}{}:".format(launcher_class, self.index)
        logger.info(prefix + " {}".format(statement))
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class NodeLauncher(BaseNodeLauncher, threading.Thread):
    """Launches nodes asynchronously in a background thread."""

    def __init__(
        self,
        provider,
        queue,
        pending,
        event_summarizer,
        node_provider_availability_tracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *thread_args,
        **thread_kwargs,
    ):
        self.queue = queue
        # Initialize both bases explicitly; any extra positional/keyword
        # arguments are forwarded to threading.Thread.
        BaseNodeLauncher.__init__(
            self,
            provider=provider,
            pending=pending,
            event_summarizer=event_summarizer,
            node_provider_availability_tracker=node_provider_availability_tracker,
            session_name=session_name,
            prom_metrics=prom_metrics,
            node_types=node_types,
            index=index,
        )
        threading.Thread.__init__(self, *thread_args, **thread_kwargs)

    def run(self):
        """Consume launch requests, forever.

        Overrides threading.Thread.run(); NodeLauncher.start() executes this
        loop in a background thread. The queue is populated by the
        StandardAutoscaler.
        """
        while True:
            request = self.queue.get()
            config, count, node_type = request
            # launch_node is implemented in BaseNodeLauncher.
            self.launch_node(config, count, node_type)
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import threading
|
| 2 |
+
import time
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Callable, Dict, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
from ray.autoscaler._private.constants import (
|
| 7 |
+
AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
|
| 8 |
+
)
|
| 9 |
+
from ray.autoscaler.node_launch_exception import NodeLaunchException
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class UnavailableNodeInformation:
    """Details about why a node type could not be launched."""

    # Machine-readable failure class (populated from
    # NodeLaunchException.category by the availability tracker).
    category: str
    # Human-readable explanation of the failure.
    description: str
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class NodeAvailabilityRecord:
    """Availability status of a single node type at a point in time."""

    # The node type this record describes.
    node_type: str
    # True when the most recent launch attempt for this type succeeded.
    is_available: bool
    # Timestamp (seconds) at which this information was last accurate.
    last_checked_timestamp: float
    # Failure details; None when the node type is available.
    unavailable_node_information: Optional[UnavailableNodeInformation]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class NodeAvailabilitySummary:
    """Serializable snapshot of per-node-type availability."""

    # Mapping from node type to node availability record.
    node_availabilities: Dict[str, NodeAvailabilityRecord]

    @classmethod
    def from_fields(cls, **fields) -> Optional["NodeAvailabilitySummary"]:
        """Implement marshalling from nested fields. pydantic isn't a core
        dependency so we're implementing this by hand instead.

        Unknown top-level fields are ignored. The input dicts are not
        mutated (the previous implementation popped from them in place).
        """
        parsed = {}

        node_availabilities_dict = fields.get("node_availabilities", {})

        for node_type, record_fields in node_availabilities_dict.items():
            # Shallow-copy so popping doesn't mutate the caller's dict.
            record_fields = dict(record_fields)
            info_fields = record_fields.pop("unavailable_node_information", None)
            unavailable_information = None
            if info_fields is not None:
                unavailable_information = UnavailableNodeInformation(**info_fields)

            parsed[node_type] = NodeAvailabilityRecord(
                unavailable_node_information=unavailable_information,
                **record_fields,
            )

        return NodeAvailabilitySummary(node_availabilities=parsed)

    def __eq__(self, other) -> bool:
        # Guard against non-summary operands instead of raising
        # AttributeError; NotImplemented lets Python fall back sensibly.
        if not isinstance(other, NodeAvailabilitySummary):
            return NotImplemented
        return self.node_availabilities == other.node_availabilities

    def __bool__(self) -> bool:
        # Truthy iff at least one node type has been recorded.
        return bool(self.node_availabilities)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class NodeProviderAvailabilityTracker:
    """A thread safe, TTL cache of node provider availability.

    We don't use cachetools.TTLCache because it always sets the expiration
    time relative to insertion time, but in our case, we want entries to
    expire relative to when the node creation was attempted (and entries
    aren't necessarily added in order). We want the entries to expire
    because the information grows stale over time.
    """

    def __init__(
        self,
        timer: Callable[[], float] = time.time,
        ttl: float = AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
    ):
        """A cache tracking node availability that evicts stale entries.

        Args:
            timer: A function that returns the current time in seconds.
            ttl: The ttl from the insertion timestamp of an entry.
        """
        self.timer = timer
        self.ttl = ttl
        # Mapping from node type to (eviction_time, record).
        self.store: Dict[str, Tuple[float, NodeAvailabilityRecord]] = {}
        # A global lock to simplify thread safety handling.
        self.lock = threading.RLock()

    def _update_node_availability_requires_lock(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        # Translate the optional exception into the stored failure shape;
        # None means the node type is considered available.
        failure_info = None
        if node_launch_exception is not None:
            failure_info = UnavailableNodeInformation(
                category=node_launch_exception.category,
                description=node_launch_exception.description,
            )
        record = NodeAvailabilityRecord(
            node_type=node_type,
            is_available=failure_info is None,
            last_checked_timestamp=timestamp,
            unavailable_node_information=failure_info,
        )

        # TODO (Alex): In theory it would be nice to make this dictionary
        # ordered by expiration time, unfortunately that's a bit difficult
        # since `update_node_availability` can be called with out of order
        # timestamps.
        self.store[node_type] = (timestamp + self.ttl, record)

        self._remove_old_entries()

    def update_node_availability(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        """Update the availability and details of a single node type.

        Args:
            node_type: The node type.
            timestamp: The timestamp that this information is accurate as of.
            node_launch_exception: Details about why the node launch failed.
                If empty, the node type will be considered available.
        """
        with self.lock:
            self._update_node_availability_requires_lock(
                node_type, timestamp, node_launch_exception
            )

    def summary(self) -> NodeAvailabilitySummary:
        """Return a summary of node availabilities and their staleness,
        dropping any entries that have expired."""
        with self.lock:
            self._remove_old_entries()
            snapshot = {
                node_type: record for node_type, (_, record) in self.store.items()
            }
            return NodeAvailabilitySummary(snapshot)

    def _remove_old_entries(self):
        """Evict every cache entry whose expiration time has passed."""
        now = self.timer()
        with self.lock:
            expired_types = [
                node_type
                for node_type, (expiration_time, _) in self.store.items()
                if expiration_time < now
            ]
            for node_type in expired_types:
                del self.store[node_type]
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Set, Tuple
|
| 2 |
+
|
| 3 |
+
from ray.autoscaler._private import constants
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class NodeTracker:
    """Map nodes to their corresponding logs.

    We need to be a little careful here. At an given point in time, node_id <->
    ip can be interchangeably used, but the node_id -> ip relation is not
    bijective _across time_ since IP addresses can be reused. Therefore, we
    should treat node_id as the only unique identifier.
    """

    def __init__(self):
        # Mapping from node_id -> (ip, node type).
        # NOTE(review): an older comment also mentioned stdout_path and a
        # process runner, but track() only stores (ip, node_type) -- confirm.
        self.node_mapping = {}

        # A quick, inefficient FIFO cache implementation.
        self.lru_order = []

    def _add_node_mapping(self, node_id: str, value: str):
        """Insert `value` under `node_id`, evicting the oldest tracked node
        when the cache is at capacity."""
        if node_id in self.node_mapping:
            return

        assert len(self.lru_order) == len(self.node_mapping)
        if len(self.lru_order) >= constants.AUTOSCALER_MAX_NODES_TRACKED:
            # The FIFO eviction case. BUGFIX: the previous code rebound the
            # `node_id` parameter here, which stored the new value under the
            # evicted node's id and never tracked the new id.
            evicted_node_id = self.lru_order.pop(0)
            del self.node_mapping[evicted_node_id]

        self.node_mapping[node_id] = value
        self.lru_order.append(node_id)

    def track(self, node_id: str, ip: str, node_type: str):
        """
        Begin to track a new node.

        Args:
            node_id: The node id.
            ip: The node ip address.
            node_type: The node type.
        """
        if node_id not in self.node_mapping:
            self._add_node_mapping(node_id, (ip, node_type))

    def untrack(self, node_id: str):
        """Gracefully stop tracking a node. If a node is intentionally removed
        from the cluster, we should stop tracking it so we don't mistakenly
        mark it as failed.

        Args:
            node_id: The node id which failed.
        """
        if node_id in self.node_mapping:
            self.lru_order.remove(node_id)
            del self.node_mapping[node_id]

    def get_all_failed_node_info(
        self, non_failed_ids: Set[str]
    ) -> List[Tuple[str, str]]:
        """Get the information about all failed nodes. A failed node is any
        node which we began to track that is not pending or alive (i.e. not
        failed).

        Args:
            non_failed_ids: Nodes are failed unless they are in this set.

        Returns:
            List[Tuple[str, str]]: A list of tuples. Each tuple is the ip
                address and type of a failed node.
        """
        failed_nodes = self.node_mapping.keys() - non_failed_ids
        failed_info = []
        # Returning the list in order is important for display purposes.
        for node_id in filter(lambda node_id: node_id in failed_nodes, self.lru_order):
            failed_info.append(self.node_mapping[node_id])
        return failed_info
|
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class NullMetric:
    """No-op metric used when prometheus_client fails to import.

    Every operation is silently discarded; `labels` returns the instance so
    chained calls like `metric.labels(...).set(...)` keep working.
    """

    def set(self, *args, **kwargs):
        """Discard a gauge value."""

    def observe(self, *args, **kwargs):
        """Discard a histogram observation."""

    def inc(self, *args, **kwargs):
        """Discard a counter increment."""

    def labels(self, *args, **kwargs):
        """Ignore label values and return self to allow chaining."""
        return self

    def clear(self):
        """Nothing to clear."""
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
try:

    from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram

    # The metrics in this class should be kept in sync with
    # python/ray/tests/test_metrics_agent.py
    class AutoscalerPrometheusMetrics:
        """Container for all Prometheus metrics exported by the autoscaler.

        Every metric is registered against a single ``CollectorRegistry``
        under the ``autoscaler`` namespace. Metrics whose only label is the
        session name are pre-labeled here with ``session_name``; metrics with
        additional labels (e.g. ``NodeType``, ``resource``) are left
        unlabeled for callers to label at use time.
        """

        def __init__(
            self, session_name: str = None, registry: Optional[CollectorRegistry] = None
        ):
            """Create the metric set.

            Args:
                session_name: Value used for the ``SessionName`` label on
                    every metric.
                registry: Registry to attach metrics to; a fresh
                    auto-describing registry is created when omitted.
            """
            self.registry: CollectorRegistry = registry or CollectorRegistry(
                auto_describe=True
            )
            self._session_name = session_name
            # Buckets: 5 seconds, 10 seconds, 20 seconds, 30 seconds,
            # 45 seconds, 1 minute, 1.5 minutes, 2 minutes,
            # 3 minutes, 4 minutes, 5 minutes, 6 minutes,
            # 8 minutes, 10 minutes, 12 minutes, 15 minutes
            # 20 minutes, 25 minutes, 30 minutes
            # used for both worker launch time and worker update time
            histogram_buckets = [
                5,
                10,
                20,
                30,
                45,
                60,
                90,
                120,
                180,
                240,
                300,
                360,
                480,
                600,
                720,
                900,
                1200,
                1500,
                1800,
            ]
            # Buckets: .01 seconds to 1000 seconds.
            # Used for autoscaler update time.
            update_time_buckets = [0.01, 0.1, 1, 10, 100, 1000]
            self.worker_create_node_time: Histogram = Histogram(
                "worker_create_node_time_seconds",
                "Worker launch time. This is the time it takes for a call to "
                "a node provider's create_node method to return. Note that "
                "when nodes are launched in batches, the launch time for that "
                "batch will be observed once for *each* node in that batch. "
                "For example, if 8 nodes are launched in 3 minutes, a launch "
                "time of 3 minutes will be observed 8 times.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.worker_update_time: Histogram = Histogram(
                "worker_update_time_seconds",
                "Worker update time. This is the time between when an updater "
                "thread begins executing and when it exits successfully. This "
                "metric only observes times for successful updates.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.update_time: Histogram = Histogram(
                "update_time",
                "Autoscaler update time. This is the time for an autoscaler "
                "update iteration to complete.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=update_time_buckets,
            ).labels(SessionName=session_name)
            # Per-node-type gauge; callers label with NodeType at use time.
            self.pending_nodes: Gauge = Gauge(
                "pending_nodes",
                "Number of nodes pending to be started.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.active_nodes: Gauge = Gauge(
                "active_nodes",
                "Number of nodes in the cluster.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.recently_failed_nodes: Gauge = Gauge(
                "recently_failed_nodes",
                "The number of recently failed nodes. This count could reset "
                "at undefined times.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.started_nodes: Counter = Counter(
                "started_nodes",
                "Number of nodes started.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.stopped_nodes: Counter = Counter(
                "stopped_nodes",
                "Number of nodes stopped.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.updating_nodes: Gauge = Gauge(
                "updating_nodes",
                "Number of nodes in the process of updating.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.recovering_nodes: Gauge = Gauge(
                "recovering_nodes",
                "Number of nodes in the process of recovering.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.running_workers: Gauge = Gauge(
                "running_workers",
                "Number of worker nodes running.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_create_nodes: Counter = Counter(
                "failed_create_nodes",
                "Number of nodes that failed to be created due to an "
                "exception in the node provider's create_node method.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_updates: Counter = Counter(
                "failed_updates",
                "Number of failed worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_updates: Counter = Counter(
                "successful_updates",
                # Fixed typo in help text ("succesfful" -> "successful").
                "Number of successful worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_recoveries: Counter = Counter(
                "failed_recoveries",
                "Number of failed node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_recoveries: Counter = Counter(
                "successful_recoveries",
                "Number of successful node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.update_loop_exceptions: Counter = Counter(
                "update_loop_exceptions",
                "Number of exceptions raised in the update loop of the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.node_launch_exceptions: Counter = Counter(
                "node_launch_exceptions",
                "Number of exceptions raised while launching nodes.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.reset_exceptions: Counter = Counter(
                "reset_exceptions",
                "Number of exceptions raised while resetting the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.config_validation_exceptions: Counter = Counter(
                "config_validation_exceptions",
                "Number of exceptions raised while validating the config "
                "during a reset.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.drain_node_exceptions: Counter = Counter(
                "drain_node_exceptions",
                # Fixed missing space between the adjacent string literals,
                # which previously rendered as "rpcprior".
                "Number of exceptions raised when making a DrainNode rpc "
                "prior to node termination.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            # This represents the autoscaler's view of essentially
            # `ray.cluster_resources()`, it may be slightly different from the
            # core metric from an eventual consistency perspective.
            self.cluster_resources: Gauge = Gauge(
                "cluster_resources",
                "Total logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )
            # This represents the pending launches + nodes being set up for the
            # autoscaler.
            self.pending_resources: Gauge = Gauge(
                "pending_resources",
                "Pending logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )

        @property
        def session_name(self):
            """The session name used to label the metrics."""
            return self._session_name

except ImportError:

    class AutoscalerPrometheusMetrics(object):
        """Fallback used when ``prometheus_client`` is not installed.

        Matches the real class's constructor signature and ``session_name``
        property so callers need not care which implementation they got;
        every other attribute access yields a no-op ``NullMetric``.
        """

        def __init__(self, session_name: str = None, registry=None):
            # Accept (and ignore) `registry` for signature parity with the
            # real implementation above.
            self._session_name = session_name

        @property
        def session_name(self):
            """The session name, mirroring the real class's property."""
            return self._session_name

        def __getattr__(self, attr):
            # Only reached for names not found normally, i.e. metric
            # attributes; resolve them all to a no-op metric.
            return NullMetric()
|