koichi12 commited on
Commit
293db81
·
verified ·
1 Parent(s): c590e6b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py +8 -0
  4. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py +0 -0
  12. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py +0 -0
  13. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json +130 -0
  17. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json +294 -0
  18. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py +208 -0
  19. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py +488 -0
  20. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py +1508 -0
  21. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py +825 -0
  22. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py +40 -0
  23. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py +652 -0
  24. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py +921 -0
  25. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py +1631 -0
  26. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py +140 -0
  27. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py +129 -0
  28. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py +75 -0
  29. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py +106 -0
  30. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py +0 -0
  31. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py +91 -0
  32. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py +246 -0
  33. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py +398 -0
  34. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py +37 -0
  35. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py +375 -0
  36. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py +15 -0
  37. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py +0 -0
  38. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py +121 -0
  43. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py +110 -0
  44. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py +304 -0
  45. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py +33 -0
  46. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py +719 -0
  47. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py +221 -0
  48. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py +165 -0
  49. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py +77 -0
  50. .venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py +292 -0
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.43 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os
from pathlib import Path

from ray.autoscaler import sdk

# Only the public SDK surface is re-exported from this package.
__all__ = ["sdk"]

# Absolute path of the directory containing this package.
# NOTE(review): consumers of this constant are not visible in this file.
AUTOSCALER_DIR_PATH = Path(os.path.abspath(os.path.dirname(__file__)))
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (560 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc ADDED
Binary file (4.84 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc ADDED
Binary file (19 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc ADDED
Binary file (1.99 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (13.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc ADDED
Binary file (1.32 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (203 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc ADDED
Binary file (9.59 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (26.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3
+ "contentVersion": "1.0.0.0",
4
+ "parameters": {
5
+ "clusterId": {
6
+ "type": "string",
7
+ "metadata": {
8
+ "description": "Unique string appended to resource names to isolate resources from different ray clusters."
9
+ }
10
+ },
11
+ "subnet": {
12
+ "type": "string",
13
+ "metadata": {
14
+ "description": "Subnet parameters."
15
+ }
16
+ },
17
+ "msiName": {
18
+ "type": "string",
19
+ "metadata": {
20
+ "description": "Managed service identity."
21
+ }
22
+ },
23
+ "msiResourceGroup": {
24
+ "type": "string",
25
+ "metadata": {
26
+ "description": "Managed service identity resource group."
27
+ }
28
+ },
29
+ "createMsi": {
30
+ "type": "bool",
31
+ "defaultValue": true
32
+ }
33
+ },
34
+ "variables": {
35
+ "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
36
+ "location": "[resourceGroup().location]",
37
+ "roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]",
38
+ "nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]",
39
+ "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
40
+ "vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]",
41
+ "subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]"
42
+ },
43
+ "resources": [
44
+ {
45
+ "condition": "[parameters('createMsi')]",
46
+ "type": "Microsoft.ManagedIdentity/userAssignedIdentities",
47
+ "apiVersion": "2018-11-30",
48
+ "location": "[variables('location')]",
49
+ "name": "[parameters('msiName')]"
50
+ },
51
+ {
52
+ "type": "Microsoft.Authorization/roleAssignments",
53
+ "apiVersion": "2020-08-01-preview",
54
+ "name": "[guid(variables('roleAssignmentName'))]",
55
+ "properties": {
56
+ "principalId": "[reference(resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName')), '2018-11-30').principalId]",
57
+ "roleDefinitionId": "[variables('contributor')]",
58
+ "scope": "[resourceGroup().id]",
59
+ "principalType": "ServicePrincipal"
60
+ },
61
+ "dependsOn": [
62
+ "[parameters('msiName')]"
63
+ ]
64
+ },
65
+ {
66
+ "type": "Microsoft.Network/networkSecurityGroups",
67
+ "apiVersion": "2019-02-01",
68
+ "name": "[variables('nsgName')]",
69
+ "location": "[variables('location')]",
70
+ "properties": {
71
+ "securityRules": [
72
+ {
73
+ "name": "SSH",
74
+ "properties": {
75
+ "priority": 1000,
76
+ "protocol": "TCP",
77
+ "access": "Allow",
78
+ "direction": "Inbound",
79
+ "sourceAddressPrefix": "*",
80
+ "sourcePortRange": "*",
81
+ "destinationAddressPrefix": "*",
82
+ "destinationPortRange": "22"
83
+ }
84
+ }
85
+ ]
86
+ }
87
+ },
88
+ {
89
+ "type": "Microsoft.Network/virtualNetworks",
90
+ "apiVersion": "2019-11-01",
91
+ "name": "[variables('vnetName')]",
92
+ "location": "[variables('location')]",
93
+ "properties": {
94
+ "addressSpace": {
95
+ "addressPrefixes": [
96
+ "[parameters('subnet')]"
97
+ ]
98
+ },
99
+ "subnets": [
100
+ {
101
+ "name": "[variables('subnetName')]",
102
+ "properties": {
103
+ "addressPrefix": "[parameters('subnet')]",
104
+ "networkSecurityGroup": {
105
+ "id": "[variables('nsg')]"
106
+ }
107
+ }
108
+ }
109
+ ]
110
+ },
111
+ "dependsOn": [
112
+ "[variables('nsg')]"
113
+ ]
114
+ }
115
+ ],
116
+ "outputs": {
117
+ "subnet": {
118
+ "type": "string",
119
+ "value": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vnetName'), variables('subnetName'))]"
120
+ },
121
+ "nsg": {
122
+ "type": "string",
123
+ "value": "[variables('nsg')]"
124
+ },
125
+ "msi": {
126
+ "type": "string",
127
+ "value": "[resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName'))]"
128
+ }
129
+ }
130
+ }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
3
+ "contentVersion": "1.0.0.0",
4
+ "parameters": {
5
+ "vmName": {
6
+ "type": "string",
7
+ "metadata": {
8
+ "description": "The name of your Virtual Machine."
9
+ }
10
+ },
11
+ "adminUsername": {
12
+ "type": "string",
13
+ "metadata": {
14
+ "description": "Username for the Virtual Machine."
15
+ }
16
+ },
17
+ "publicKey": {
18
+ "type": "securestring",
19
+ "metadata": {
20
+ "description": "SSH Key for the Virtual Machine"
21
+ }
22
+ },
23
+ "imagePublisher": {
24
+ "type": "string",
25
+ "metadata": {
26
+ "description": "The publisher of the VM image"
27
+ }
28
+ },
29
+ "imageOffer": {
30
+ "type": "string",
31
+ "metadata": {
32
+ "description": "The offer of the VM image"
33
+ }
34
+ },
35
+ "imageSku": {
36
+ "type": "string",
37
+ "metadata": {
38
+ "description": "The sku of the VM image"
39
+ }
40
+ },
41
+ "imageVersion": {
42
+ "type": "string",
43
+ "metadata": {
44
+ "description": "The version of the VM image"
45
+ }
46
+ },
47
+ "vmSize": {
48
+ "type": "string",
49
+ "metadata": {
50
+ "description": "The size of the VM"
51
+ }
52
+ },
53
+ "vmTags": {
54
+ "type": "object",
55
+ "metadata": {
56
+ "description": "Tags for the VM"
57
+ }
58
+ },
59
+ "vmCount": {
60
+ "type": "int",
61
+ "metadata": {
62
+ "description": "Number of VMs to deploy"
63
+ }
64
+ },
65
+ "provisionPublicIp": {
66
+ "type": "bool",
67
+ "defaultValue": true,
68
+ "metadata": {
69
+ "description": "If true creates a public ip"
70
+ }
71
+ },
72
+ "priority": {
73
+ "type": "string",
74
+ "defaultValue": "Regular",
75
+ "metadata": {
76
+ "description": "Specifies the priority for the virtual machine."
77
+ }
78
+ },
79
+ "evictionPolicy": {
80
+ "type": "string",
81
+ "defaultValue": "Delete",
82
+ "metadata": {
83
+ "description": "Specifies the eviction policy for the virtual machine."
84
+ }
85
+ },
86
+ "billingProfile": {
87
+ "type": "object",
88
+ "defaultValue": {},
89
+ "metadata": {
90
+ "description": "Specifies the maximum price to pay for Azure Spot VM."
91
+ }
92
+ },
93
+ "msi": {
94
+ "type": "string",
95
+ "metadata": {
96
+ "description": "Managed service identity resource id."
97
+ }
98
+ },
99
+ "nsg": {
100
+ "type": "string",
101
+ "metadata": {
102
+ "description": "Network security group resource id."
103
+ }
104
+ },
105
+ "subnet": {
106
+ "type": "string",
107
+ "metadata": {
108
+ "description": "Subnet resource id."
109
+ }
110
+ },
111
+ "enableAcceleratedNetworking": {
112
+ "type": "bool",
113
+ "defaultValue": false,
114
+ "metadata": {
115
+ "description": "Whether to enable accelerated networking."
116
+ }
117
+ }
118
+ },
119
+ "variables": {
120
+ "location": "[resourceGroup().location]",
121
+ "networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]",
122
+ "networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]",
123
+ "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]",
124
+ "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]",
125
+ "osDiskType": "Standard_LRS",
126
+ "publicIpAddressName": "[concat(parameters('vmName'), '-ip')]"
127
+ },
128
+ "resources": [
129
+ {
130
+ "type": "Microsoft.Network/networkInterfaces",
131
+ "apiVersion": "2020-06-01",
132
+ "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]",
133
+ "location": "[variables('location')]",
134
+ "dependsOn": [
135
+ "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]"
136
+ ],
137
+ "copy": {
138
+ "name": "NICPublicCopy",
139
+ "count": "[parameters('vmCount')]"
140
+ },
141
+ "properties": {
142
+ "ipConfigurations": [
143
+ {
144
+ "name": "[variables('networkIpConfig')]",
145
+ "properties": {
146
+ "subnet": {
147
+ "id": "[parameters('subnet')]"
148
+ },
149
+ "privateIPAllocationMethod": "Dynamic",
150
+ "publicIpAddress": {
151
+ "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]"
152
+ }
153
+ }
154
+ }
155
+ ],
156
+ "networkSecurityGroup": {
157
+ "id": "[parameters('nsg')]"
158
+ },
159
+ "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
160
+ },
161
+ "condition": "[parameters('provisionPublicIp')]"
162
+ },
163
+ {
164
+ "type": "Microsoft.Network/networkInterfaces",
165
+ "apiVersion": "2020-06-01",
166
+ "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]",
167
+ "location": "[variables('location')]",
168
+ "copy": {
169
+ "name": "NICPrivateCopy",
170
+ "count": "[parameters('vmCount')]"
171
+ },
172
+ "properties": {
173
+ "ipConfigurations": [
174
+ {
175
+ "name": "[variables('networkIpConfig')]",
176
+ "properties": {
177
+ "subnet": {
178
+ "id": "[parameters('subnet')]"
179
+ },
180
+ "privateIPAllocationMethod": "Dynamic"
181
+ }
182
+ }
183
+ ],
184
+ "networkSecurityGroup": {
185
+ "id": "[parameters('nsg')]"
186
+ },
187
+ "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]"
188
+ },
189
+ "condition": "[not(parameters('provisionPublicIp'))]"
190
+ },
191
+ {
192
+ "type": "Microsoft.Network/publicIpAddresses",
193
+ "apiVersion": "2019-02-01",
194
+ "name": "[concat(variables('publicIpAddressName'), copyIndex())]",
195
+ "location": "[variables('location')]",
196
+ "properties": {
197
+ "publicIpAllocationMethod": "Static",
198
+ "publicIPAddressVersion": "IPv4"
199
+ },
200
+ "copy": {
201
+ "name": "PublicIpCopy",
202
+ "count": "[parameters('vmCount')]"
203
+ },
204
+ "sku": {
205
+ "name": "Basic",
206
+ "tier": "Regional"
207
+ },
208
+ "condition": "[parameters('provisionPublicIp')]"
209
+ },
210
+ {
211
+ "type": "Microsoft.Compute/virtualMachines",
212
+ "apiVersion": "2019-03-01",
213
+ "name": "[concat(parameters('vmName'), copyIndex())]",
214
+ "location": "[variables('location')]",
215
+ "dependsOn": [
216
+ "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]"
217
+ ],
218
+ "copy": {
219
+ "name": "VmCopy",
220
+ "count": "[parameters('vmCount')]"
221
+ },
222
+ "tags": "[parameters('vmTags')]",
223
+ "properties": {
224
+ "hardwareProfile": {
225
+ "vmSize": "[parameters('vmSize')]"
226
+ },
227
+ "storageProfile": {
228
+ "osDisk": {
229
+ "createOption": "fromImage",
230
+ "managedDisk": {
231
+ "storageAccountType": "[variables('osDiskType')]"
232
+ }
233
+ },
234
+ "imageReference": {
235
+ "publisher": "[parameters('imagePublisher')]",
236
+ "offer": "[parameters('imageOffer')]",
237
+ "sku": "[parameters('imageSku')]",
238
+ "version": "[parameters('imageVersion')]"
239
+ }
240
+ },
241
+ "networkProfile": {
242
+ "networkInterfaces": [
243
+ {
244
+ "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]"
245
+ }
246
+ ]
247
+ },
248
+ "osProfile": {
249
+ "computerName": "[concat(parameters('vmName'), copyIndex())]",
250
+ "adminUsername": "[parameters('adminUsername')]",
251
+ "adminPassword": "[parameters('publicKey')]",
252
+ "linuxConfiguration": {
253
+ "disablePasswordAuthentication": true,
254
+ "ssh": {
255
+ "publicKeys": [
256
+ {
257
+ "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]",
258
+ "keyData": "[parameters('publicKey')]"
259
+ }
260
+ ]
261
+ }
262
+ }
263
+ },
264
+ "priority": "[parameters('priority')]",
265
+ "evictionPolicy": "[if(equals(parameters('priority'), 'Spot'), parameters('evictionPolicy'), '')]",
266
+ "billingProfile": "[parameters('billingProfile')]"
267
+ },
268
+ "identity": {
269
+ "type": "UserAssigned",
270
+ "userAssignedIdentities": {
271
+ "[parameters('msi')]": {
272
+ }
273
+ }
274
+ }
275
+ }
276
+ ],
277
+ "outputs": {
278
+ "publicIp": {
279
+ "type": "array",
280
+ "copy": {
281
+ "count": "[parameters('vmCount')]",
282
+ "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]"
283
+ },
284
+ "condition": "[parameters('provisionPublicIp')]"
285
+ },
286
+ "privateIp": {
287
+ "type": "array",
288
+ "copy": {
289
+ "count": "[parameters('vmCount')]",
290
+ "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]"
291
+ }
292
+ }
293
+ }
294
+ }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import random
4
+ from hashlib import sha256
5
+ from pathlib import Path
6
+ from typing import Any, Callable
7
+
8
+ from azure.common.credentials import get_cli_profile
9
+ from azure.identity import AzureCliCredential
10
+ from azure.mgmt.resource import ResourceManagementClient
11
+ from azure.mgmt.resource.resources.models import DeploymentMode
12
+
13
# Length of the hex suffix (taken from a SHA-256 of the resource group name)
# appended to resource names to keep clusters distinct; see
# _configure_resource_group below.
UNIQUE_ID_LEN = 4

logger = logging.getLogger(__name__)
16
+
17
+
18
def get_azure_sdk_function(client: Any, function_name: str) -> Callable:
    """Retrieve a callable function from an Azure SDK client object.

    Newer versions of the various client SDKs renamed function names to
    have a ``begin_`` prefix. This function supports both the old and new
    versions of the SDK by first trying the old name and falling back to
    the prefixed new name.

    Args:
        client: Azure SDK client (or operations group) object.
        function_name: Operation name without the ``begin_`` prefix.

    Returns:
        The resolved bound callable.

    Raises:
        AttributeError: If neither ``function_name`` nor
            ``begin_{function_name}`` exists on ``client``.
    """
    func = getattr(
        client, function_name, getattr(client, f"begin_{function_name}", None)
    )
    if func is None:
        # Bug fix: the original passed ``obj={client.__name__}`` which (a)
        # wrapped the value in a set literal and (b) assumed the *instance*
        # has a __name__ attribute, so the error path itself raised a
        # confusing secondary AttributeError. Use the type name instead.
        raise AttributeError(
            "'{obj}' object has no {func} or begin_{func} attribute".format(
                obj=type(client).__name__, func=function_name
            )
        )
    return func
36
+
37
+
38
def bootstrap_azure(config):
    """Fill in Azure-specific fields of the autoscaler config.

    Runs key-pair configuration first, then resource-group/deployment
    configuration, returning the augmented config.
    """
    return _configure_resource_group(_configure_key_pair(config))
42
+
43
+
44
def _configure_resource_group(config):
    """Create/update the cluster's Azure resource group and base deployment.

    Deploys ``azure-config-template.json`` (vnet/subnet, NSG, MSI) into the
    resource group named in ``config["provider"]`` and writes the resulting
    resource ids (``msi``, ``nsg``, ``subnet``) back into the provider
    config for later VM creation.
    """
    # TODO: look at availability sets
    # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
    subscription_id = config["provider"].get("subscription_id")
    if subscription_id is None:
        # Fall back to the subscription selected via `az account`.
        subscription_id = get_cli_profile().get_subscription_id()
    resource_client = ResourceManagementClient(AzureCliCredential(), subscription_id)
    config["provider"]["subscription_id"] = subscription_id
    logger.info("Using subscription id: %s", subscription_id)

    assert (
        "resource_group" in config["provider"]
    ), "Provider config must include resource_group field"
    resource_group = config["provider"]["resource_group"]

    assert (
        "location" in config["provider"]
    ), "Provider config must include location field"
    params = {"location": config["provider"]["location"]}

    if "tags" in config["provider"]:
        params["tags"] = config["provider"]["tags"]

    logger.info("Creating/Updating resource group: %s", resource_group)
    rg_create_or_update = get_azure_sdk_function(
        client=resource_client.resource_groups, function_name="create_or_update"
    )
    rg_create_or_update(resource_group_name=resource_group, parameters=params)

    # load the template file shipped next to this module
    current_path = Path(__file__).parent
    template_path = current_path.joinpath("azure-config-template.json")
    with open(template_path, "r") as template_fp:
        template = json.load(template_fp)

    logger.info("Using cluster name: %s", config["cluster_name"])

    # set unique id for resources in this cluster; deterministic (hash of the
    # resource group) unless the user provides one explicitly
    unique_id = config["provider"].get("unique_id")
    if unique_id is None:
        hasher = sha256()
        hasher.update(config["provider"]["resource_group"].encode("utf-8"))
        unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN]
    else:
        unique_id = str(unique_id)
    config["provider"]["unique_id"] = unique_id
    logger.info("Using unique id: %s", unique_id)
    cluster_id = "{}-{}".format(config["cluster_name"], unique_id)

    subnet_mask = config["provider"].get("subnet_mask")
    if subnet_mask is None:
        # choose a random subnet, skipping most common value of 0;
        # seeded with unique_id so the choice is stable across runs
        random.seed(unique_id)
        subnet_mask = "10.{}.0.0/16".format(random.randint(1, 254))
    logger.info("Using subnet mask: %s", subnet_mask)

    # Copy over properties from existing subnet.
    # Addresses issue (https://github.com/Azure/azure-quickstart-templates/issues/2786)
    # where existing subnet properties will get overwritten unless explicitly specified
    # during multiple deployments even if vnet/subnet do not change.
    # May eventually be fixed by passing empty subnet list if they already exist:
    # https://techcommunity.microsoft.com/t5/azure-networking-blog/azure-virtual-network-now-supports-updates-without-subnet/ba-p/4067952
    list_by_rg = get_azure_sdk_function(
        client=resource_client.resources, function_name="list_by_resource_group"
    )
    existing_vnets = list(
        list_by_rg(
            resource_group,
            f"substringof('{unique_id}', name) and "
            "resourceType eq 'Microsoft.Network/virtualNetworks'",
        )
    )
    if len(existing_vnets) > 0:
        vnid = existing_vnets[0].id
        get_by_id = get_azure_sdk_function(
            client=resource_client.resources, function_name="get_by_id"
        )
        subnet = get_by_id(vnid, resource_client.DEFAULT_API_VERSION).properties[
            "subnets"
        ][0]
        # Patch the template's vnet resource in place with the live subnet
        # properties so the redeploy does not clobber them.
        template_vnet = next(
            (
                rs
                for rs in template["resources"]
                if rs["type"] == "Microsoft.Network/virtualNetworks"
            ),
            None,
        )
        if template_vnet is not None:
            template_subnets = template_vnet["properties"].get("subnets")
            if template_subnets is not None:
                template_subnets[0]["properties"].update(subnet["properties"])

    # Get or create an MSI name and resource group.
    # Defaults to current resource group if not provided.
    use_existing_msi = (
        "msi_name" in config["provider"] and "msi_resource_group" in config["provider"]
    )
    msi_resource_group = config["provider"].get("msi_resource_group", resource_group)
    msi_name = config["provider"].get("msi_name", f"ray-{cluster_id}-msi")
    logger.info(
        "Using msi_name: %s from msi_resource_group: %s", msi_name, msi_resource_group
    )

    parameters = {
        "properties": {
            "mode": DeploymentMode.incremental,
            "template": template,
            "parameters": {
                "subnet": {"value": subnet_mask},
                "clusterId": {"value": cluster_id},
                "msiName": {"value": msi_name},
                "msiResourceGroup": {"value": msi_resource_group},
                "createMsi": {"value": not use_existing_msi},
            },
        }
    }

    create_or_update = get_azure_sdk_function(
        client=resource_client.deployments, function_name="create_or_update"
    )
    # Blocks until the ARM deployment completes, then reads its outputs.
    outputs = (
        create_or_update(
            resource_group_name=resource_group,
            deployment_name="ray-config",
            parameters=parameters,
        )
        .result()
        .properties.outputs
    )

    # append output resource ids to be used with vm creation
    config["provider"]["msi"] = outputs["msi"]["value"]
    config["provider"]["nsg"] = outputs["nsg"]["value"]
    config["provider"]["subnet"] = outputs["subnet"]["value"]

    return config
181
+
182
+
183
+ def _configure_key_pair(config):
184
+ ssh_user = config["auth"]["ssh_user"]
185
+ public_key = None
186
+ # search if the keys exist
187
+ for key_type in ["ssh_private_key", "ssh_public_key"]:
188
+ try:
189
+ key_path = Path(config["auth"][key_type]).expanduser()
190
+ except KeyError:
191
+ raise Exception("Config must define {}".format(key_type))
192
+ except TypeError:
193
+ raise Exception("Invalid config value for {}".format(key_type))
194
+
195
+ assert key_path.is_file(), "Could not find ssh key: {}".format(key_path)
196
+
197
+ if key_type == "ssh_public_key":
198
+ with open(key_path, "r") as f:
199
+ public_key = f.read()
200
+
201
+ for node_type in config["available_node_types"].values():
202
+ azure_arm_parameters = node_type["node_config"].setdefault(
203
+ "azure_arm_parameters", {}
204
+ )
205
+ azure_arm_parameters["adminUsername"] = ssh_user
206
+ azure_arm_parameters["publicKey"] = public_key
207
+
208
+ return config
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import time
4
+ from concurrent.futures import Future, ThreadPoolExecutor
5
+ from pathlib import Path
6
+ from threading import RLock
7
+ from uuid import uuid4
8
+
9
+ from azure.core.exceptions import ResourceNotFoundError
10
+ from azure.identity import DefaultAzureCredential
11
+ from azure.mgmt.compute import ComputeManagementClient
12
+ from azure.mgmt.network import NetworkManagementClient
13
+ from azure.mgmt.resource import ResourceManagementClient
14
+ from azure.mgmt.resource.resources.models import DeploymentMode
15
+
16
+ from ray.autoscaler._private._azure.config import (
17
+ bootstrap_azure,
18
+ get_azure_sdk_function,
19
+ )
20
+ from ray.autoscaler._private.constants import (
21
+ AUTOSCALER_NODE_START_WAIT_S,
22
+ AUTOSCALER_NODE_TERMINATE_WAIT_S,
23
+ MAX_PARALLEL_SHUTDOWN_WORKERS,
24
+ )
25
+ from ray.autoscaler.node_provider import NodeProvider
26
+ from ray.autoscaler.tags import (
27
+ NODE_KIND_HEAD,
28
+ TAG_RAY_CLUSTER_NAME,
29
+ TAG_RAY_LAUNCH_CONFIG,
30
+ TAG_RAY_NODE_KIND,
31
+ TAG_RAY_NODE_NAME,
32
+ TAG_RAY_USER_NODE_TYPE,
33
+ )
34
+
35
# Limits used when generating VM names.
# NOTE(review): their consumers are outside this chunk — confirm usage
# before changing either value.
VM_NAME_MAX_LEN = 64
UNIQUE_ID_LEN = 4

logger = logging.getLogger(__name__)
# Quiet Azure's per-request HTTP logging, which is emitted at INFO level.
azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
azure_logger.setLevel(logging.WARNING)
41
+
42
+
43
def synchronized(f):
    """Decorator serializing method calls through ``self.lock``.

    The decorated method acquires the instance's ``lock`` attribute for the
    duration of the call; the lock is released even if ``f`` raises.
    """

    def wrapper(self, *args, **kwargs):
        # `with` is equivalent to acquire/try/finally-release.
        with self.lock:
            return f(self, *args, **kwargs)

    return wrapper
52
+
53
+
54
class AzureNodeProvider(NodeProvider):
    """Node Provider for Azure

    This provider assumes Azure credentials are set by running ``az login``
    and the default subscription is configured through ``az account``
    or set in the ``provider`` field of the autoscaler configuration.

    Nodes may be in one of three states: {pending, running, terminated}. Nodes
    appear immediately once started by ``create_node``, and transition
    immediately to terminated when ``terminate_node`` is called.
    """

    def __init__(self, provider_config, cluster_name):
        """Create Azure SDK clients and the local node/termination caches.

        Args:
            provider_config: ``provider`` section of the autoscaler config;
                must contain ``subscription_id`` and ``resource_group``.
            cluster_name: Name of the Ray cluster being managed.
        """
        NodeProvider.__init__(self, provider_config, cluster_name)
        subscription_id = provider_config["subscription_id"]
        self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
        credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
        self.compute_client = ComputeManagementClient(credential, subscription_id)
        self.network_client = NetworkManagementClient(credential, subscription_id)
        self.resource_client = ResourceManagementClient(credential, subscription_id)

        # Guards cache updates; used via the @synchronized decorator below.
        self.lock = RLock()

        # cache node objects
        self.cached_nodes = {}

        # Cache terminating node operations
        self.terminating_nodes: dict[str, Future] = {}
        self.termination_executor = ThreadPoolExecutor(
            max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS
        )

    @synchronized
    def _get_filtered_nodes(self, tag_filters):
        """Return metadata for cluster VMs whose tags match ``tag_filters``.

        Side effects: refreshes ``self.cached_nodes`` from the Azure API and
        prunes finished entries from ``self.terminating_nodes``.
        """
        # add cluster name filter to only get nodes from this cluster
        cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name}

        def match_tags(tags):
            for k, v in cluster_tag_filters.items():
                if tags.get(k) != v:
                    return False
            return True

        vms = self.compute_client.virtual_machines.list(
            resource_group_name=self.provider_config["resource_group"]
        )

        nodes = [self._extract_metadata(vm) for vm in vms]
        self.cached_nodes = {node["name"]: node for node in nodes}

        # Update terminating nodes list by removing nodes that
        # have finished termination.
        self.terminating_nodes = {
            k: v for k, v in self.terminating_nodes.items() if not v.done()
        }

        return {k: v for k, v in self.cached_nodes.items() if match_tags(v["tags"])}

    def _extract_metadata(self, vm):
        """Build a metadata dict (name, tags, power status, NIC/IP info)
        for the given Azure VM model object."""
        # get tags
        metadata = {"name": vm.name, "tags": vm.tags, "status": ""}

        # get status
        resource_group = self.provider_config["resource_group"]
        try:
            instance = self.compute_client.virtual_machines.instance_view(
                resource_group_name=resource_group, vm_name=vm.name
            ).as_dict()
        except ResourceNotFoundError:
            # VM disappeared between list() and instance_view(); return the
            # partial metadata with an empty status.
            return metadata

        for status in instance["statuses"]:
            # If ProvisioningState is "failed" (e.g.,
            # ProvisioningState/failed/RetryableError), we can get a third
            # string here, so we need to limit to the first two outputs.
            code, state = status["code"].split("/")[:2]
            # skip provisioning status
            if code == "PowerState":
                metadata["status"] = state
                break

        # get ip data
        nic_id = vm.network_profile.network_interfaces[0].id
        metadata["nic_name"] = nic_id.split("/")[-1]
        nic = self.network_client.network_interfaces.get(
            resource_group_name=resource_group,
            network_interface_name=metadata["nic_name"],
        )
        ip_config = nic.ip_configurations[0]

        # Get public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        if not self.provider_config.get("use_internal_ips", False) or (
            self.provider_config.get("use_external_head_ip", False)
            and metadata["tags"][TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        ):
            public_ip_id = ip_config.public_ip_address.id
            metadata["public_ip_name"] = public_ip_id.split("/")[-1]
            public_ip = self.network_client.public_ip_addresses.get(
                resource_group_name=resource_group,
                public_ip_address_name=metadata["public_ip_name"],
            )
            metadata["external_ip"] = public_ip.ip_address

        metadata["internal_ip"] = ip_config.private_ip_address

        return metadata

    def stopped_nodes(self, tag_filters):
        """Return a list of stopped node ids filtered by the specified tags dict."""
        # Azure reports stopped-and-released VMs as "deallocating"/"deallocated".
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        return [k for k, v in nodes.items() if v["status"].startswith("deallocat")]

    def non_terminated_nodes(self, tag_filters):
        """Return a list of node ids filtered by the specified tags dict.

        This list must not include terminated nodes. For performance reasons,
        providers are allowed to cache the result of a call to nodes() to
        serve single-node queries (e.g. is_running(node_id)). This means that
        nodes() must be called again to refresh results.

        Examples:
            >>> from ray.autoscaler.tags import TAG_RAY_NODE_KIND
            >>> provider = ... # doctest: +SKIP
            >>> provider.non_terminated_nodes( # doctest: +SKIP
            ...     {TAG_RAY_NODE_KIND: "worker"})
            ["node-1", "node-2"]
        """
        nodes = self._get_filtered_nodes(tag_filters=tag_filters)
        # Nodes with an in-flight termination future are excluded even if
        # Azure has not yet reported them as deallocated.
        return [
            k
            for k, v in nodes.items()
            if not v["status"].startswith("deallocat") or k in self.terminating_nodes
        ]

    def is_running(self, node_id):
        """Return whether the specified node is running."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"] == "running"

    def is_terminated(self, node_id):
        """Return whether the specified node is terminated."""
        # always get current status
        node = self._get_node(node_id=node_id)
        return node["status"].startswith("deallocat")

    def node_tags(self, node_id):
        """Returns the tags of the given node (string dict)."""
        return self._get_cached_node(node_id=node_id)["tags"]

    def external_ip(self, node_id):
        """Returns the external ip of the given node."""
        # Fall back to a fresh API lookup when the cached entry has no IP yet.
        ip = (
            self._get_cached_node(node_id=node_id)["external_ip"]
            or self._get_node(node_id=node_id)["external_ip"]
        )
        return ip

    def internal_ip(self, node_id):
        """Returns the internal ip (Ray ip) of the given node."""
        # Fall back to a fresh API lookup when the cached entry has no IP yet.
        ip = (
            self._get_cached_node(node_id=node_id)["internal_ip"]
            or self._get_node(node_id=node_id)["internal_ip"]
        )
        return ip

    def create_node(self, node_config, tags, count):
        """Create ``count`` nodes, restarting matching deallocated VMs first
        when ``cache_stopped_nodes`` is enabled; any remainder is created
        fresh via ``_create_node``."""
        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            # Only reuse a stopped VM if these tags match, so a node of the
            # wrong type/launch config is never resurrected for this request.
            VALIDITY_TAGS = [
                TAG_RAY_CLUSTER_NAME,
                TAG_RAY_NODE_KIND,
                TAG_RAY_LAUNCH_CONFIG,
                TAG_RAY_USER_NODE_TYPE,
            ]
            filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
            reuse_nodes = self.stopped_nodes(filters)[:count]
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
            )
            start = get_azure_sdk_function(
                client=self.compute_client.virtual_machines, function_name="start"
            )
            for node_id in reuse_nodes:
                start(resource_group_name=resource_group, vm_name=node_id).wait()
                self.set_node_tags(node_id, tags)
            count -= len(reuse_nodes)

        if count:
            self._create_node(node_config, tags, count)

    def _create_node(self, node_config, tags, count):
        """Creates a number of nodes within the namespace.

        Deploys ``count`` VMs via an ARM template deployment
        (``azure-vm-template.json``) and blocks until the deployment
        completes or times out.
        """
        resource_group = self.provider_config["resource_group"]

        # load the template file
        current_path = Path(__file__).parent
        template_path = current_path.joinpath("azure-vm-template.json")
        with open(template_path, "r") as template_fp:
            template = json.load(template_fp)

        # get the tags
        config_tags = node_config.get("tags", {}).copy()
        config_tags.update(tags)
        config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

        # Azure VM names are length-limited; truncate after appending a
        # short random suffix for uniqueness.
        vm_name = "{node}-{unique_id}-{vm_id}".format(
            node=config_tags.get(TAG_RAY_NODE_NAME, "node"),
            unique_id=self.provider_config["unique_id"],
            vm_id=uuid4().hex[:UNIQUE_ID_LEN],
        )[:VM_NAME_MAX_LEN]

        template_params = node_config["azure_arm_parameters"].copy()
        template_params["vmName"] = vm_name
        # Provision public IP if not using internal IPs or if this is the
        # head node and use_external_head_ip is True
        template_params["provisionPublicIp"] = not self.provider_config.get(
            "use_internal_ips", False
        ) or (
            self.provider_config.get("use_external_head_ip", False)
            and config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
        )
        template_params["vmTags"] = config_tags
        template_params["vmCount"] = count
        template_params["msi"] = self.provider_config["msi"]
        template_params["nsg"] = self.provider_config["nsg"]
        template_params["subnet"] = self.provider_config["subnet"]

        parameters = {
            "properties": {
                "mode": DeploymentMode.incremental,
                "template": template,
                "parameters": {
                    key: {"value": value} for key, value in template_params.items()
                },
            }
        }

        # TODO: we could get the private/public ips back directly
        create_or_update = get_azure_sdk_function(
            client=self.resource_client.deployments, function_name="create_or_update"
        )
        create_or_update(
            resource_group_name=resource_group,
            deployment_name=vm_name,
            parameters=parameters,
        ).wait(timeout=AUTOSCALER_NODE_START_WAIT_S)

    @synchronized
    def set_node_tags(self, node_id, tags):
        """Sets the tag values (string dict) for the specified node."""
        node_tags = self._get_cached_node(node_id)["tags"]
        node_tags.update(tags)
        update = get_azure_sdk_function(
            client=self.compute_client.virtual_machines, function_name="update"
        )
        update(
            resource_group_name=self.provider_config["resource_group"],
            vm_name=node_id,
            parameters={"tags": node_tags},
        )
        # Keep the local cache consistent with the tags just pushed to Azure.
        self.cached_nodes[node_id]["tags"] = node_tags

    def terminate_node(self, node_id):
        """Terminates the specified node. This will delete the VM and
        associated resources (NIC, IP, Storage) for the specified node."""

        resource_group = self.provider_config["resource_group"]

        if self.cache_stopped_nodes:
            try:
                # stop machine and leave all resources
                logger.info(
                    f"Stopping instance {node_id}"
                    "(to fully terminate instead, "
                    "set `cache_stopped_nodes: False` "
                    "under `provider` in the cluster configuration)"
                )
                stop = get_azure_sdk_function(
                    client=self.compute_client.virtual_machines,
                    function_name="deallocate",
                )
                stop(resource_group_name=resource_group, vm_name=node_id)
            except Exception as e:
                logger.warning("Failed to stop VM: {}".format(e))

        # If node_id is in terminating nodes dict, it's already terminating
        # Otherwise, kick off termination and add it to the dict
        elif node_id not in self.terminating_nodes:
            self.terminating_nodes[node_id] = self.termination_executor.submit(
                self._delete_node_and_resources, resource_group, node_id
            )

    def _delete_node_and_resources(self, resource_group, node_id):
        """Delete the VM plus its dependent disks, NICs and public IPs.

        Runs on ``self.termination_executor``; every deletion is
        best-effort — failures are logged and the remaining resources are
        still attempted.
        """
        try:
            vm = self.compute_client.virtual_machines.get(
                resource_group_name=resource_group, vm_name=node_id
            )
        except ResourceNotFoundError as e:
            # Node no longer exists
            logger.warning("Failed to delete VM: {}".format(e))
            return

        # Gather dependent disks
        disks = set()
        if vm.storage_profile is not None and vm.storage_profile.data_disks is not None:
            for d in vm.storage_profile.data_disks:
                if d.name is not None:
                    disks.add(d.name)
        if (
            vm.storage_profile is not None
            and vm.storage_profile.os_disk is not None
            and vm.storage_profile.os_disk.name is not None
        ):
            disks.add(vm.storage_profile.os_disk.name)

        # Gather dependent NICs and public IPs
        nics = set()
        ips = set()
        if (
            vm.network_profile is not None
            and vm.network_profile.network_interfaces is not None
        ):
            for nint in vm.network_profile.network_interfaces:
                if nint.id is not None:
                    nic_name = nint.id.split("/")[-1]
                    nics.add(nic_name)
                    # Get public IP if not using internal IPs or if this is the
                    # head node and use_external_head_ip is True
                    if not self.provider_config.get("use_internal_ips", False) or (
                        self.provider_config.get("use_external_head_ip", False)
                        and vm.tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
                    ):
                        nic = self.network_client.network_interfaces.get(
                            resource_group_name=resource_group,
                            network_interface_name=nic_name,
                        )
                        if nic.ip_configurations is not None:
                            for ipc in nic.ip_configurations:
                                if ipc.public_ip_address.id is not None:
                                    ips.add(ipc.public_ip_address.id.split("/")[-1])

        # Delete VM
        # The VM must be gone before its NICs/disks can be released, so this
        # one blocks; the remaining deletions are polled against the same
        # overall deadline (started at `st`).
        st = time.monotonic()
        delete = get_azure_sdk_function(
            client=self.compute_client.virtual_machines,
            function_name="delete",
        )
        try:
            delete(resource_group_name=resource_group, vm_name=node_id).wait(
                timeout=AUTOSCALER_NODE_TERMINATE_WAIT_S
            )
        except Exception as e:
            logger.warning("Failed to delete VM: {}".format(e))

        # Delete disks (no need to wait for these, but gather the LROs for end)
        disk_lros = []
        delete = get_azure_sdk_function(
            client=self.compute_client.disks, function_name="delete"
        )
        for d in disks:
            try:
                disk_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        disk_name=d,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete disk: {}".format(e))

        # Delete NICs
        nic_lros = []
        delete = get_azure_sdk_function(
            client=self.network_client.network_interfaces, function_name="delete"
        )
        for n in nics:
            try:
                nic_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        network_interface_name=n,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete NIC: {}".format(e))

        # NICs must be fully deleted before their public IPs can be deleted,
        # so poll the NIC LROs (bounded by the overall deadline) first.
        while (
            not all(nlro.done() for nlro in nic_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

        # Delete Public IPs
        delete = get_azure_sdk_function(
            client=self.network_client.public_ip_addresses,
            function_name="delete",
        )
        ip_lros = []
        for ip in ips:
            try:
                ip_lros.append(
                    delete(
                        resource_group_name=resource_group,
                        public_ip_address_name=ip,
                    )
                )
            except Exception as e:
                logger.warning("Failed to delete public IP: {}".format(e))

        # Wait (bounded) for the remaining disk and IP deletions to finish.
        while (
            not all(dlro.done() for dlro in disk_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)
        while (
            not all(iplro.done() for iplro in ip_lros)
            and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S
        ):
            time.sleep(0.1)

    def _get_node(self, node_id):
        """Refresh the node cache from Azure and return metadata for
        ``node_id``. Raises KeyError if the node no longer exists."""
        self._get_filtered_nodes({})  # Side effect: updates cache
        return self.cached_nodes[node_id]

    def _get_cached_node(self, node_id):
        """Return cached metadata for ``node_id``, refreshing on a miss."""
        return self.cached_nodes.get(node_id) or self._get_node(node_id=node_id)

    @staticmethod
    def bootstrap_config(cluster_config):
        """Delegate cluster-config preparation to ``bootstrap_azure``."""
        return bootstrap_azure(cluster_config)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py ADDED
@@ -0,0 +1,1508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import math
4
+ import operator
5
+ import os
6
+ import queue
7
+ import subprocess
8
+ import threading
9
+ import time
10
+ from collections import Counter, defaultdict, namedtuple
11
+ from dataclasses import dataclass, field
12
+ from enum import Enum
13
+ from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Tuple, Union
14
+
15
+ import yaml
16
+
17
+ import ray
18
+ import ray._private.ray_constants as ray_constants
19
+ from ray.autoscaler._private.constants import (
20
+ AUTOSCALER_HEARTBEAT_TIMEOUT_S,
21
+ AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
22
+ AUTOSCALER_MAX_LAUNCH_BATCH,
23
+ AUTOSCALER_MAX_NUM_FAILURES,
24
+ AUTOSCALER_STATUS_LOG,
25
+ AUTOSCALER_UPDATE_INTERVAL_S,
26
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
27
+ DISABLE_NODE_UPDATERS_KEY,
28
+ FOREGROUND_NODE_LAUNCH_KEY,
29
+ WORKER_LIVENESS_CHECK_KEY,
30
+ )
31
+ from ray.autoscaler._private.event_summarizer import EventSummarizer
32
+ from ray.autoscaler._private.legacy_info_string import legacy_log_info_string
33
+ from ray.autoscaler._private.load_metrics import LoadMetrics
34
+ from ray.autoscaler._private.local.node_provider import (
35
+ LocalNodeProvider,
36
+ record_local_head_state_if_needed,
37
+ )
38
+ from ray.autoscaler._private.node_launcher import BaseNodeLauncher, NodeLauncher
39
+ from ray.autoscaler._private.node_provider_availability_tracker import (
40
+ NodeAvailabilitySummary,
41
+ NodeProviderAvailabilityTracker,
42
+ )
43
+ from ray.autoscaler._private.node_tracker import NodeTracker
44
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
45
+ from ray.autoscaler._private.providers import _get_node_provider
46
+ from ray.autoscaler._private.resource_demand_scheduler import (
47
+ ResourceDemandScheduler,
48
+ ResourceDict,
49
+ get_bin_pack_residual,
50
+ )
51
+ from ray.autoscaler._private.updater import NodeUpdaterThread
52
+ from ray.autoscaler._private.util import (
53
+ ConcurrentCounter,
54
+ NodeCount,
55
+ NodeID,
56
+ NodeIP,
57
+ NodeType,
58
+ NodeTypeConfigDict,
59
+ format_info_string,
60
+ hash_launch_conf,
61
+ hash_runtime_conf,
62
+ validate_config,
63
+ with_head_node_ip,
64
+ )
65
+ from ray.autoscaler.node_provider import NodeProvider
66
+ from ray.autoscaler.tags import (
67
+ NODE_KIND_HEAD,
68
+ NODE_KIND_UNMANAGED,
69
+ NODE_KIND_WORKER,
70
+ STATUS_UP_TO_DATE,
71
+ STATUS_UPDATE_FAILED,
72
+ TAG_RAY_FILE_MOUNTS_CONTENTS,
73
+ TAG_RAY_LAUNCH_CONFIG,
74
+ TAG_RAY_NODE_KIND,
75
+ TAG_RAY_NODE_STATUS,
76
+ TAG_RAY_RUNTIME_CONFIG,
77
+ TAG_RAY_USER_NODE_TYPE,
78
+ )
79
+ from ray.exceptions import RpcError
80
+
81
logger = logging.getLogger(__name__)

# Status of a node e.g. "up-to-date", see ray/autoscaler/tags.py
NodeStatus = str

# Tuple of modified fields for the given node_id returned by should_update
# that will be passed into a NodeUpdaterThread.
UpdateInstructions = namedtuple(
    "UpdateInstructions",
    ["node_id", "setup_commands", "ray_start_commands", "docker_config"],
)

# (node type config, number of nodes to launch, node type name) — the unit
# of work placed on the launch queue consumed by the NodeLauncher threads.
NodeLaunchData = Tuple[NodeTypeConfigDict, NodeCount, Optional[NodeType]]
+
95
+
96
@dataclass
class AutoscalerSummary:
    """Point-in-time summary of autoscaler state (nodes, launches, failures)
    used for status reporting."""

    # Count of active nodes per node type.
    active_nodes: Dict[NodeType, int]
    # Count of idle nodes per node type, when tracked.
    idle_nodes: Optional[Dict[NodeType, int]]
    # Nodes still coming up: (ip, node type, status string).
    pending_nodes: List[Tuple[NodeIP, NodeType, NodeStatus]]
    # In-flight launch requests per node type.
    pending_launches: Dict[NodeType, int]
    # Nodes whose setup/update failed: (ip, node type).
    failed_nodes: List[Tuple[NodeIP, NodeType]]
    node_availability_summary: NodeAvailabilitySummary = field(
        default_factory=lambda: NodeAvailabilitySummary({})
    )
    # A dictionary of node IP to a list of reasons the node is not idle.
    node_activities: Optional[Dict[str, Tuple[NodeIP, List[str]]]] = None
    pending_resources: Dict[str, int] = field(default_factory=lambda: {})
    # A mapping from node name (the same key as `usage_by_node`) to node type.
    # Optional for deployment modes which have the concept of node types and
    # backwards compatibility.
    node_type_mapping: Optional[Dict[str, str]] = None
    # Whether the autoscaler summary is v1 or v2.
    legacy: bool = False
+
116
+
117
class NonTerminatedNodes:
    """Snapshot of the cluster's non-terminated nodes, organized by kind."""

    def __init__(self, provider: NodeProvider):
        fetch_started = time.time()
        # Every node the provider does not consider terminated.
        self.all_node_ids = provider.non_terminated_nodes({})

        # Managed worker nodes (node kind "worker"):
        self.worker_ids: List[NodeID] = []
        # The head node (node kind "head")
        self.head_id: Optional[NodeID] = None

        for node_id in self.all_node_ids:
            kind = provider.node_tags(node_id)[TAG_RAY_NODE_KIND]
            if kind == NODE_KIND_WORKER:
                self.worker_ids.append(node_id)
            elif kind == NODE_KIND_HEAD:
                self.head_id = node_id

        # Note: For typical use-cases, self.all_node_ids == self.worker_ids +
        # [self.head_id]. The difference being in the case of unmanaged nodes.

        # Record how long the non_terminated_nodes call took. That call
        # typically maps to a "describe"/"list" request on the cluster
        # manager, which can be quite expensive. Processing time is included
        # because some clients paginate and evaluate the underlying API
        # calls lazily.
        self.non_terminated_nodes_time = time.time() - fetch_started
        logger.info(
            f"The autoscaler took {round(self.non_terminated_nodes_time, 3)}"
            " seconds to fetch the list of non-terminated nodes."
        )

    def remove_terminating_nodes(self, terminating_nodes: List[NodeID]) -> None:
        """Drop nodes that are being terminated from the tracked id lists."""
        self.worker_ids = [
            node_id
            for node_id in self.worker_ids
            if node_id not in terminating_nodes
        ]
        self.all_node_ids = [
            node_id
            for node_id in self.all_node_ids
            if node_id not in terminating_nodes
        ]
160
+
161
+
162
# Whether a worker should be kept based on the min_workers and
# max_workers constraints.
class KeepOrTerminate(Enum):
    # should keep the worker
    keep = 1
    # should terminate the worker
    terminate = 2
    # the worker can be terminated if needed
    decide_later = 3
169
+
170
+
171
+ class StandardAutoscaler:
172
+ """The autoscaling control loop for a Ray cluster.
173
+
174
+ There are two ways to start an autoscaling cluster: manually by running
175
+ `ray start --head --autoscaling-config=/path/to/config.yaml` on a instance
176
+ that has permission to launch other instances, or you can also use `ray up
177
+ /path/to/config.yaml` from your laptop, which will configure the right
178
+ AWS/Cloud roles automatically. See the Ray documentation
179
+ (https://docs.ray.io/en/latest/) for a full definition of autoscaling behavior.
180
+ StandardAutoscaler's `update` method is periodically called in
181
+ `monitor.py`'s monitoring loop.
182
+
183
+ StandardAutoscaler is also used to bootstrap clusters (by adding workers
184
+ until the cluster size that can handle the resource demand is met).
185
+ """
186
+
187
    def __init__(
        self,
        # TODO(ekl): require config reader to be a callable always.
        config_reader: Union[str, Callable[[], dict]],
        load_metrics: LoadMetrics,
        gcs_client: "ray._raylet.GcsClient",
        session_name: Optional[str] = None,
        max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
        max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
        max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
        process_runner: Any = subprocess,
        update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
        prefix_cluster_info: bool = False,
        event_summarizer: Optional[EventSummarizer] = None,
        prom_metrics: Optional[AutoscalerPrometheusMetrics] = None,
    ):
        """Create a StandardAutoscaler.

        Args:
            config_reader: Path to a Ray Autoscaler YAML, or a function to read
                and return the latest config.
            load_metrics: Provides metrics for the Ray cluster.
            session_name: The session name of the cluster this autoscaler
                is deployed.
            max_launch_batch: Max number of nodes to launch in one request.
            max_concurrent_launches: Max number of nodes that can be
                concurrently launched. This value and `max_launch_batch`
                determine the number of batches that are used to launch nodes.
            max_failures: Number of failures that the autoscaler will tolerate
                before exiting.
            process_runner: Subproc-like interface used by the CommandRunner.
            update_interval_s: Seconds between running the autoscaling loop.
            prefix_cluster_info: Whether to add the cluster name to info strs.
            event_summarizer: Utility to consolidate duplicated messages.
            prom_metrics: Prometheus metrics for autoscaler-related operations.
            gcs_client: client for interactions with the GCS. Used to drain nodes
                before termination.
        """

        if isinstance(config_reader, str):
            # Auto wrap with file reader.
            def read_fn():
                with open(config_reader) as f:
                    new_config = yaml.safe_load(f.read())
                return new_config

            self.config_reader = read_fn
        else:
            self.config_reader = config_reader

        self.node_provider_availability_tracker = NodeProviderAvailabilityTracker()
        # Prefix each line of info string with cluster name if True
        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        # Keep this before self.reset (if an exception occurs in reset
        # then prom_metrics must be instantitiated to increment the
        # exception counter)
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )  # noqa
        self.resource_demand_scheduler = None
        # NOTE(review): reset() presumably populates self.config (and
        # self.provider) read below — confirm against reset()'s definition.
        self.reset(errors_fatal=True)
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner
        self.event_summarizer = event_summarizer or EventSummarizer()

        # Map from node_id to NodeUpdater threads
        self.updaters: Dict[NodeID, NodeUpdaterThread] = {}
        self.num_failed_updates: Dict[NodeID, int] = defaultdict(int)
        self.num_successful_updates: Dict[NodeID, int] = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Keeps track of pending and running nodes
        self.non_terminated_nodes: Optional[NonTerminatedNodes] = None

        # Tracks nodes scheduled for termination
        self.nodes_to_terminate: List[NodeID] = []

        # Disable NodeUpdater threads if true.
        # Should be set to true in situations where another component, such as
        # a Kubernetes operator, is responsible for Ray setup on nodes.
        self.disable_node_updaters = self.config["provider"].get(
            DISABLE_NODE_UPDATERS_KEY, False
        )
        logger.info(f"{DISABLE_NODE_UPDATERS_KEY}:{self.disable_node_updaters}")

        # Disable launch config checking if true.
        # This is set in the fake_multinode situations where there isn't any
        # meaningful node "type" to enforce.
        self.disable_launch_config_check = self.config["provider"].get(
            DISABLE_LAUNCH_CONFIG_CHECK_KEY, False
        )
        logger.info(
            f"{DISABLE_LAUNCH_CONFIG_CHECK_KEY}:{self.disable_launch_config_check}"
        )

        # By default, the autoscaler launches nodes in batches asynchronously in
        # background threads.
        # When the following flag is set, that behavior is disabled, so that nodes
        # are launched in the main thread, all in one batch, blocking until all
        # NodeProvider.create_node calls have returned.
        self.foreground_node_launch = self.config["provider"].get(
            FOREGROUND_NODE_LAUNCH_KEY, False
        )
        logger.info(f"{FOREGROUND_NODE_LAUNCH_KEY}:{self.foreground_node_launch}")

        # By default, the autoscaler kills and/or tries to recover
        # a worker node if it hasn't produced a resource heartbeat in the last 30
        # seconds. The worker_liveness_check flag allows disabling this behavior in
        # settings where another component, such as a Kubernetes operator, is
        # responsible for healthchecks.
        self.worker_liveness_check = self.config["provider"].get(
            WORKER_LIVENESS_CHECK_KEY, True
        )
        logger.info(f"{WORKER_LIVENESS_CHECK_KEY}:{self.worker_liveness_check}")

        # Node launchers: either a single synchronous foreground launcher, or
        # a pool of background NodeLauncher threads fed by launch_queue.
        self.foreground_node_launcher: Optional[BaseNodeLauncher] = None
        self.launch_queue: Optional[queue.Queue[NodeLaunchData]] = None
        self.pending_launches = ConcurrentCounter()
        if self.foreground_node_launch:
            self.foreground_node_launcher = BaseNodeLauncher(
                provider=self.provider,
                pending=self.pending_launches,
                event_summarizer=self.event_summarizer,
                node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagree how to format this.
                session_name=session_name,
                node_types=self.available_node_types,
                prom_metrics=self.prom_metrics,
            )
        else:
            self.launch_queue = queue.Queue()
            # Enough launcher threads to cover max_concurrent_launches when
            # each thread handles at most max_launch_batch nodes.
            max_batches = math.ceil(max_concurrent_launches / float(max_launch_batch))
            for i in range(int(max_batches)):
                node_launcher = NodeLauncher(
                    provider=self.provider,
                    queue=self.launch_queue,
                    index=i,
                    pending=self.pending_launches,
                    event_summarizer=self.event_summarizer,
                    node_provider_availability_tracker=self.node_provider_availability_tracker,  # noqa: E501 Flake and black disagreee how to format this.
                    session_name=session_name,
                    node_types=self.available_node_types,
                    prom_metrics=self.prom_metrics,
                )
                node_launcher.daemon = True
                node_launcher.start()

        # NodeTracker maintains soft state to track the number of recently
        # failed nodes. It is best effort only.
        self.node_tracker = NodeTracker()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on different
        # platform and the expansion would result in wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        self.gcs_client = gcs_client

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)
        logger.info("StandardAutoscaler: {}".format(self.config))
360
+
361
+ @property
362
+ def all_node_types(self) -> Set[str]:
363
+ return self.config["available_node_types"].keys()
364
+
365
+ def update(self):
366
+ try:
367
+ self.reset(errors_fatal=False)
368
+ self._update()
369
+ except Exception as e:
370
+ self.prom_metrics.update_loop_exceptions.inc()
371
+ logger.exception("StandardAutoscaler: Error during autoscaling.")
372
+ self.num_failures += 1
373
+ if self.num_failures > self.max_failures:
374
+ logger.critical("StandardAutoscaler: Too many errors, abort.")
375
+ raise e
376
+
377
    def _update(self):
        """Run a single autoscaling iteration.

        Refreshes the provider's node list, enforces config constraints by
        terminating nodes, runs/recovers node updaters, asks the resource
        demand scheduler for nodes to launch, and launches them.  Calls are
        throttled to at most once per ``update_interval_s``.
        """
        # For type checking, assert that these objects have been instantiated.
        assert self.provider
        assert self.resource_demand_scheduler

        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now

        # Query the provider to update the list of non-terminated nodes
        self.non_terminated_nodes = NonTerminatedNodes(self.provider)

        # Back off the update if the provider says it's not safe to proceed.
        if not self.provider.safe_to_scale():
            logger.info(
                "Backing off of autoscaler update."
                f" Will try again in {self.update_interval_s} seconds."
            )
            return

        # This will accumulate the nodes we need to terminate.
        self.nodes_to_terminate = []

        # Update running nodes gauge
        num_workers = len(self.non_terminated_nodes.worker_ids)
        self.prom_metrics.running_workers.set(num_workers)

        # Remove from LoadMetrics the ips unknown to the NodeProvider.
        self.load_metrics.prune_active_ips(
            active_ips=[
                self.provider.internal_ip(node_id)
                for node_id in self.non_terminated_nodes.all_node_ids
            ]
        )

        # Update status strings
        if AUTOSCALER_STATUS_LOG:
            logger.info(self.info_string())
        legacy_log_info_string(self, self.non_terminated_nodes.worker_ids)

        # Scale-down first, so freshly freed capacity is reflected before we
        # compute nodes to launch.
        if not self.provider.is_readonly():
            self.terminate_nodes_to_enforce_config_constraints(now)

        if self.disable_node_updaters:
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.terminate_unhealthy_nodes(now)
        else:
            self.process_completed_updates()
            self.update_nodes()
            # Don't handle unhealthy nodes if the liveness check is disabled.
            # self.worker_liveness_check is True by default.
            if self.worker_liveness_check:
                self.attempt_to_recover_unhealthy_nodes(now)
            self.set_prometheus_updater_data()

        # Dict[NodeType, int], List[ResourceDict]
        to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch(
            self.non_terminated_nodes.all_node_ids,
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests(),
            node_availability_summary=self.node_provider_availability_tracker.summary(),
        )
        self._report_pending_infeasible(unfulfilled)

        if not self.provider.is_readonly():
            self.launch_required_nodes(to_launch)

        # Execute optional end-of-update logic.
        # Keep this method call at the end of autoscaler._update().
        self.provider.post_process()

        # Record the amount of time the autoscaler took for
        # this _update() iteration.
        update_time = time.time() - self.last_update_time
        logger.info(
            f"The autoscaler took {round(update_time, 3)}"
            " seconds to complete the update iteration."
        )
        self.prom_metrics.update_time.observe(update_time)
467
+ def terminate_nodes_to_enforce_config_constraints(self, now: float):
468
+ """Terminates nodes to enforce constraints defined by the autoscaling
469
+ config.
470
+
471
+ (1) Terminates nodes in excess of `max_workers`.
472
+ (2) Terminates nodes idle for longer than `idle_timeout_minutes`.
473
+ (3) Terminates outdated nodes,
474
+ namely nodes whose configs don't match `node_config` for the
475
+ relevant node type.
476
+
477
+ Avoids terminating non-outdated nodes required by
478
+ autoscaler.sdk.request_resources().
479
+ """
480
+ # For type checking, assert that these objects have been instantitiated.
481
+ assert self.non_terminated_nodes
482
+ assert self.provider
483
+
484
+ last_used = self.load_metrics.ray_nodes_last_used_time_by_ip
485
+
486
+ idle_timeout_s = 60 * self.config["idle_timeout_minutes"]
487
+
488
+ last_used_cutoff = now - idle_timeout_s
489
+
490
+ # Sort based on last used to make sure to keep min_workers that
491
+ # were most recently used. Otherwise, _keep_min_workers_of_node_type
492
+ # might keep a node that should be terminated.
493
+ sorted_node_ids = self._sort_based_on_last_used(
494
+ self.non_terminated_nodes.worker_ids, last_used
495
+ )
496
+
497
+ # Don't terminate nodes needed by request_resources()
498
+ nodes_not_allowed_to_terminate: FrozenSet[NodeID] = {}
499
+ if self.load_metrics.get_resource_requests():
500
+ nodes_not_allowed_to_terminate = (
501
+ self._get_nodes_needed_for_request_resources(sorted_node_ids)
502
+ )
503
+
504
+ # Tracks counts of nodes we intend to keep for each node type.
505
+ node_type_counts = defaultdict(int)
506
+
507
+ def keep_node(node_id: NodeID) -> None:
508
+ assert self.provider
509
+ # Update per-type counts.
510
+ tags = self.provider.node_tags(node_id)
511
+ if TAG_RAY_USER_NODE_TYPE in tags:
512
+ node_type = tags[TAG_RAY_USER_NODE_TYPE]
513
+ node_type_counts[node_type] += 1
514
+
515
+ # Nodes that we could terminate, if needed.
516
+ nodes_we_could_terminate: List[NodeID] = []
517
+
518
+ for node_id in sorted_node_ids:
519
+ # Make sure to not kill idle node types if the number of workers
520
+ # of that type is lower/equal to the min_workers of that type
521
+ # or it is needed for request_resources().
522
+ should_keep_or_terminate, reason = self._keep_worker_of_node_type(
523
+ node_id, node_type_counts
524
+ )
525
+ if should_keep_or_terminate == KeepOrTerminate.terminate:
526
+ self.schedule_node_termination(node_id, reason, logger.info)
527
+ continue
528
+ if (
529
+ should_keep_or_terminate == KeepOrTerminate.keep
530
+ or node_id in nodes_not_allowed_to_terminate
531
+ ) and self.launch_config_ok(node_id):
532
+ keep_node(node_id)
533
+ continue
534
+
535
+ node_ip = self.provider.internal_ip(node_id)
536
+
537
+ if node_ip in last_used and last_used[node_ip] < last_used_cutoff:
538
+ self.schedule_node_termination(node_id, "idle", logger.info)
539
+ # Get the local time of the node's last use as a string.
540
+ formatted_last_used_time = time.asctime(
541
+ time.localtime(last_used[node_ip])
542
+ )
543
+ logger.info(f"Node last used: {formatted_last_used_time}.")
544
+ # Note that the current time will appear in the log prefix.
545
+ elif not self.launch_config_ok(node_id):
546
+ self.schedule_node_termination(node_id, "outdated", logger.info)
547
+ else:
548
+ keep_node(node_id)
549
+ nodes_we_could_terminate.append(node_id)
550
+
551
+ # Terminate nodes if there are too many
552
+ num_workers = len(self.non_terminated_nodes.worker_ids)
553
+ num_extra_nodes_to_terminate = (
554
+ num_workers - len(self.nodes_to_terminate) - self.config["max_workers"]
555
+ )
556
+
557
+ if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
558
+ logger.warning(
559
+ "StandardAutoscaler: trying to terminate "
560
+ f"{num_extra_nodes_to_terminate} nodes, while only "
561
+ f"{len(nodes_we_could_terminate)} are safe to terminate."
562
+ " Inconsistent config is likely."
563
+ )
564
+ num_extra_nodes_to_terminate = len(nodes_we_could_terminate)
565
+
566
+ # If num_extra_nodes_to_terminate is negative or zero,
567
+ # we would have less than max_workers nodes after terminating
568
+ # nodes_to_terminate and we do not need to terminate anything else.
569
+ if num_extra_nodes_to_terminate > 0:
570
+ extra_nodes_to_terminate = nodes_we_could_terminate[
571
+ -num_extra_nodes_to_terminate:
572
+ ]
573
+ for node_id in extra_nodes_to_terminate:
574
+ self.schedule_node_termination(node_id, "max workers", logger.info)
575
+
576
+ self.terminate_scheduled_nodes()
577
+
578
+ def schedule_node_termination(
579
+ self, node_id: NodeID, reason_opt: Optional[str], logger_method: Callable
580
+ ) -> None:
581
+ # For type checking, assert that this object has been instantitiated.
582
+ assert self.provider
583
+
584
+ if reason_opt is None:
585
+ raise Exception("reason should be not None.")
586
+ reason: str = reason_opt
587
+ node_ip = self.provider.internal_ip(node_id)
588
+ # Log, record an event, and add node_id to nodes_to_terminate.
589
+ logger_method(
590
+ "StandardAutoscaler: "
591
+ f"Terminating the node with id {node_id}"
592
+ f" and ip {node_ip}."
593
+ f" ({reason})"
594
+ )
595
+ self.event_summarizer.add(
596
+ "Removing {} nodes of type "
597
+ + self._get_node_type(node_id)
598
+ + " ({}).".format(reason),
599
+ quantity=1,
600
+ aggregate=operator.add,
601
+ )
602
+ self.nodes_to_terminate.append(node_id)
603
+
604
+ def terminate_scheduled_nodes(self):
605
+ """Terminate scheduled nodes and clean associated autoscaler state."""
606
+ # For type checking, assert that these objects have been instantitiated.
607
+ assert self.provider
608
+ assert self.non_terminated_nodes
609
+
610
+ if not self.nodes_to_terminate:
611
+ return
612
+
613
+ # Drain the nodes
614
+ self.drain_nodes_via_gcs(self.nodes_to_terminate)
615
+ # Terminate the nodes
616
+ self.provider.terminate_nodes(self.nodes_to_terminate)
617
+ for node in self.nodes_to_terminate:
618
+ self.node_tracker.untrack(node)
619
+ self.prom_metrics.stopped_nodes.inc()
620
+
621
+ # Update internal node lists
622
+ self.non_terminated_nodes.remove_terminating_nodes(self.nodes_to_terminate)
623
+
624
+ self.nodes_to_terminate = []
625
+
626
    def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]):
        """Send an RPC request to the GCS to drain (prepare for termination)
        the nodes with the given node provider ids.

        Best-effort: failures are logged and recorded in Prometheus but never
        raised, so they cannot interrupt the autoscaler update loop.

        note: The current implementation of DrainNode on the GCS side is to
        de-register and gracefully shut down the Raylets. In the future,
        the behavior may change to better reflect the name "Drain."
        See https://github.com/ray-project/ray/pull/19350.

        Args:
            provider_node_ids_to_drain: NodeProvider ids of nodes to drain.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        # The GCS expects Raylet ids in the request, rather than NodeProvider
        # ids. To get the Raylet ids of the nodes to we're draining, we make
        # the following translations of identifiers:
        # node provider node id -> ip -> raylet id

        # Convert node provider node ids to ips.
        node_ips = set()
        failed_ip_fetch = False
        for provider_node_id in provider_node_ids_to_drain:
            # If the provider's call to fetch ip fails, the exception is not
            # fatal. Log the exception and proceed.
            try:
                ip = self.provider.internal_ip(provider_node_id)
                node_ips.add(ip)
            except Exception:
                logger.exception(
                    "Failed to get ip of node with id"
                    f" {provider_node_id} during scale-down."
                )
                failed_ip_fetch = True
        if failed_ip_fetch:
            self.prom_metrics.drain_node_exceptions.inc()

        # Only attempt to drain connected nodes, i.e. nodes with ips in
        # LoadMetrics.
        connected_node_ips = node_ips & self.load_metrics.raylet_id_by_ip.keys()

        # Convert ips to Raylet ids.
        # (The assignment ip->raylet_id is well-defined under current
        # assumptions. See "use_node_id_as_ip" in monitor.py)
        raylet_ids_to_drain = {
            self.load_metrics.raylet_id_by_ip[ip] for ip in connected_node_ips
        }

        if not raylet_ids_to_drain:
            return

        logger.info(f"Draining {len(raylet_ids_to_drain)} raylet(s).")
        try:
            # A successful response indicates that the GCS has marked the
            # desired nodes as "drained." The cloud provider can then terminate
            # the nodes without the GCS printing an error.
            # Check if we succeeded in draining all of the intended nodes by
            # looking at the RPC response.
            drained_raylet_ids = set(
                self.gcs_client.drain_nodes(raylet_ids_to_drain, timeout=5)
            )
            failed_to_drain = raylet_ids_to_drain - drained_raylet_ids
            if failed_to_drain:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.error(f"Failed to drain {len(failed_to_drain)} raylet(s).")
        # If we get a gRPC error with an UNIMPLEMENTED code, fail silently.
        # This error indicates that the GCS is using Ray version < 1.8.0,
        # for which DrainNode is not implemented.
        except RpcError as e:
            # If the code is UNIMPLEMENTED, pass.
            if e.rpc_code == ray._raylet.GRPC_STATUS_CODE_UNIMPLEMENTED:
                pass
            # Otherwise, it's a plain old gRPC error and we should log it.
            else:
                self.prom_metrics.drain_node_exceptions.inc()
                logger.exception("Failed to drain Ray nodes. Traceback follows.")
        except Exception:
            # We don't need to interrupt the autoscaler update with an
            # exception, but we should log what went wrong and record the
            # failure in Prometheus.
            self.prom_metrics.drain_node_exceptions.inc()
            logger.exception("Failed to drain Ray nodes. Traceback follows.")
707
+ def launch_required_nodes(self, to_launch: Dict[NodeType, int]) -> None:
708
+ if to_launch:
709
+ for node_type, count in to_launch.items():
710
+ self.launch_new_node(count, node_type=node_type)
711
+
712
+ def update_nodes(self):
713
+ """Run NodeUpdaterThreads to run setup commands, sync files,
714
+ and/or start Ray.
715
+ """
716
+ # Update nodes with out-of-date files.
717
+ # TODO(edoakes): Spawning these threads directly seems to cause
718
+ # problems. They should at a minimum be spawned as daemon threads.
719
+ # See https://github.com/ray-project/ray/pull/5903 for more info.
720
+ T = []
721
+ for node_id, setup_commands, ray_start_commands, docker_config in (
722
+ self.should_update(node_id)
723
+ for node_id in self.non_terminated_nodes.worker_ids
724
+ ):
725
+ if node_id is not None:
726
+ resources = self._node_resources(node_id)
727
+ labels = self._node_labels(node_id)
728
+ logger.debug(f"{node_id}: Starting new thread runner.")
729
+ T.append(
730
+ threading.Thread(
731
+ target=self.spawn_updater,
732
+ args=(
733
+ node_id,
734
+ setup_commands,
735
+ ray_start_commands,
736
+ resources,
737
+ labels,
738
+ docker_config,
739
+ ),
740
+ )
741
+ )
742
+ for t in T:
743
+ t.start()
744
+ for t in T:
745
+ t.join()
746
+
747
    def process_completed_updates(self):
        """Clean up completed NodeUpdaterThreads.

        Successful updaters bump success metrics and mark the node active in
        LoadMetrics; failed updaters bump failure metrics, and their nodes
        (if still present) are scheduled for termination.
        """
        completed_nodes = []
        for node_id, updater in self.updaters.items():
            if not updater.is_alive():
                completed_nodes.append(node_id)
        if completed_nodes:
            failed_nodes = []
            for node_id in completed_nodes:
                updater = self.updaters[node_id]
                if updater.exitcode == 0:
                    self.num_successful_updates[node_id] += 1
                    self.prom_metrics.successful_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.successful_recoveries.inc()
                    if updater.update_time:
                        self.prom_metrics.worker_update_time.observe(
                            updater.update_time
                        )
                    # Mark the node as active to prevent the node recovery
                    # logic immediately trying to restart Ray on the new node.
                    self.load_metrics.mark_active(self.provider.internal_ip(node_id))
                else:
                    failed_nodes.append(node_id)
                    self.num_failed_updates[node_id] += 1
                    self.prom_metrics.failed_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.failed_recoveries.inc()
                    self.node_tracker.untrack(node_id)
                del self.updaters[node_id]

            if failed_nodes:
                # Some nodes in failed_nodes may already have been terminated
                # during an update (for being idle after missing a heartbeat).

                # Update the list of non-terminated workers.
                for node_id in failed_nodes:
                    # Check if the node has already been terminated.
                    if node_id in self.non_terminated_nodes.worker_ids:
                        self.schedule_node_termination(
                            node_id, "launch failed", logger.error
                        )
                    else:
                        logger.warning(
                            f"StandardAutoscaler: {node_id}:"
                            " Failed to update node."
                            " Node has already been terminated."
                        )
                self.terminate_scheduled_nodes()
797
+ def set_prometheus_updater_data(self):
798
+ """Record total number of active NodeUpdaterThreads and how many of
799
+ these are being run to recover nodes.
800
+ """
801
+ self.prom_metrics.updating_nodes.set(len(self.updaters))
802
+ num_recovering = 0
803
+ for updater in self.updaters.values():
804
+ if updater.for_recovery:
805
+ num_recovering += 1
806
+ self.prom_metrics.recovering_nodes.set(num_recovering)
807
+
808
+ def _report_pending_infeasible(self, unfulfilled: List[ResourceDict]):
809
+ """Emit event messages for infeasible or unschedulable tasks.
810
+
811
+ This adds messages to the event summarizer for warning on infeasible
812
+ or "cluster full" resource requests.
813
+
814
+ Args:
815
+ unfulfilled: List of resource demands that would be unfulfilled
816
+ even after full scale-up.
817
+ """
818
+ # For type checking, assert that this object has been instantitiated.
819
+ assert self.resource_demand_scheduler
820
+ pending = []
821
+ infeasible = []
822
+ for bundle in unfulfilled:
823
+ placement_group = any(
824
+ "_group_" in k
825
+ or k == ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME
826
+ for k in bundle
827
+ )
828
+ if placement_group:
829
+ continue
830
+ if self.resource_demand_scheduler.is_feasible(bundle):
831
+ pending.append(bundle)
832
+ else:
833
+ infeasible.append(bundle)
834
+ if pending:
835
+ if self.load_metrics.cluster_full_of_actors_detected:
836
+ for request in pending:
837
+ self.event_summarizer.add_once_per_interval(
838
+ "Warning: The following resource request cannot be "
839
+ "scheduled right now: {}. This is likely due to all "
840
+ "cluster resources being claimed by actors. Consider "
841
+ "creating fewer actors or adding more nodes "
842
+ "to this Ray cluster.".format(request),
843
+ key="pending_{}".format(sorted(request.items())),
844
+ interval_s=30,
845
+ )
846
+ if infeasible:
847
+ for request in infeasible:
848
+ self.event_summarizer.add_once_per_interval(
849
+ "Error: No available node types can fulfill resource "
850
+ "request {}. Add suitable node types to this cluster to "
851
+ "resolve this issue.".format(request),
852
+ key="infeasible_{}".format(sorted(request.items())),
853
+ interval_s=30,
854
+ )
855
+
856
+ def _sort_based_on_last_used(
857
+ self, nodes: List[NodeID], last_used: Dict[str, float]
858
+ ) -> List[NodeID]:
859
+ """Sort the nodes based on the last time they were used.
860
+
861
+ The first item in the return list is the most recently used.
862
+ """
863
+ last_used_copy = copy.deepcopy(last_used)
864
+ # Add the unconnected nodes as the least recently used (the end of
865
+ # list). This prioritizes connected nodes.
866
+ least_recently_used = -1
867
+
868
+ def last_time_used(node_id: NodeID):
869
+ assert self.provider
870
+ node_ip = self.provider.internal_ip(node_id)
871
+ if node_ip not in last_used_copy:
872
+ return least_recently_used
873
+ else:
874
+ return last_used_copy[node_ip]
875
+
876
+ return sorted(nodes, key=last_time_used, reverse=True)
877
+
878
    def _get_nodes_needed_for_request_resources(
        self, sorted_node_ids: List[NodeID]
    ) -> FrozenSet[NodeID]:
        # TODO(ameer): try merging this with resource_demand_scheduler
        # code responsible for adding nodes for request_resources().
        """Returns the nodes NOT allowed to terminate due to request_resources().

        Bin-packs the outstanding request_resources() demands onto the head
        node plus the workers (most recently used first); any worker whose
        capacity was even partially consumed by that packing is protected.

        Args:
            sorted_node_ids: the node ids sorted based on last used (LRU last).

        Returns:
            FrozenSet[NodeID]: a set of nodes (node ids) that
                we should NOT terminate.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        nodes_not_allowed_to_terminate: Set[NodeID] = set()
        static_node_resources: Dict[
            NodeIP, ResourceDict
        ] = self.load_metrics.get_static_node_resources_by_ip()

        head_node_resources: ResourceDict = copy.deepcopy(
            self.available_node_types[self.config["head_node_type"]]["resources"]
        )
        # TODO(ameer): this is somewhat duplicated in
        # resource_demand_scheduler.py.
        if not head_node_resources:
            # Legacy yaml might include {} in the resources field.
            head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
            head_node_resources = static_node_resources.get(head_node_ip, {})

        max_node_resources: List[ResourceDict] = [head_node_resources]
        resource_demand_vector_worker_node_ids = []
        # Get max resources on all the non terminated nodes.
        for node_id in sorted_node_ids:
            tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in tags:
                node_type = tags[TAG_RAY_USER_NODE_TYPE]
                node_resources: ResourceDict = copy.deepcopy(
                    self.available_node_types[node_type]["resources"]
                )
                if not node_resources:
                    # Legacy yaml might include {} in the resources field.
                    node_ip = self.provider.internal_ip(node_id)
                    node_resources = static_node_resources.get(node_ip, {})
                max_node_resources.append(node_resources)
                resource_demand_vector_worker_node_ids.append(node_id)
        # Since it is sorted based on last used, we "keep" nodes that are
        # most recently used when we binpack. We assume get_bin_pack_residual
        # is following the given order here.
        used_resource_requests: List[ResourceDict]
        _, used_resource_requests = get_bin_pack_residual(
            max_node_resources, self.load_metrics.get_resource_requests()
        )
        # Remove the first entry (the head node).
        max_node_resources.pop(0)
        # Remove the first entry (the head node).
        used_resource_requests.pop(0)
        # After popping the head entries, index i in both lists corresponds to
        # resource_demand_vector_worker_node_ids[i].
        for i, node_id in enumerate(resource_demand_vector_worker_node_ids):
            if (
                used_resource_requests[i] == max_node_resources[i]
                and max_node_resources[i]
            ):
                # No resources of the node were needed for request_resources().
                # max_node_resources[i] is an empty dict for legacy yamls
                # before the node is connected.
                pass
            else:
                nodes_not_allowed_to_terminate.add(node_id)
        return frozenset(nodes_not_allowed_to_terminate)
950
+ def _keep_worker_of_node_type(
951
+ self, node_id: NodeID, node_type_counts: Dict[NodeType, int]
952
+ ) -> Tuple[KeepOrTerminate, Optional[str]]:
953
+ """Determines if a worker should be kept based on the min_workers
954
+ and max_workers constraint of the worker's node_type.
955
+
956
+ Returns KeepOrTerminate.keep when both of the following hold:
957
+ (a) The worker's node_type is present among the keys of the current
958
+ config's available_node_types dict.
959
+ (b) Deleting the node would violate the min_workers constraint for that
960
+ worker's node_type.
961
+
962
+ Returns KeepOrTerminate.terminate when both the following hold:
963
+ (a) The worker's node_type is not present among the keys of the current
964
+ config's available_node_types dict.
965
+ (b) Keeping the node would violate the max_workers constraint for that
966
+ worker's node_type.
967
+
968
+ Return KeepOrTerminate.decide_later otherwise.
969
+
970
+ Args:
971
+ node_type_counts(Dict[NodeType, int]): The non_terminated node
972
+ types counted so far.
973
+ Returns:
974
+ KeepOrTerminate: keep if the node should be kept, terminate if the
975
+ node should be terminated, decide_later if we are allowed
976
+ to terminate it, but do not have to.
977
+ Optional[str]: reason for termination. Not None on
978
+ KeepOrTerminate.terminate, None otherwise.
979
+ """
980
+ # For type checking, assert that this object has been instantitiated.
981
+ assert self.provider
982
+
983
+ tags = self.provider.node_tags(node_id)
984
+ if TAG_RAY_USER_NODE_TYPE in tags:
985
+ node_type = tags[TAG_RAY_USER_NODE_TYPE]
986
+
987
+ min_workers = self.available_node_types.get(node_type, {}).get(
988
+ "min_workers", 0
989
+ )
990
+ max_workers = self.available_node_types.get(node_type, {}).get(
991
+ "max_workers", 0
992
+ )
993
+ if node_type not in self.available_node_types:
994
+ # The node type has been deleted from the cluster config.
995
+ # Allow terminating it if needed.
996
+ available_node_types = list(self.available_node_types.keys())
997
+ return (
998
+ KeepOrTerminate.terminate,
999
+ f"not in available_node_types: {available_node_types}",
1000
+ )
1001
+ new_count = node_type_counts[node_type] + 1
1002
+ if new_count <= min(min_workers, max_workers):
1003
+ return KeepOrTerminate.keep, None
1004
+ if new_count > max_workers:
1005
+ return KeepOrTerminate.terminate, "max_workers_per_type"
1006
+
1007
+ return KeepOrTerminate.decide_later, None
1008
+
1009
+ def _node_resources(self, node_id):
1010
+ node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
1011
+ if self.available_node_types:
1012
+ return self.available_node_types.get(node_type, {}).get("resources", {})
1013
+ else:
1014
+ return {}
1015
+
1016
+ def _node_labels(self, node_id):
1017
+ node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE)
1018
+ if self.available_node_types:
1019
+ return self.available_node_types.get(node_type, {}).get("labels", {})
1020
+ else:
1021
+ return {}
1022
+
1023
    def reset(self, errors_fatal=False):
        """Re-read the autoscaler config and rebuild dependent state.

        Re-runs the config reader, validates the result, recomputes runtime
        hashes, (re)creates the node provider if needed, and creates or
        reconfigures the ResourceDemandScheduler.

        Args:
            errors_fatal: If True, re-raise any error; otherwise log it and
                keep the previous state.
        """
        sync_continuously = False
        if hasattr(self, "config"):
            sync_continuously = self.config.get("file_mounts_sync_continuously", False)
        try:
            new_config = self.config_reader()
            if new_config != getattr(self, "config", None):
                try:
                    validate_config(new_config)
                except Exception as e:
                    self.prom_metrics.config_validation_exceptions.inc()
                    logger.debug(
                        "Cluster config validation failed. The version of "
                        "the ray CLI you launched this cluster with may "
                        "be higher than the version of ray being run on "
                        "the cluster. Some new features may not be "
                        "available until you upgrade ray on your cluster.",
                        exc_info=e,
                    )
                logger.debug(
                    f"New config after validation: {new_config},"
                    f" of type: {type(new_config)}"
                )
            (new_runtime_hash, new_file_mounts_contents_hash) = hash_runtime_conf(
                new_config["file_mounts"],
                new_config["cluster_synced_files"],
                [
                    new_config["worker_setup_commands"],
                    new_config["worker_start_ray_commands"],
                ],
                generate_file_mounts_contents_hash=sync_continuously,
            )
            self.config = new_config
            self.runtime_hash = new_runtime_hash
            self.file_mounts_contents_hash = new_file_mounts_contents_hash
            if not self.provider:
                self.provider = _get_node_provider(
                    self.config["provider"], self.config["cluster_name"]
                )

            # If using the LocalNodeProvider, make sure the head node is marked
            # non-terminated.
            if isinstance(self.provider, LocalNodeProvider):
                record_local_head_state_if_needed(self.provider)

            self.available_node_types = self.config["available_node_types"]
            upscaling_speed = self.config.get("upscaling_speed")
            aggressive = self.config.get("autoscaling_mode") == "aggressive"
            target_utilization_fraction = self.config.get("target_utilization_fraction")
            # Translate the legacy scaling knobs into upscaling_speed.
            if upscaling_speed:
                upscaling_speed = float(upscaling_speed)
            # TODO(ameer): consider adding (if users ask) an option of
            # initial_upscaling_num_workers.
            elif aggressive:
                upscaling_speed = 99999
                logger.warning(
                    "Legacy aggressive autoscaling mode "
                    "detected. Replacing it by setting upscaling_speed to "
                    "99999."
                )
            elif target_utilization_fraction:
                upscaling_speed = 1 / max(target_utilization_fraction, 0.001) - 1
                logger.warning(
                    "Legacy target_utilization_fraction config "
                    "detected. Replacing it by setting upscaling_speed to "
                    + "1 / target_utilization_fraction - 1."
                )
            else:
                upscaling_speed = 1.0
            if self.resource_demand_scheduler:
                # The node types are autofilled internally for legacy yamls,
                # overwriting the class will remove the inferred node resources
                # for legacy yamls.
                self.resource_demand_scheduler.reset_config(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )
            else:
                self.resource_demand_scheduler = ResourceDemandScheduler(
                    self.provider,
                    self.available_node_types,
                    self.config["max_workers"],
                    self.config["head_node_type"],
                    upscaling_speed,
                )

        except Exception as e:
            self.prom_metrics.reset_exceptions.inc()
            if errors_fatal:
                raise e
            else:
                logger.exception("StandardAutoscaler: Error parsing config.")
1119
    def launch_config_ok(self, node_id):
        """Return True if the node's recorded launch config is still current.

        Compares the launch-config hash stored in the node's tags against a
        hash freshly computed from the present cluster config; a mismatch
        means the node was launched with a stale config and should not be
        kept.
        """
        if self.disable_launch_config_check:
            # Operator explicitly opted out of config checking.
            return True
        node_tags = self.provider.node_tags(node_id)
        tag_launch_conf = node_tags.get(TAG_RAY_LAUNCH_CONFIG)
        node_type = node_tags.get(TAG_RAY_USER_NODE_TYPE)
        if node_type not in self.available_node_types:
            # The node type has been deleted from the cluster config.
            # Don't keep the node.
            return False

        # The `worker_nodes` field is deprecated in favor of per-node-type
        # node_configs. We allow it for backwards-compatibility.
        launch_config = copy.deepcopy(self.config.get("worker_nodes", {}))
        if node_type:
            launch_config.update(
                self.config["available_node_types"][node_type]["node_config"]
            )
        calculated_launch_hash = hash_launch_conf(launch_config, self.config["auth"])

        if calculated_launch_hash != tag_launch_conf:
            return False
        return True
1142
+
1143
    def files_up_to_date(self, node_id):
        """Return True if the node's runtime config and file mounts are current.

        Checks the runtime-config hash tag, and (only when the local
        file-mounts contents hash is known) the file-mounts contents hash tag,
        against the hashes computed for the current config.
        """
        node_tags = self.provider.node_tags(node_id)
        applied_config_hash = node_tags.get(TAG_RAY_RUNTIME_CONFIG)
        applied_file_mounts_contents_hash = node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS)
        if applied_config_hash != self.runtime_hash or (
            # A None contents hash means it hasn't been computed locally,
            # so it is not used as a staleness signal.
            self.file_mounts_contents_hash is not None
            and self.file_mounts_contents_hash != applied_file_mounts_contents_hash
        ):
            logger.info(
                "StandardAutoscaler: "
                "{}: Runtime state is ({},{}), want ({},{})".format(
                    node_id,
                    applied_config_hash,
                    applied_file_mounts_contents_hash,
                    self.runtime_hash,
                    self.file_mounts_contents_hash,
                )
            )
            return False
        return True
1163
+
1164
+ def heartbeat_on_time(self, node_id: NodeID, now: float) -> bool:
1165
+ """Determine whether we've received a heartbeat from a node within the
1166
+ last AUTOSCALER_HEARTBEAT_TIMEOUT_S seconds.
1167
+ """
1168
+ # For type checking, assert that this object has been instantitiated.
1169
+ assert self.provider
1170
+
1171
+ key = self.provider.internal_ip(node_id)
1172
+
1173
+ if key in self.load_metrics.last_heartbeat_time_by_ip:
1174
+ last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key]
1175
+ delta = now - last_heartbeat_time
1176
+ if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S:
1177
+ return True
1178
+ return False
1179
+
1180
    def terminate_unhealthy_nodes(self, now: float):
        """Schedule termination for nodes that missed their heartbeat deadline.

        Only up-to-date workers are considered; scheduled terminations are
        flushed at the end via terminate_scheduled_nodes().
        """
        # For type checking, assert that these objects have been instantiated.
        assert self.provider
        assert self.non_terminated_nodes

        for node_id in self.non_terminated_nodes.worker_ids:
            node_status = self.provider.node_tags(node_id)[TAG_RAY_NODE_STATUS]
            # We're not responsible for taking down
            # nodes with pending or failed status:
            if not node_status == STATUS_UP_TO_DATE:
                continue
            # This node is up-to-date. If it hasn't had the chance to produce
            # a heartbeat, fake the heartbeat now (see logic for completed node
            # updaters).
            ip = self.provider.internal_ip(node_id)
            if ip not in self.load_metrics.last_heartbeat_time_by_ip:
                self.load_metrics.mark_active(ip)
            # Heartbeat indicates node is healthy:
            if self.heartbeat_on_time(node_id, now):
                continue
            self.schedule_node_termination(
                node_id, "lost contact with raylet", logger.warning
            )
        self.terminate_scheduled_nodes()
1207
+
1208
+ def attempt_to_recover_unhealthy_nodes(self, now):
1209
+ for node_id in self.non_terminated_nodes.worker_ids:
1210
+ self.recover_if_needed(node_id, now)
1211
+
1212
    def recover_if_needed(self, node_id, now):
        """Restart Ray on a node that stopped heartbeating, via a recovery
        NodeUpdaterThread.

        No-op when the node cannot be updated (already updating, failed
        before, config mismatch) or when its heartbeat is on time. The
        recovery updater only re-runs the worker start commands; it syncs no
        files and runs no setup commands.
        """
        if not self.can_update(node_id):
            return
        if self.heartbeat_on_time(node_id, now):
            return

        logger.warning(
            "StandardAutoscaler: "
            "{}: No recent heartbeat, "
            "restarting Ray to recover...".format(node_id)
        )
        self.event_summarizer.add(
            "Restarting {} nodes of type "
            + self._get_node_type(node_id)
            + " (lost contact with raylet).",
            quantity=1,
            aggregate=operator.add,
        )
        head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=self.config["provider"],
            provider=self.provider,
            auth_config=self.config["auth"],
            cluster_name=self.config["cluster_name"],
            # Recovery intentionally skips file syncing and setup:
            file_mounts={},
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=with_head_node_ip(
                self.config["worker_start_ray_commands"], head_node_ip
            ),
            runtime_hash=self.runtime_hash,
            file_mounts_contents_hash=self.file_mounts_contents_hash,
            process_runner=self.process_runner,
            use_internal_ip=True,
            is_head_node=False,
            docker_config=self.config.get("docker"),
            node_resources=self._node_resources(node_id),
            node_labels=self._node_labels(node_id),
            for_recovery=True,
        )
        updater.start()
        self.updaters[node_id] = updater
1255
+
1256
+ def _get_node_type(self, node_id: str) -> str:
1257
+ # For type checking, assert that this object has been instantitiated.
1258
+ assert self.provider
1259
+
1260
+ node_tags = self.provider.node_tags(node_id)
1261
+ if TAG_RAY_USER_NODE_TYPE in node_tags:
1262
+ return node_tags[TAG_RAY_USER_NODE_TYPE]
1263
+ else:
1264
+ return "unknown_node_type"
1265
+
1266
    def _get_node_type_specific_fields(self, node_id: str, fields_key: str) -> Any:
        """Return config[fields_key], overridden by the node-type-specific
        value when the node's type declares one.

        Raises:
            ValueError: if the node carries a type tag that is no longer in
                available_node_types.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        fields = self.config[fields_key]
        node_tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in node_tags:
            node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
            if node_type not in self.available_node_types:
                raise ValueError(f"Unknown node type tag: {node_type}.")
            node_specific_config = self.available_node_types[node_type]
            if fields_key in node_specific_config:
                fields = node_specific_config[fields_key]
        return fields
1280
+
1281
+ def _get_node_specific_docker_config(self, node_id):
1282
+ if "docker" not in self.config:
1283
+ return {}
1284
+ docker_config = copy.deepcopy(self.config.get("docker", {}))
1285
+ node_specific_docker = self._get_node_type_specific_fields(node_id, "docker")
1286
+ docker_config.update(node_specific_docker)
1287
+ return docker_config
1288
+
1289
    def should_update(self, node_id):
        """Decide whether and how to update a node.

        Returns UpdateInstructions with the setup/ray-start commands to run,
        or an all-None instance when no update is needed (node busy, failed
        before, or already up to date).
        """
        if not self.can_update(node_id):
            return UpdateInstructions(None, None, None, None)  # no update

        status = self.provider.node_tags(node_id).get(TAG_RAY_NODE_STATUS)
        if status == STATUS_UP_TO_DATE and self.files_up_to_date(node_id):
            return UpdateInstructions(None, None, None, None)  # no update

        successful_updated = self.num_successful_updates.get(node_id, 0) > 0
        if successful_updated and self.config.get("restart_only", False):
            # Previously-updated node; only restart Ray, skip setup.
            setup_commands = []
            ray_start_commands = self.config["worker_start_ray_commands"]
        elif successful_updated and self.config.get("no_restart", False):
            # Previously-updated node; re-run setup but do not restart Ray.
            setup_commands = self._get_node_type_specific_fields(
                node_id, "worker_setup_commands"
            )
            ray_start_commands = []
        else:
            setup_commands = self._get_node_type_specific_fields(
                node_id, "worker_setup_commands"
            )
            ray_start_commands = self.config["worker_start_ray_commands"]

        docker_config = self._get_node_specific_docker_config(node_id)
        return UpdateInstructions(
            node_id=node_id,
            setup_commands=setup_commands,
            ray_start_commands=ray_start_commands,
            docker_config=docker_config,
        )
1319
+
1320
    def spawn_updater(
        self,
        node_id,
        setup_commands,
        ray_start_commands,
        node_resources,
        node_labels,
        docker_config,
    ):
        """Start a background NodeUpdaterThread that brings the node up to
        date (file mounts, init/setup commands, Ray start) and register it in
        self.updaters."""
        logger.info(
            f"Creating new (spawn_updater) updater thread for node" f" {node_id}."
        )
        ip = self.provider.internal_ip(node_id)
        node_type = self._get_node_type(node_id)
        self.node_tracker.track(node_id, ip, node_type)
        head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id)
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=self.config["provider"],
            provider=self.provider,
            auth_config=self.config["auth"],
            cluster_name=self.config["cluster_name"],
            file_mounts=self.config["file_mounts"],
            initialization_commands=with_head_node_ip(
                self._get_node_type_specific_fields(node_id, "initialization_commands"),
                head_node_ip,
            ),
            setup_commands=with_head_node_ip(setup_commands, head_node_ip),
            ray_start_commands=with_head_node_ip(ray_start_commands, head_node_ip),
            runtime_hash=self.runtime_hash,
            file_mounts_contents_hash=self.file_mounts_contents_hash,
            is_head_node=False,
            cluster_synced_files=self.config["cluster_synced_files"],
            rsync_options={
                "rsync_exclude": self.config.get("rsync_exclude"),
                "rsync_filter": self.config.get("rsync_filter"),
            },
            process_runner=self.process_runner,
            use_internal_ip=True,
            docker_config=docker_config,
            node_resources=node_resources,
            node_labels=node_labels,
        )
        updater.start()
        self.updaters[node_id] = updater
1365
+
1366
+ def can_update(self, node_id):
1367
+ if self.disable_node_updaters:
1368
+ return False
1369
+ if node_id in self.updaters:
1370
+ return False
1371
+ if not self.launch_config_ok(node_id):
1372
+ return False
1373
+ if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry?
1374
+ return False
1375
+ logger.debug(
1376
+ f"{node_id} is not being updated and "
1377
+ "passes config check (can_update=True)."
1378
+ )
1379
+ return True
1380
+
1381
    def launch_new_node(self, count: int, node_type: str) -> None:
        """Request `count` new nodes of `node_type`.

        Either launches synchronously in the foreground (blocking) or enqueues
        batched launch requests for the background launcher threads.
        """
        logger.info("StandardAutoscaler: Queue {} new nodes for launch".format(count))
        self.pending_launches.inc(node_type, count)
        config = copy.deepcopy(self.config)
        if self.foreground_node_launch:
            assert self.foreground_node_launcher is not None
            # Launch in the main thread and block.
            self.foreground_node_launcher.launch_node(config, count, node_type)
        else:
            assert self.launch_queue is not None
            # Split into individual launch requests of the max batch size.
            while count > 0:
                # Enqueue launch data for the background NodeUpdater threads.
                self.launch_queue.put(
                    (config, min(count, self.max_launch_batch), node_type)
                )
                count -= self.max_launch_batch
1398
+
1399
    def kill_workers(self):
        """Terminate every worker node immediately and untrack them.

        Logged at error level because this is a destructive, unconditional
        operation.
        """
        logger.error("StandardAutoscaler: kill_workers triggered")
        nodes = self.workers()
        if nodes:
            self.provider.terminate_nodes(nodes)
            for node in nodes:
                self.node_tracker.untrack(node)
                self.prom_metrics.stopped_nodes.inc()
        logger.error("StandardAutoscaler: terminated {} node(s)".format(len(nodes)))
1408
+
1409
    def summary(self) -> Optional[AutoscalerSummary]:
        """Summarizes the active, pending, and failed node launches.

        An active node is a node whose raylet is actively reporting heartbeats.
        A pending node is non-active node whose node tag is uninitialized,
        waiting for ssh, syncing files, or setting up.
        If a node is not pending or active, it is failed.

        Returns:
            AutoscalerSummary: The summary, or None when the node list has
            not been fetched yet.
        """
        # For type checking, assert that this object has been instantiated.
        assert self.provider

        if not self.non_terminated_nodes:
            return None
        active_nodes: Dict[NodeType, int] = Counter()
        pending_nodes = []
        failed_nodes = []
        non_failed = set()

        # ip -> node type, for all tagged, managed nodes seen below.
        node_type_mapping = {}

        for node_id in self.non_terminated_nodes.all_node_ids:
            ip = self.provider.internal_ip(node_id)
            node_tags = self.provider.node_tags(node_id)

            if not all(
                tag in node_tags
                for tag in (
                    TAG_RAY_NODE_KIND,
                    TAG_RAY_USER_NODE_TYPE,
                    TAG_RAY_NODE_STATUS,
                )
            ):
                # In some node providers, creation of a node and tags is not
                # atomic, so just skip it.
                continue

            if node_tags[TAG_RAY_NODE_KIND] == NODE_KIND_UNMANAGED:
                continue
            node_type = node_tags[TAG_RAY_USER_NODE_TYPE]

            node_type_mapping[ip] = node_type

            # TODO (Alex): If a node's raylet has died, it shouldn't be marked
            # as active.
            is_active = self.load_metrics.is_active(ip)
            if is_active:
                active_nodes[node_type] += 1
                non_failed.add(node_id)
            else:
                status = node_tags[TAG_RAY_NODE_STATUS]
                completed_states = [STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED]
                is_pending = status not in completed_states
                if is_pending:
                    pending_nodes.append((node_id, ip, node_type, status))
                    non_failed.add(node_id)

        # Anything the tracker knows about that is neither active nor
        # pending is reported as failed.
        failed_nodes = self.node_tracker.get_all_failed_node_info(non_failed)

        # The concurrent counter leaves some 0 counts in, so we need to
        # manually filter those out.
        pending_launches = {}
        for node_type, count in self.pending_launches.breakdown().items():
            if count:
                pending_launches[node_type] = count

        pending_resources = {}
        for node_resources in self.resource_demand_scheduler.calculate_node_resources(
            nodes=[node_id for node_id, _, _, _ in pending_nodes],
            pending_nodes=pending_launches,
            # We don't fill this field out because we're intentionally only
            # passing pending nodes (which aren't tracked by load metrics
            # anyways).
            unused_resources_by_ip={},
        )[0]:
            for key, value in node_resources.items():
                pending_resources[key] = value + pending_resources.get(key, 0)

        return AutoscalerSummary(
            # Convert active_nodes from counter to dict for later serialization
            active_nodes=dict(active_nodes),
            idle_nodes=None,
            pending_nodes=[
                (ip, node_type, status) for _, ip, node_type, status in pending_nodes
            ],
            pending_launches=pending_launches,
            failed_nodes=failed_nodes,
            node_availability_summary=self.node_provider_availability_tracker.summary(),
            pending_resources=pending_resources,
            node_type_mapping=node_type_mapping,
            legacy=True,
        )
1503
+
1504
+ def info_string(self):
1505
+ lm_summary = self.load_metrics.summary()
1506
+ autoscaler_summary = self.summary()
1507
+ assert autoscaler_summary
1508
+ return "\n" + format_info_string(lm_summary, autoscaler_summary)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logger implementing the Command Line Interface.
2
+
3
+ A replacement for the standard Python `logging` API
4
+ designed for implementing a better CLI UX for the cluster launcher.
5
+
6
+ Supports color, bold text, italics, underlines, etc.
7
+ (depending on TTY features)
8
+ as well as indentation and other structured output.
9
+ """
10
+ import inspect
11
+ import logging
12
+ import os
13
+ import sys
14
+ import time
15
+ from contextlib import contextmanager
16
+ from functools import wraps
17
+ from typing import Any, Callable, Dict, List, Optional, Tuple
18
+
19
+ import click
20
+ import colorama
21
+
22
+ # Import ray first to use the bundled colorama
23
+ import ray # noqa: F401
24
+
25
+ if sys.platform == "win32":
26
+ import msvcrt
27
+ else:
28
+ import select
29
+
30
+
31
+ class _ColorfulMock:
32
+ def __init__(self):
33
+ # do not do any color work
34
+ self.identity = lambda x: x
35
+
36
+ self.colorful = self
37
+ self.colormode = None
38
+
39
+ self.NO_COLORS = None
40
+ self.ANSI_8_COLORS = None
41
+
42
+ def disable(self):
43
+ pass
44
+
45
+ @contextmanager
46
+ def with_style(self, x):
47
+ class IdentityClass:
48
+ def __getattr__(self, name):
49
+ return lambda y: y
50
+
51
+ yield IdentityClass()
52
+
53
+ def __getattr__(self, name):
54
+ if name == "with_style":
55
+ return self.with_style
56
+
57
+ return self.identity
58
+
59
+
60
# Prefer the real `colorful` package when available; otherwise fall back to
# the no-op mock so that styled-output calls still work unchanged.
try:
    import colorful as _cf
    from colorful.core import ColorfulString

    _cf.use_8_ansi_colors()
except ModuleNotFoundError:
    # We mock Colorful to restrict the colors used for consistency
    # anyway, so we also allow for not having colorful at all.
    # If the Ray Core dependency on colorful is ever removed,
    # the CliLogger code will still work.
    class ColorfulString:
        pass

    _cf = _ColorfulMock()
74
+
75
+
76
+ # We want to only allow specific formatting
77
+ # to prevent people from accidentally making bad looking color schemes.
78
+ #
79
+ # This is especially important since most will look bad on either light
80
+ # or dark themes.
81
class _ColorfulProxy:
    """Gatekeeper around `colorful` that only exposes an approved palette.

    Any callable attribute outside `_proxy_allowlist` raises, which keeps the
    CLI color scheme consistent (and readable on both light and dark themes).
    """

    _proxy_allowlist = [
        "disable",
        "reset",
        "bold",
        "italic",
        "underlined",
        # used instead of `gray` as `dimmed` adapts to
        # both light and dark themes
        "dimmed",
        "dodgerBlue",  # group
        "limeGreen",  # success
        "red",  # error
        "orange",  # warning
        "skyBlue",  # label
        "magenta",  # syntax highlighting key words and symbols
        "yellow",  # syntax highlighting strings
    ]

    def __getattr__(self, name):
        attr = getattr(_cf, name)
        if callable(attr) and name not in _ColorfulProxy._proxy_allowlist:
            raise ValueError(
                "Usage of the colorful method '" + name + "' is forbidden "
                "by the proxy to keep a consistent color scheme. "
                "Check `cli_logger.py` for allowed methods"
            )
        return attr
109
+
110
+
111
# Module-level singleton used for all styled output in this file.
cf = _ColorfulProxy()

# NOTE(review): strip=False presumably keeps ANSI escape sequences intact
# when output is redirected — confirm against colorama's init() semantics.
colorama.init(strip=False)
114
+
115
+
116
def _external_caller_info():
    """Get the info from the caller frame.

    Used to override the logging function and line number with the correct
    ones. See the comment on _patched_makeRecord for more info.

    Returns:
        Dict with the first out-of-this-file caller's "lineno" and
        base "filename".
    """

    frame = inspect.currentframe()
    caller = frame
    levels = 0
    # Walk outward until we leave this module's frames.
    # NOTE(review): if every frame on the stack were from this file,
    # caller.f_back would eventually be None and this would raise — in
    # practice there is always an external caller. `levels` is computed but
    # unused.
    while caller.f_code.co_filename == __file__:
        caller = caller.f_back
        levels += 1
    return {
        "lineno": caller.f_lineno,
        "filename": os.path.basename(caller.f_code.co_filename),
    }
133
+
134
+
135
def _format_msg(
    msg: str,
    *args: Any,
    no_format: bool = None,
    _tags: Dict[str, Any] = None,
    _numbered: Tuple[str, int, int] = None,
    **kwargs: Any,
):
    """Formats a message for printing.

    Renders `msg` using the built-in `str.format` and the passed-in
    `*args` and `**kwargs`.

    Args:
        *args (Any): `.format` arguments for `msg`.
        no_format (bool):
            If `no_format` is `True`,
            `.format` will not be called on the message.

            Useful if the output is user-provided or may otherwise
            contain an unexpected formatting string (e.g. "{}").
        _tags (Dict[str, Any]):
            key-value pairs to display at the end of
            the message in square brackets.

            If a tag is set to `True`, it is printed without the value,
            the presence of the tag treated as a "flag".

            E.g. `_format_msg("hello", _tags=dict(from=mom, signed=True))`
            `hello [from=Mom, signed]`
    _numbered (Tuple[str, int, int]):
            `(brackets, i, n)`

            The `brackets` string is composed of two "bracket" characters,
            `i` is the index, `n` is the total.

            The string `{i}/{n}` surrounded by the "brackets" is
            prepended to the message.

            This is used to number steps in a procedure, with different
            brackets specifying different major tasks.

            E.g. `_format_msg("hello", _numbered=("[]", 0, 5))`
            `[0/5] hello`

    Returns:
        The formatted message.
    """

    if isinstance(msg, str) or isinstance(msg, ColorfulString):
        tags_str = ""
        if _tags is not None:
            tags_list = []
            for k, v in _tags.items():
                # True -> bare flag, False -> omitted, anything else -> k=v.
                if v is True:
                    tags_list += [k]
                    continue
                if v is False:
                    continue

                tags_list += [k + "=" + v]
            if tags_list:
                tags_str = cf.reset(cf.dimmed(" [{}]".format(", ".join(tags_list))))

        numbering_str = ""
        if _numbered is not None:
            chars, i, n = _numbered
            numbering_str = cf.dimmed(chars[0] + str(i) + "/" + str(n) + chars[1]) + " "

        if no_format:
            # todo: throw if given args/kwargs?
            return numbering_str + msg + tags_str
        return numbering_str + msg.format(*args, **kwargs) + tags_str

    # Non-string message: fall back to a comma-joined str() rendering.
    if kwargs:
        raise ValueError("We do not support printing kwargs yet.")

    res = [msg, *args]
    res = [str(x) for x in res]
    return ", ".join(res)
215
+
216
+
217
+ # TODO: come up with a plan to unify logging.
218
+ # formatter = logging.Formatter(
219
+ # # TODO(maximsmol): figure out the required log level padding
220
+ # # width automatically
221
+ # fmt="[{asctime}] {levelname:6} {message}",
222
+ # datefmt="%x %X",
223
+ # # We want alignment on our level names
224
+ # style="{")
225
+
226
+
227
+ def _isatty():
228
+ """More robust check for interactive terminal/tty."""
229
+ try:
230
+ # https://stackoverflow.com/questions/6108330/
231
+ # checking-for-interactive-shell-in-a-python-script
232
+ return sys.__stdin__.isatty()
233
+ except Exception:
234
+ # sometimes this can fail due to closed output
235
+ # either way, no-tty is generally safe fallback.
236
+ return False
237
+
238
+
239
+ class _CliLogger:
240
+ """Singleton class for CLI logging.
241
+
242
+ Without calling 'cli_logger.configure', the CLILogger will default
243
+ to 'record' style logging.
244
+
245
+ Attributes:
246
+ color_mode (str):
247
+ Can be "true", "false", or "auto".
248
+
249
+ Enables or disables `colorful`.
250
+
251
+ If `color_mode` is "auto", is set to `not stdout.isatty()`
252
+ indent_level (int):
253
+ The current indentation level.
254
+
255
+ All messages will be indented by prepending `" " * indent_level`
256
+ vebosity (int):
257
+ Output verbosity.
258
+
259
+ Low verbosity will disable `verbose` and `very_verbose` messages.
260
+ """
261
+
262
+ color_mode: str
263
+ # color_mode: Union[Literal["auto"], Literal["false"], Literal["true"]]
264
+ indent_level: int
265
+ interactive: bool
266
+ VALID_LOG_STYLES = ("auto", "record", "pretty")
267
+
268
+ _autodetected_cf_colormode: int
269
+
270
    def __init__(self):
        # Current output indentation level (see `indented`).
        self.indent_level = 0

        # Defaults: quiet, auto color detection, "record"-style (non-pretty)
        # output, not interactive.
        self._verbosity = 0
        self._verbosity_overriden = False
        self._color_mode = "auto"
        self._log_style = "record"
        self.pretty = False
        self.interactive = False

        # store whatever colorful has detected for future use if
        # the color output is toggled (colorful detects # of supported colors,
        # so it has some non-trivial logic to determine this)
        self._autodetected_cf_colormode = cf.colorful.colormode
        self.set_format()
285
+
286
    def set_format(self, format_tmpl=None):
        """Install the logging.Formatter used for "record"-style output.

        Defaults to the autoscaler's LOGGER_FORMAT when no template is given
        (imported lazily to avoid an import cycle).
        """
        if not format_tmpl:
            from ray.autoscaler._private.constants import LOGGER_FORMAT

            format_tmpl = LOGGER_FORMAT
        self._formatter = logging.Formatter(format_tmpl)
292
+
293
    def configure(self, log_style=None, color_mode=None, verbosity=None):
        """Configures the logger according to values.

        Each argument is optional; only the ones provided are applied, and
        color detection is re-run at the end either way.
        """
        if log_style is not None:
            self._set_log_style(log_style)

        if color_mode is not None:
            self._set_color_mode(color_mode)

        if verbosity is not None:
            self._set_verbosity(verbosity)

        self.detect_colors()
305
+
306
    @property
    def log_style(self):
        # Current style: "auto", "record", or "pretty".
        return self._log_style

    def _set_log_style(self, x):
        """Configures interactivity and formatting."""
        self._log_style = x.lower()
        self.interactive = _isatty()

        if self._log_style == "auto":
            # Pretty output only when attached to a terminal.
            self.pretty = _isatty()
        elif self._log_style == "record":
            # Record style: plain log records, colors forced off.
            self.pretty = False
            self._set_color_mode("false")
        elif self._log_style == "pretty":
            self.pretty = True

    @property
    def color_mode(self):
        # "true", "false", or "auto".
        return self._color_mode

    def _set_color_mode(self, x):
        self._color_mode = x.lower()
        self.detect_colors()

    @property
    def verbosity(self):
        # Non-pretty (record) output shows everything unless the user
        # explicitly overrode the verbosity.
        if self._verbosity_overriden:
            return self._verbosity
        elif not self.pretty:
            return 999
        return self._verbosity

    def _set_verbosity(self, x):
        self._verbosity = x
        self._verbosity_overriden = True
342
+
343
    def detect_colors(self):
        """Update color output settings.

        Parse the `color_mode` string and optionally disable or force-enable
        color output
        (8-color ANSI if no terminal detected to be safe) in colorful.

        Raises:
            ValueError: for any color_mode other than "true"/"false"/"auto".
        """
        if self.color_mode == "true":
            if self._autodetected_cf_colormode != cf.NO_COLORS:
                cf.colormode = self._autodetected_cf_colormode
            else:
                cf.colormode = cf.ANSI_8_COLORS
            return
        if self.color_mode == "false":
            cf.disable()
            return
        if self.color_mode == "auto":
            # colorful autodetects tty settings
            return

        raise ValueError("Invalid log color setting: " + self.color_mode)
364
+
365
    def newline(self):
        """Print a line feed."""
        self.print("")

    def _print(
        self,
        msg: str,
        _level_str: str = "INFO",
        _linefeed: bool = True,
        end: str = None,
    ):
        """Proxy for printing messages.

        In pretty mode the message is indented; otherwise it is rendered as a
        standard log record (empty messages are dropped). WARNING/ERROR/PANIC
        go to stderr, everything else to stdout.

        Args:
            msg: Message to print.
            _level_str: Level name stamped onto record-style output.
            _linefeed (bool):
                If `_linefeed` is `False` no linefeed is printed at the
                end of the message.
            end: Passed through to `print` as its `end` argument.
        """
        if self.pretty:
            rendered_message = " " * self.indent_level + msg
        else:
            if msg.strip() == "":
                return
            caller_info = _external_caller_info()
            record = logging.LogRecord(
                name="cli",
                # We override the level name later
                # TODO(maximsmol): give approximate level #s to our log levels
                level=0,
                # The user-facing logs do not need this information anyway
                # and it would be very tedious to extract since _print
                # can be at varying depths in the call stack
                # TODO(maximsmol): do it anyway to be extra
                pathname=caller_info["filename"],
                lineno=caller_info["lineno"],
                msg=msg,
                args={},
                # No exception
                exc_info=None,
            )
            record.levelname = _level_str
            rendered_message = self._formatter.format(record)

        # We aren't using standard python logging convention, so we hardcode
        # the log levels for now.
        if _level_str in ["WARNING", "ERROR", "PANIC"]:
            stream = sys.stderr
        else:
            stream = sys.stdout

        if not _linefeed:
            stream.write(rendered_message)
            stream.flush()
            return

        kwargs = {"end": end}
        print(rendered_message, file=stream, **kwargs)
423
+
424
+ def indented(self):
425
+ """Context manager that starts an indented block of output."""
426
+ cli_logger = self
427
+
428
+ class IndentedContextManager:
429
+ def __enter__(self):
430
+ cli_logger.indent_level += 1
431
+
432
+ def __exit__(self, type, value, tb):
433
+ cli_logger.indent_level -= 1
434
+
435
+ return IndentedContextManager()
436
+
437
    def group(self, msg: str, *args: Any, **kwargs: Any):
        """Print a group title in a special color and start an indented block.

        For arguments, see `_format_msg`.
        """
        self.print(cf.dodgerBlue(msg), *args, **kwargs)

        return self.indented()

    def verbatim_error_ctx(self, msg: str, *args: Any, **kwargs: Any):
        """Context manager for printing multi-line error messages.

        Displays a start sequence "!!! {optional message}"
        and a matching end sequence "!!!".

        The string "!!!" can be used as a "tombstone" for searching.

        For arguments, see `_format_msg`.
        """
        cli_logger = self

        # NOTE: class name keeps its historical (misspelled) identifier.
        class VerbatimErorContextManager:
            def __enter__(self):
                cli_logger.error(cf.bold("!!! ") + "{}", msg, *args, **kwargs)

            def __exit__(self, type, value, tb):
                cli_logger.error(cf.bold("!!!"))

        return VerbatimErorContextManager()

    def labeled_value(self, key: str, msg: str, *args: Any, **kwargs: Any):
        """Displays a key-value pair with special formatting.

        Args:
            key: Label that is prepended to the message.

        For other arguments, see `_format_msg`.
        """
        self._print(cf.skyBlue(key) + ": " + _format_msg(cf.bold(msg), *args, **kwargs))
476
+
477
    def verbose(self, msg: str, *args: Any, **kwargs: Any):
        """Prints a message if verbosity is not 0.

        Emitted with the "VINFO" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self.print(msg, *args, _level_str="VINFO", **kwargs)

    def verbose_warning(self, msg, *args, **kwargs):
        """Prints a formatted warning if verbosity is not 0.

        Emitted with the "VWARN" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self._warning(msg, *args, _level_str="VWARN", **kwargs)

    def verbose_error(self, msg: str, *args: Any, **kwargs: Any):
        """Logs an error if verbosity is not 0.

        Emitted with the "VERR" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 0:
            self._error(msg, *args, _level_str="VERR", **kwargs)

    def very_verbose(self, msg: str, *args: Any, **kwargs: Any):
        """Prints if verbosity is > 1.

        Emitted with the "VVINFO" level tag. For arguments, see `_format_msg`.
        """
        if self.verbosity > 1:
            self.print(msg, *args, _level_str="VVINFO", **kwargs)
508
+
509
+ def success(self, msg: str, *args: Any, **kwargs: Any):
510
+ """Prints a formatted success message.
511
+
512
+ For arguments, see `_format_msg`.
513
+ """
514
+ self.print(cf.limeGreen(msg), *args, _level_str="SUCC", **kwargs)
515
+
516
+ def _warning(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
517
+ """Prints a formatted warning message.
518
+
519
+ For arguments, see `_format_msg`.
520
+ """
521
+ if _level_str is None:
522
+ raise ValueError("Log level not set.")
523
+ self.print(cf.orange(msg), *args, _level_str=_level_str, **kwargs)
524
+
525
+ def warning(self, *args, **kwargs):
526
+ self._warning(*args, _level_str="WARN", **kwargs)
527
+
528
+ def _error(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any):
529
+ """Prints a formatted error message.
530
+
531
+ For arguments, see `_format_msg`.
532
+ """
533
+ if _level_str is None:
534
+ raise ValueError("Log level not set.")
535
+ self.print(cf.red(msg), *args, _level_str=_level_str, **kwargs)
536
+
537
    def error(self, *args, **kwargs):
        """Print a formatted error at the standard ERR level."""
        self._error(*args, _level_str="ERR", **kwargs)
539
+
540
    def panic(self, *args, **kwargs):
        """Print a formatted error at the PANIC level (used by `abort`)."""
        self._error(*args, _level_str="PANIC", **kwargs)
542
+
543
+ # Fine to expose _level_str here, since this is a general log function.
544
+ def print(
545
+ self,
546
+ msg: str,
547
+ *args: Any,
548
+ _level_str: str = "INFO",
549
+ end: str = None,
550
+ **kwargs: Any,
551
+ ):
552
+ """Prints a message.
553
+
554
+ For arguments, see `_format_msg`.
555
+ """
556
+ self._print(_format_msg(msg, *args, **kwargs), _level_str=_level_str, end=end)
557
+
558
    def info(self, msg: str, no_format=True, *args, **kwargs):
        """Print an info message, skipping `_format_msg` styling by default.

        NOTE(review): `no_format` is passed as a keyword before *args; extra
        positional arguments are still forwarded after it. Confirm this
        argument ordering is intentional before relying on positional use.
        """
        self.print(msg, no_format=no_format, *args, **kwargs)
560
+
561
+ def abort(
562
+ self, msg: Optional[str] = None, *args: Any, exc: Any = None, **kwargs: Any
563
+ ):
564
+ """Prints an error and aborts execution.
565
+
566
+ Print an error and throw an exception to terminate the program
567
+ (the exception will not print a message).
568
+ """
569
+ if msg is not None:
570
+ self._error(msg, *args, _level_str="PANIC", **kwargs)
571
+
572
+ if exc is not None:
573
+ raise exc
574
+
575
+ exc_cls = click.ClickException
576
+ if self.pretty:
577
+ exc_cls = SilentClickException
578
+
579
+ if msg is None:
580
+ msg = "Exiting due to cli_logger.abort()"
581
+ raise exc_cls(msg)
582
+
583
+ def doassert(self, val: bool, msg: str, *args: Any, **kwargs: Any):
584
+ """Handle assertion without throwing a scary exception.
585
+
586
+ Args:
587
+ val: Value to check.
588
+
589
+ For other arguments, see `_format_msg`.
590
+ """
591
+ if not val:
592
+ exc = None
593
+ if not self.pretty:
594
+ exc = AssertionError()
595
+
596
+ # TODO(maximsmol): rework asserts so that we get the expression
597
+ # that triggered the assert
598
+ # to do this, install a global try-catch
599
+ # for AssertionError and raise them normally
600
+ self.abort(msg, *args, exc=exc, **kwargs)
601
+
602
+ def render_list(self, xs: List[str], separator: str = cf.reset(", ")):
603
+ """Render a list of bolded values using a non-bolded separator."""
604
+ return separator.join([str(cf.bold(x)) for x in xs])
605
+
606
    def confirm(
        self,
        yes: bool,
        msg: str,
        *args: Any,
        _abort: bool = False,
        _default: bool = False,
        _timeout_s: Optional[float] = None,
        **kwargs: Any,
    ):
        """Display a confirmation dialog.

        Valid answers are "y/yes/true/1" and "n/no/false/0".

        Args:
            yes: If `yes` is `True` the dialog will default to "yes"
                and continue without waiting for user input.
            _abort (bool):
                If `_abort` is `True`,
                "no" means aborting the program.
            _default (bool):
                The default action to take if the user just presses enter
                with no input.
            _timeout_s (float):
                If user has no input within _timeout_s seconds, the default
                action is taken. None means no timeout.
        """
        should_abort = _abort
        default = _default

        # Refuse to block on stdin when not attached to an interactive
        # session and --yes was not given.
        if not self.interactive and not yes:
            # no formatting around --yes here since this is non-interactive
            self.error(
                "This command requires user confirmation. "
                "When running non-interactively, supply --yes to skip."
            )
            raise ValueError("Non-interactive confirm without --yes.")

        if default:
            yn_str = "Y/n"
        else:
            yn_str = "y/N"

        confirm_str = cf.underlined("Confirm [" + yn_str + "]:") + " "

        rendered_message = _format_msg(msg, *args, **kwargs)
        # the rendered message ends with ascii coding
        if rendered_message and not msg.endswith("\n"):
            rendered_message += " "

        # Width of the last rendered line, used to indent re-prompts so they
        # line up under the original question.
        msg_len = len(rendered_message.split("\n")[-1])
        complete_str = rendered_message + confirm_str

        # --yes short-circuits the dialog entirely.
        if yes:
            self._print(complete_str + "y " + cf.dimmed("[automatic, due to --yes]"))
            return True

        self._print(complete_str, _linefeed=False)

        res = None
        yes_answers = ["y", "yes", "true", "1"]
        no_answers = ["n", "no", "false", "0"]
        try:
            # Loop until a recognizable answer (or plain enter) is received.
            while True:
                if _timeout_s is None:
                    ans = sys.stdin.readline()
                elif sys.platform == "win32":
                    # Windows doesn't support select
                    # Poll the keyboard via msvcrt, echoing characters by
                    # hand, until enter is pressed or the timeout elapses.
                    start_time = time.time()
                    ans = ""
                    while True:
                        if (time.time() - start_time) >= _timeout_s:
                            self.newline()
                            ans = "\n"
                            break
                        elif msvcrt.kbhit():
                            ch = msvcrt.getwch()
                            if ch in ("\n", "\r"):
                                self.newline()
                                ans = ans + "\n"
                                break
                            elif ch == "\b":
                                if ans:
                                    ans = ans[:-1]
                                    # Emulate backspace erasing
                                    print("\b \b", end="", flush=True)
                            else:
                                ans = ans + ch
                                print(ch, end="", flush=True)
                        else:
                            time.sleep(0.1)
                else:
                    # POSIX: wait on stdin with select; an empty ready list
                    # means the timeout fired, which counts as plain enter.
                    ready, _, _ = select.select([sys.stdin], [], [], _timeout_s)
                    if not ready:
                        self.newline()
                        ans = "\n"
                    else:
                        ans = sys.stdin.readline()

                ans = ans.lower()

                # Plain enter (or a timeout) selects the default answer.
                if ans == "\n":
                    res = default
                    break

                ans = ans.strip()
                if ans in yes_answers:
                    res = True
                    break
                if ans in no_answers:
                    res = False
                    break

                # Unrecognized input: explain and re-prompt, aligned under
                # the original question.
                indent = " " * msg_len
                self.error(
                    "{}Invalid answer: {}. Expected {} or {}",
                    indent,
                    cf.bold(ans.strip()),
                    self.render_list(yes_answers, "/"),
                    self.render_list(no_answers, "/"),
                )
                self._print(indent + confirm_str, _linefeed=False)
        except KeyboardInterrupt:
            # Ctrl-C counts as choosing the default.
            self.newline()
            res = default

        if not res and should_abort:
            # todo: make sure we tell the user if they
            # need to do cleanup
            self._print("Exiting...")
            raise SilentClickException(
                "Exiting due to the response to confirm(should_abort=True)."
            )

        return res
741
+
742
+ def prompt(self, msg: str, *args, **kwargs):
743
+ """Prompt the user for some text input.
744
+
745
+ Args:
746
+ msg: The mesage to display to the user before the prompt.
747
+
748
+ Returns:
749
+ The string entered by the user.
750
+ """
751
+ complete_str = cf.underlined(msg)
752
+ rendered_message = _format_msg(complete_str, *args, **kwargs)
753
+ # the rendered message ends with ascii coding
754
+ if rendered_message and not msg.endswith("\n"):
755
+ rendered_message += " "
756
+ self._print(rendered_message, linefeed=False)
757
+
758
+ res = ""
759
+ try:
760
+ ans = sys.stdin.readline()
761
+ ans = ans.lower()
762
+ res = ans.strip()
763
+ except KeyboardInterrupt:
764
+ self.newline()
765
+
766
+ return res
767
+
768
+ def flush(self):
769
+ sys.stdout.flush()
770
+ sys.stderr.flush()
771
+
772
+
773
class SilentClickException(click.ClickException):
    """A `ClickException` that suppresses its own output.

    Some of our tooling relies on catching ClickException in particular.

    However the default prints a message, which is undesirable since we expect
    our code to log errors manually using `cli_logger.error()` to allow for
    colors and other formatting.
    """

    def __init__(self, message: str):
        super().__init__(message)

    def show(self, file=None):
        # Intentionally a no-op: the caller already logged the error.
        pass
788
+
789
+
790
# Shared logger instance used throughout the autoscaler CLI.
cli_logger = _CliLogger()

# Click options applied to every CLI command via `add_click_logging_options`.
CLICK_LOGGING_OPTIONS = [
    click.option(
        "--log-style",
        required=False,
        type=click.Choice(cli_logger.VALID_LOG_STYLES, case_sensitive=False),
        default="auto",
        help=(
            "If 'pretty', outputs with formatting and color. If 'record', "
            "outputs record-style without formatting. "
            "'auto' defaults to 'pretty', and disables pretty logging "
            "if stdin is *not* a TTY."
        ),
    ),
    click.option(
        "--log-color",
        required=False,
        type=click.Choice(["auto", "false", "true"], case_sensitive=False),
        default="auto",
        help=("Use color logging. Auto enables color logging if stdout is a TTY."),
    ),
    # Repeatable -v flag; `None` lets `configure` keep the current verbosity.
    click.option("-v", "--verbose", default=None, count=True),
]
814
+
815
+
816
def add_click_logging_options(f: Callable) -> Callable:
    """Decorate a click command with the shared logging options.

    Applies every option in `CLICK_LOGGING_OPTIONS` to `f`, then wraps it so
    the logging flags are consumed to configure `cli_logger` rather than
    forwarded to the command itself.
    """
    decorated = f
    for option in reversed(CLICK_LOGGING_OPTIONS):
        decorated = option(decorated)

    @wraps(decorated)
    def wrapper(*args, log_style=None, log_color=None, verbose=None, **kwargs):
        cli_logger.configure(log_style, log_color, verbose)
        return decorated(*args, **kwargs)

    return wrapper
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python

# This is an executable script that runs an example of every single CliLogger
# function for demonstration purposes. Primarily useful for tuning color and
# other formatting.

from ray.autoscaler._private.cli_logger import cf, cli_logger

cli_logger.configure(log_style="auto", verbosity=999)

cli_logger.print(cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined"))
cli_logger.labeled_value("Label", "value")
cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3]))
cli_logger.newline()
cli_logger.very_verbose("Very verbose")
cli_logger.verbose("Verbose")
cli_logger.verbose_warning("Verbose warning")
cli_logger.verbose_error("Verbose error")
cli_logger.print("Info")
cli_logger.success("Success")
cli_logger.warning("Warning")
cli_logger.error("Error")
cli_logger.newline()
try:
    cli_logger.abort("Abort")
except Exception:
    pass
try:
    cli_logger.doassert(False, "Assert")
except Exception:
    pass
cli_logger.newline()
cli_logger.confirm(True, "example")
cli_logger.newline()
with cli_logger.indented():
    cli_logger.print("Indented")
with cli_logger.group("Group"):
    cli_logger.print("Group contents")
# Fixed typo in the displayed label: "Verbtaim" -> "Verbatim".
with cli_logger.verbatim_error_ctx("Verbatim error"):
    cli_logger.print("Error contents")
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import subprocess
4
+ import sys
5
+ import tarfile
6
+ import tempfile
7
+ import threading
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from contextlib import contextmanager
10
+ from typing import List, Optional, Sequence, Tuple
11
+
12
+ import yaml
13
+
14
+ import ray # noqa: F401
15
+ from ray.autoscaler._private.cli_logger import cli_logger
16
+ from ray.autoscaler._private.providers import _get_node_provider
17
+ from ray.autoscaler.tags import NODE_KIND_HEAD, NODE_KIND_WORKER, TAG_RAY_NODE_KIND
18
+
19
+ # Import psutil after ray so the packaged version is used.
20
+ import psutil
21
+
22
# Cap on concurrent SSH sessions when collecting data from remote nodes.
MAX_PARALLEL_SSH_WORKERS = 8
# Fallbacks used by `_info_from_params` when no SSH user/key is supplied.
DEFAULT_SSH_USER = "ubuntu"
DEFAULT_SSH_KEYS = ["~/ray_bootstrap_key.pem", "~/.ssh/ray-autoscaler_2_us-west-2.pem"]
25
+
26
+
27
class CommandFailed(RuntimeError):
    """Base error for failures while collecting cluster dump data."""
29
+
30
+
31
class LocalCommandFailed(CommandFailed):
    """A data-collection command failed on the local node."""
33
+
34
+
35
class RemoteCommandFailed(CommandFailed):
    """A data-collection command failed on a remote node."""
37
+
38
+
39
class GetParameters:
    """Settings controlling which data categories a dump collects."""

    def __init__(
        self,
        logs: bool = True,
        debug_state: bool = True,
        pip: bool = True,
        processes: bool = True,
        processes_verbose: bool = True,
        processes_list: Optional[List[Tuple[str, bool]]] = None,
    ):
        # Collect Ray session logs.
        self.logs = logs
        # Collect the `debug_state.txt` file.
        self.debug_state = debug_state
        # Record installed pip packages.
        self.pip = pip
        # Capture information about running Ray processes.
        self.processes = processes
        # Show full process command lines instead of just the executable.
        self.processes_verbose = processes_verbose
        # Optional (keyword, filter_by_command_name) process filters.
        self.processes_list = processes_list
55
+
56
+
57
class Node:
    """Node (as in "machine")"""

    def __init__(
        self,
        host: str,
        ssh_user: str = "ubuntu",
        ssh_key: str = "~/ray_bootstrap_key.pem",
        docker_container: Optional[str] = None,
        is_head: bool = False,
    ):
        # Address used to reach the machine over SSH.
        self.host = host
        self.ssh_user = ssh_user
        self.ssh_key = ssh_key
        # Container name when Ray runs inside docker; None otherwise.
        self.docker_container = docker_container
        # Whether this machine is the cluster head node.
        self.is_head = is_head
73
+
74
+
75
class Archive:
    """Archive object to collect and compress files into a single file.

    Objects of this class can be passed around to different data collection
    functions. These functions can use the :meth:`subdir` method to add
    files to a sub directory of the archive.

    """

    def __init__(self, file: Optional[str] = None):
        # Target tar.gz path; a fresh temp file is created when none given.
        self.file = file or tempfile.mkstemp(prefix="ray_logs_", suffix=".tar.gz")[1]
        self.tar = None
        # Serializes `tar.add` calls from concurrent collection threads.
        self._lock = threading.Lock()

    @property
    def is_open(self):
        """Whether the underlying tarfile is currently open for writing."""
        return bool(self.tar)

    def open(self):
        """Open the target file as a gzip-compressed tar for writing."""
        self.tar = tarfile.open(self.file, "w:gz")

    def close(self):
        """Close the tarfile and mark this archive as closed."""
        self.tar.close()
        self.tar = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    @contextmanager
    def subdir(self, subdir: str, root: Optional[str] = "/"):
        """Open a context to add files to the archive.

        Example:

        .. code-block:: python

            with Archive("file.tar.gz") as archive:
                with archive.subdir("logfiles", root="/tmp/logs") as sd:
                    # Will be added as `logfiles/nested/file.txt`
                    sd.add("/tmp/logs/nested/file.txt")

        Args:
            subdir: Subdir to which to add files to. Calling the
                ``add(path)`` command will place files into the ``subdir``
                directory of the archive.
            root: Root path. Files without an explicit ``arcname``
                will be named relatively to this path.

        Yields:
            A context object that can be used to add files to the archive.
        """
        root = os.path.abspath(root)

        class _Context:
            @staticmethod
            def add(path: str, arcname: Optional[str] = None):
                path = os.path.abspath(path)
                arcname = arcname or os.path.join(subdir, os.path.relpath(path, root))

                # Fix: hold the lock with a context manager so it is released
                # even when `tar.add` raises. The previous bare
                # acquire()/release() pair left the lock held forever on
                # error, deadlocking the other collection threads.
                with self._lock:
                    self.tar.add(path, arcname=arcname)

        yield _Context()
143
+
144
+
145
+ ###
146
+ # Functions to gather logs and information on the local node
147
+ ###
148
+
149
+
150
def get_local_ray_logs(
    archive: Archive,
    exclude: Optional[Sequence[str]] = None,
    session_log_dir: str = "/tmp/ray/session_latest",
) -> Archive:
    """Copy local Ray log files into an archive.

    Args:
        archive: Archive object to add log files to.
        exclude: Regex patterns; files whose session-relative path matches
            any pattern are not included in the archive.
        session_log_dir: Path to the Ray session directory; logs are read
            from its ``logs`` subdirectory. Defaults to
            ``/tmp/ray/session_latest``.

    Returns:
        Open archive object.

    """
    if not archive.is_open:
        archive.open()

    patterns = exclude or []

    logs_dir = os.path.join(os.path.expanduser(session_log_dir), "logs")

    with archive.subdir("logs", root=logs_dir) as sd:
        for dirpath, _, filenames in os.walk(logs_dir):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                rel_path = os.path.relpath(full_path, start=logs_dir)
                # Skip files matching any exclude pattern.
                excluded = any(re.match(pattern, rel_path) for pattern in patterns)
                if not excluded:
                    sd.add(full_path)

    return archive
186
+
187
+
188
def get_local_debug_state(
    archive: Archive, session_dir: str = "/tmp/ray/session_latest"
) -> Archive:
    """Copy the local `debug_state.txt` file into an archive.

    Args:
        archive: Archive object to add the file to.
        session_dir: Path to the Ray session files. Defaults to
            ``/tmp/ray/session_latest``

    Returns:
        Open archive object.

    Raises:
        LocalCommandFailed: If the session has no `debug_state.txt`.
    """
    if not archive.is_open:
        archive.open()

    base_dir = os.path.expanduser(session_dir)
    debug_state_file = os.path.join(base_dir, "logs/debug_state.txt")

    if not os.path.exists(debug_state_file):
        raise LocalCommandFailed("No `debug_state.txt` file found.")

    with archive.subdir("", root=base_dir) as sd:
        sd.add(debug_state_file)

    return archive
215
+
216
+
217
def get_local_pip_packages(archive: Archive):
    """Write the currently installed pip packages into the archive.

    Args:
        archive: Archive object to add the package list to.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze

    # Dump the frozen requirements to a temp file, then archive it under the
    # name `pip_packages.txt`.
    with tempfile.NamedTemporaryFile("wt") as fp:
        for requirement in freeze.freeze():
            fp.writelines([requirement, "\n"])

        fp.flush()
        with archive.subdir("") as sd:
            sd.add(fp.name, "pip_packages.txt")

    return archive
243
+
244
+
245
def get_local_ray_processes(
    archive: Archive,
    processes: Optional[List[Tuple[str, bool]]] = None,
    verbose: bool = False,
):
    """Get the status of all the relevant ray processes.
    Args:
        archive: Archive object to add process info files to.
        processes: List of processes to get information on. The first
            element of the tuple is a string to filter by, and the second
            element is a boolean indicating if we should filter by command
            name (True) or command line including parameters (False)
        verbose: If True, show entire executable command line.
            If False, show just the first term.
    Returns:
        Open archive object.
    """
    if not processes:
        # local import to avoid circular dependencies
        from ray.autoscaler._private.constants import RAY_PROCESSES

        processes = RAY_PROCESSES

    # First pass: snapshot every process on the machine as
    # (summary_dict, raw_cmdline) pairs.
    process_infos = []
    for process in psutil.process_iter(["pid", "name", "cmdline", "status"]):
        try:
            with process.oneshot():
                cmdline = " ".join(process.cmdline())
                process_infos.append(
                    (
                        {
                            # Non-verbose mode keeps only the text before the
                            # first "--" flag, minus the trailing space.
                            "executable": cmdline
                            if verbose
                            else cmdline.split("--", 1)[0][:-1],
                            "name": process.name(),
                            "pid": process.pid,
                            "status": process.status(),
                        },
                        process.cmdline(),
                    )
                )
        except Exception as exc:
            # NOTE(review): any psutil failure (e.g. a process exiting
            # mid-iteration) aborts the whole collection -- confirm this
            # all-or-nothing behavior is intended.
            raise LocalCommandFailed(exc) from exc

    # Second pass: keep only processes matching one of the filters, keyed by
    # pid so each process appears at most once.
    relevant_processes = {}
    for process_dict, cmdline in process_infos:
        for keyword, filter_by_cmd in processes:
            if filter_by_cmd:
                corpus = process_dict["name"]
            else:
                corpus = subprocess.list2cmdline(cmdline)
            if keyword in corpus and process_dict["pid"] not in relevant_processes:
                relevant_processes[process_dict["pid"]] = process_dict

    # Serialize the surviving entries as YAML into a temp file and archive it.
    with tempfile.NamedTemporaryFile("wt") as fp:
        for line in relevant_processes.values():
            fp.writelines([yaml.dump(line), "\n"])

        fp.flush()
        with archive.subdir("meta") as sd:
            sd.add(fp.name, "process_info.txt")

    return archive
308
+
309
+
310
def get_all_local_data(archive: Archive, parameters: GetParameters):
    """Collect all requested local data into the archive.

    Gets (subject to `parameters`):
        - The Ray logs of the latest session
        - The `debug_state.txt` file
        - The currently installed pip packages
        - Information on running Ray processes

    Args:
        archive: Archive object to add meta files to.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    # Build the list of enabled collectors in the canonical order.
    collectors = []
    if parameters.logs:
        collectors.append(lambda: get_local_ray_logs(archive=archive))
    if parameters.debug_state:
        collectors.append(lambda: get_local_debug_state(archive=archive))
    if parameters.pip:
        collectors.append(lambda: get_local_pip_packages(archive=archive))
    if parameters.processes:
        collectors.append(
            lambda: get_local_ray_processes(
                archive=archive,
                processes=parameters.processes_list,
                verbose=parameters.processes_verbose,
            )
        )

    # Each collector is best-effort: a failure is logged and the remaining
    # collectors still run.
    for collect in collectors:
        try:
            collect()
        except LocalCommandFailed as exc:
            cli_logger.error(exc)

    return archive
353
+
354
+
355
+ ###
356
+ # Functions to invoke remote scripts and gather data from remote nodes
357
+ ###
358
+
359
+
360
+ def _wrap(items: List[str], quotes="'"):
361
+ return f"{quotes}{' '.join(items)}{quotes}"
362
+
363
+
364
def create_and_get_archive_from_remote_node(
    remote_node: Node, parameters: GetParameters, script_path: str = "ray"
) -> Optional[str]:
    """Create an archive containing logs on a remote node and transfer.

    This will call ``ray local-dump --stream`` on the remote
    node. The resulting file will be saved locally in a temporary file and
    returned.

    Args:
        remote_node: Remote node to gather archive from.
        script_path: Path to this script on the remote node.
        parameters: Parameters (settings) for getting data.

    Returns:
        Path to a temporary file containing the node's collected data.

    Raises:
        RemoteCommandFailed: If the remote SSH command exits non-zero.
    """
    # Base SSH invocation; host key checking is disabled since autoscaler
    # nodes are ephemeral.
    cmd = [
        "ssh",
        "-o StrictHostKeyChecking=no",
        "-o UserKnownHostsFile=/dev/null",
        "-o LogLevel=ERROR",
        "-i",
        remote_node.ssh_key,
        f"{remote_node.ssh_user}@{remote_node.host}",
    ]

    # When Ray runs inside docker, execute the dump inside the container.
    if remote_node.docker_container:
        cmd += [
            "docker",
            "exec",
            remote_node.docker_container,
        ]

    # Translate the GetParameters flags into `ray local-dump` CLI flags.
    collect_cmd = [script_path, "local-dump", "--stream"]
    collect_cmd += ["--logs"] if parameters.logs else ["--no-logs"]
    collect_cmd += ["--debug-state"] if parameters.debug_state else ["--no-debug-state"]
    collect_cmd += ["--pip"] if parameters.pip else ["--no-pip"]
    collect_cmd += ["--processes"] if parameters.processes else ["--no-processes"]
    if parameters.processes:
        # NOTE(review): "--no-proccesses-verbose" looks misspelled (double
        # "c"); verify against the `ray local-dump` CLI flag definition
        # before changing, in case the receiving side matches this spelling.
        collect_cmd += (
            ["--processes-verbose"]
            if parameters.processes_verbose
            else ["--no-proccesses-verbose"]
        )

    cmd += ["/bin/bash", "-c", _wrap(collect_cmd, quotes='"')]

    cat = "node" if not remote_node.is_head else "head"

    cli_logger.print(f"Collecting data from remote node: {remote_node.host}")
    # The streamed tarball is written to a local temp file and returned.
    tmp = tempfile.mkstemp(prefix=f"ray_{cat}_{remote_node.host}_", suffix=".tar.gz")[1]
    with open(tmp, "wb") as fp:
        try:
            subprocess.check_call(cmd, stdout=fp, stderr=sys.stderr)
        except subprocess.CalledProcessError as exc:
            raise RemoteCommandFailed(
                f"Gathering logs from remote node failed: {' '.join(cmd)}"
            ) from exc

    return tmp
426
+
427
+
428
def create_and_add_remote_data_to_local_archive(
    archive: Archive, remote_node: Node, parameters: GetParameters
):
    """Fetch a remote node's dump and embed it in the local archive.

    Args:
        archive: Archive object to add remote data to.
        remote_node: Remote node to gather archive from.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    fetched = create_and_get_archive_from_remote_node(remote_node, parameters)

    if not archive.is_open:
        archive.open()

    node_kind = "head" if remote_node.is_head else "node"

    with archive.subdir("", root=os.path.dirname(fetched)) as sd:
        sd.add(fetched, arcname=f"ray_{node_kind}_{remote_node.host}.tar.gz")

    return archive
452
+
453
+
454
def create_and_add_local_data_to_local_archive(
    archive: Archive, parameters: GetParameters
):
    """Collect this node's data into a nested archive and embed it.

    Args:
        archive: Archive object to add local data to.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    with Archive() as local_data_archive:
        get_all_local_data(local_data_archive, parameters)

    if not archive.is_open:
        archive.open()

    nested_file = local_data_archive.file
    with archive.subdir("", root=os.path.dirname(nested_file)) as sd:
        sd.add(nested_file, arcname="local_node.tar.gz")

    # The nested temporary archive is no longer needed once embedded.
    os.remove(nested_file)

    return archive
478
+
479
+
480
def create_archive_for_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the remote nodes.

    This will parallelize calls to get data from remote nodes. Per-node
    failures are logged and do not prevent other nodes from being collected.

    Args:
        archive: Archive object to add remote data to.
        remote_nodes (Sequence[Node]): Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.

    """
    if not archive.is_open:
        archive.open()

    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SSH_WORKERS) as executor:
        futures = [
            executor.submit(
                create_and_add_remote_data_to_local_archive,
                archive=archive,
                remote_node=remote_node,
                parameters=parameters,
            )
            for remote_node in remote_nodes
        ]

    # Fix: surface per-node failures. The futures were previously discarded,
    # so any exception raised in a worker was silently swallowed; this logs
    # them the same way the local collection path logs CommandFailed.
    for future in futures:
        exc = future.exception()
        if exc is not None:
            cli_logger.error(exc)

    return archive
509
+
510
+
511
def create_archive_for_local_and_remote_nodes(
    archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters
):
    """Create an archive combining data from the local and remote nodes.

    This will parallelize calls to get data from remote nodes.

    Args:
        archive: Archive object to add data to.
        remote_nodes: Sequence of remote nodes.
        parameters: Parameters (settings) for getting data.

    Returns:
        Open archive object.
    """
    if not archive.is_open:
        archive.open()

    # Local collection is best-effort: log the failure and continue with
    # the remote nodes.
    try:
        create_and_add_local_data_to_local_archive(archive, parameters)
    except CommandFailed as exc:
        cli_logger.error(exc)

    create_archive_for_remote_nodes(archive, remote_nodes, parameters)

    cli_logger.print(
        f"Collected data from local node and {len(remote_nodes)} " f"remote nodes."
    )
    return archive
541
+
542
+
543
+ ###
544
+ # Ray cluster info
545
+ ###
546
def get_info_from_ray_cluster_config(
    cluster_config: str,
) -> Tuple[List[str], str, str, Optional[str], Optional[str]]:
    """Get information from Ray cluster config.

    Return list of host IPs, ssh user, ssh key file, and optional docker
    container.

    Args:
        cluster_config: Path to ray cluster config.

    Returns:
        Tuple of list of host IPs, ssh user name, ssh key file path,
        optional docker container name, optional cluster name.
    """
    from ray.autoscaler._private.commands import _bootstrap_config

    cli_logger.print(
        f"Retrieving cluster information from ray cluster file: " f"{cluster_config}"
    )

    cluster_config = os.path.expanduser(cluster_config)

    # Fix: use a context manager so the config file handle is closed
    # deterministically (the previous `open(...).read()` relied on GC).
    with open(cluster_config) as config_file:
        config = yaml.safe_load(config_file)
    config = _bootstrap_config(config, no_config_cache=True)

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    # Head nodes are listed first so the head's IP precedes the workers'.
    head_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})
    worker_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    hosts = [provider.external_ip(node) for node in head_nodes + worker_nodes]
    ssh_user = config["auth"]["ssh_user"]
    ssh_key = config["auth"]["ssh_private_key"]

    # Docker container name, when the cluster runs commands inside docker.
    docker = None
    docker_config = config.get("docker", None)
    if docker_config:
        docker = docker_config.get("container_name", None)

    cluster_name = config.get("cluster_name", None)

    return hosts, ssh_user, ssh_key, docker, cluster_name
588
+
589
+
590
def _info_from_params(
    cluster: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
):
    """Parse command line arguments.

    Note: This returns a list of hosts, not a comma separated string!
    """
    # With neither a host nor a cluster file given, fall back to the
    # bootstrap config the autoscaler drops on head nodes (if present).
    if not host and not cluster:
        bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml")
        if os.path.exists(bootstrap_config):
            cluster = bootstrap_config
            cli_logger.warning(
                f"Detected cluster config file at {cluster}. "
                f"If this is incorrect, specify with "
                f"`ray cluster-dump <config>`"
            )
    elif cluster:
        cluster = os.path.expanduser(cluster)

    cluster_name = None

    if cluster:
        # Cluster file values only fill in parameters the caller left unset.
        h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster)

        ssh_user = ssh_user or u
        ssh_key = ssh_key or k
        docker = docker or d
        # An explicit --host list overrides the hosts from the cluster file.
        hosts = host.split(",") if host else h

        if not hosts:
            raise LocalCommandFailed(
                f"Invalid cluster file or cluster has no running nodes: " f"{cluster}"
            )
    elif host:
        hosts = host.split(",")
    else:
        # No cluster file was found/given and no host was specified.
        raise LocalCommandFailed(
            "You need to either specify a `<cluster_config>` or `--host`."
        )

    # Fall back to the default SSH user when none was resolved above.
    if not ssh_user:
        ssh_user = DEFAULT_SSH_USER
        cli_logger.warning(
            f"Using default SSH user `{ssh_user}`. "
            f"If this is incorrect, specify with `--ssh-user <user>`"
        )

    # Probe the well-known key locations; the first existing file wins.
    if not ssh_key:
        for cand_key in DEFAULT_SSH_KEYS:
            cand_key_file = os.path.expanduser(cand_key)
            if os.path.exists(cand_key_file):
                ssh_key = cand_key_file
                cli_logger.warning(
                    f"Auto detected SSH key file: {ssh_key}. "
                    f"If this is incorrect, specify with `--ssh-key <key>`"
                )
                break

    return cluster, hosts, ssh_user, ssh_key, docker, cluster_name
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py ADDED
@@ -0,0 +1,921 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from getpass import getuser
9
+ from shlex import quote
10
+ from typing import Dict, List
11
+
12
+ import click
13
+
14
+ from ray._private.ray_constants import DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES
15
+ from ray.autoscaler._private.cli_logger import cf, cli_logger
16
+ from ray.autoscaler._private.constants import (
17
+ AUTOSCALER_NODE_SSH_INTERVAL_S,
18
+ AUTOSCALER_NODE_START_WAIT_S,
19
+ DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
20
+ )
21
+ from ray.autoscaler._private.docker import (
22
+ check_bind_mounts_cmd,
23
+ check_docker_image,
24
+ check_docker_running_cmd,
25
+ docker_start_cmds,
26
+ with_docker_exec,
27
+ )
28
+ from ray.autoscaler._private.log_timer import LogTimer
29
+ from ray.autoscaler._private.subprocess_output_util import (
30
+ ProcessRunnerError,
31
+ is_output_redirected,
32
+ run_cmd_redirected,
33
+ )
34
+ from ray.autoscaler.command_runner import CommandRunnerInterface
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ # How long to wait for a node to start, in seconds
39
+ HASH_MAX_LENGTH = 10
40
+ KUBECTL_RSYNC = os.path.join(
41
+ os.path.dirname(os.path.abspath(__file__)), "_kubernetes/kubectl-rsync.sh"
42
+ )
43
+ MAX_HOME_RETRIES = 3
44
+ HOME_RETRY_DELAY_S = 5
45
+
46
+ _config = {"use_login_shells": True, "silent_rsync": True}
47
+
48
+
49
+ def is_rsync_silent():
50
+ return _config["silent_rsync"]
51
+
52
+
53
+ def set_rsync_silent(val):
54
+ """Choose whether to silence rsync output.
55
+
56
+ Most commands will want to list rsync'd files themselves rather than
57
+ print the default rsync spew.
58
+ """
59
+ _config["silent_rsync"] = val
60
+
61
+
62
+ def is_using_login_shells():
63
+ return _config["use_login_shells"]
64
+
65
+
66
+ def set_using_login_shells(val: bool):
67
+ """Choose between login and non-interactive shells.
68
+
69
+ Non-interactive shells have the benefit of receiving less output from
70
+ subcommands (since progress bars and TTY control codes are not printed).
71
+ Sometimes this can be significant since e.g. `pip install` prints
72
+ hundreds of progress bar lines when downloading.
73
+
74
+ Login shells have the benefit of working very close to how a proper bash
75
+ session does, regarding how scripts execute and how the environment is
76
+ setup. This is also how all commands were ran in the past. The only reason
77
+ to use login shells over non-interactive shells is if you need some weird
78
+ and non-robust tool to work.
79
+
80
+ Args:
81
+ val: If true, login shells will be used to run all commands.
82
+ """
83
+ _config["use_login_shells"] = val
84
+
85
+
86
+ def _with_environment_variables(cmd: str, environment_variables: Dict[str, object]):
87
+ """Prepend environment variables to a shell command.
88
+
89
+ Args:
90
+ cmd: The base command.
91
+ environment_variables (Dict[str, object]): The set of environment
92
+ variables. If an environment variable value is a dict, it will
93
+ automatically be converted to a one line yaml string.
94
+ """
95
+
96
+ as_strings = []
97
+ for key, val in environment_variables.items():
98
+ val = json.dumps(val, separators=(",", ":"))
99
+ s = "export {}={};".format(key, quote(val))
100
+ as_strings.append(s)
101
+ all_vars = "".join(as_strings)
102
+ return all_vars + cmd
103
+
104
+
105
+ def _with_interactive(cmd):
106
+ force_interactive = (
107
+ f"source ~/.bashrc; "
108
+ f"export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({cmd})"
109
+ )
110
+ return ["bash", "--login", "-c", "-i", quote(force_interactive)]
111
+
112
+
113
+ class SSHOptions:
114
+ def __init__(self, ssh_key, control_path=None, **kwargs):
115
+ self.ssh_key = ssh_key
116
+ self.arg_dict = {
117
+ # Supresses initial fingerprint verification.
118
+ "StrictHostKeyChecking": "no",
119
+ # SSH IP and fingerprint pairs no longer added to known_hosts.
120
+ # This is to remove a "REMOTE HOST IDENTIFICATION HAS CHANGED"
121
+ # warning if a new node has the same IP as a previously
122
+ # deleted node, because the fingerprints will not match in
123
+ # that case.
124
+ "UserKnownHostsFile": os.devnull,
125
+ # Try fewer extraneous key pairs.
126
+ "IdentitiesOnly": "yes",
127
+ # Abort if port forwarding fails (instead of just printing to
128
+ # stderr).
129
+ "ExitOnForwardFailure": "yes",
130
+ # Quickly kill the connection if network connection breaks (as
131
+ # opposed to hanging/blocking).
132
+ "ServerAliveInterval": 5,
133
+ "ServerAliveCountMax": 3,
134
+ }
135
+ if control_path:
136
+ self.arg_dict.update(
137
+ {
138
+ "ControlMaster": "auto",
139
+ "ControlPath": "{}/%C".format(control_path),
140
+ "ControlPersist": "10s",
141
+ }
142
+ )
143
+ self.arg_dict.update(kwargs)
144
+
145
+ def to_ssh_options_list(self, *, timeout=60):
146
+ self.arg_dict["ConnectTimeout"] = "{}s".format(timeout)
147
+ ssh_key_option = ["-i", self.ssh_key] if self.ssh_key else []
148
+ return ssh_key_option + [
149
+ x
150
+ for y in (
151
+ ["-o", "{}={}".format(k, v)]
152
+ for k, v in self.arg_dict.items()
153
+ if v is not None
154
+ )
155
+ for x in y
156
+ ]
157
+
158
+
159
+ class SSHCommandRunner(CommandRunnerInterface):
160
+ def __init__(
161
+ self,
162
+ log_prefix,
163
+ node_id,
164
+ provider,
165
+ auth_config,
166
+ cluster_name,
167
+ process_runner,
168
+ use_internal_ip,
169
+ ):
170
+
171
+ ssh_control_hash = hashlib.sha1(cluster_name.encode()).hexdigest()
172
+ ssh_user_hash = hashlib.sha1(getuser().encode()).hexdigest()
173
+ ssh_control_path = "/tmp/ray_ssh_{}/{}".format(
174
+ ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH]
175
+ )
176
+
177
+ self.cluster_name = cluster_name
178
+ self.log_prefix = log_prefix
179
+ self.process_runner = process_runner
180
+ self.node_id = node_id
181
+ self.use_internal_ip = use_internal_ip
182
+ self.provider = provider
183
+ self.ssh_private_key = auth_config.get("ssh_private_key")
184
+ self.ssh_user = auth_config["ssh_user"]
185
+ self.ssh_control_path = ssh_control_path
186
+ self.ssh_ip = None
187
+ self.ssh_proxy_command = auth_config.get("ssh_proxy_command", None)
188
+ self.ssh_options = SSHOptions(
189
+ self.ssh_private_key,
190
+ self.ssh_control_path,
191
+ ProxyCommand=self.ssh_proxy_command,
192
+ )
193
+
194
+ def _get_node_ip(self):
195
+ if self.use_internal_ip:
196
+ return self.provider.internal_ip(self.node_id)
197
+ else:
198
+ return self.provider.external_ip(self.node_id)
199
+
200
+ def _wait_for_ip(self, deadline):
201
+ # if we have IP do not print waiting info
202
+ ip = self._get_node_ip()
203
+ if ip is not None:
204
+ cli_logger.labeled_value("Fetched IP", ip)
205
+ return ip
206
+
207
+ interval = AUTOSCALER_NODE_SSH_INTERVAL_S
208
+ with cli_logger.group("Waiting for IP"):
209
+ while time.time() < deadline and not self.provider.is_terminated(
210
+ self.node_id
211
+ ):
212
+ ip = self._get_node_ip()
213
+ if ip is not None:
214
+ cli_logger.labeled_value("Received", ip)
215
+ return ip
216
+ cli_logger.print(
217
+ "Not yet available, retrying in {} seconds", cf.bold(str(interval))
218
+ )
219
+ time.sleep(interval)
220
+
221
+ return None
222
+
223
+ def _set_ssh_ip_if_required(self):
224
+ if self.ssh_ip is not None:
225
+ return
226
+
227
+ # We assume that this never changes.
228
+ # I think that's reasonable.
229
+ deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S
230
+ with LogTimer(self.log_prefix + "Got IP"):
231
+ ip = self._wait_for_ip(deadline)
232
+
233
+ cli_logger.doassert(ip is not None, "Could not get node IP.") # todo: msg
234
+ assert ip is not None, "Unable to find IP of node"
235
+
236
+ self.ssh_ip = ip
237
+
238
+ # This should run before any SSH commands and therefore ensure that
239
+ # the ControlPath directory exists, allowing SSH to maintain
240
+ # persistent sessions later on.
241
+ try:
242
+ os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True)
243
+ except OSError as e:
244
+ cli_logger.warning("{}", str(e)) # todo: msg
245
+
246
+ def _run_helper(
247
+ self, final_cmd, with_output=False, exit_on_fail=False, silent=False
248
+ ):
249
+ """Run a command that was already setup with SSH and `bash` settings.
250
+
251
+ Args:
252
+ cmd (List[str]):
253
+ Full command to run. Should include SSH options and other
254
+ processing that we do.
255
+ with_output (bool):
256
+ If `with_output` is `True`, command stdout will be captured and
257
+ returned.
258
+ exit_on_fail (bool):
259
+ If `exit_on_fail` is `True`, the process will exit
260
+ if the command fails (exits with a code other than 0).
261
+
262
+ Raises:
263
+ ProcessRunnerError if using new log style and disabled
264
+ login shells.
265
+ click.ClickException if using login shells.
266
+ """
267
+ try:
268
+ # For now, if the output is needed we just skip the new logic.
269
+ # In the future we could update the new logic to support
270
+ # capturing output, but it is probably not needed.
271
+ if not with_output:
272
+ return run_cmd_redirected(
273
+ final_cmd,
274
+ process_runner=self.process_runner,
275
+ silent=silent,
276
+ use_login_shells=is_using_login_shells(),
277
+ )
278
+ else:
279
+ return self.process_runner.check_output(final_cmd)
280
+ except subprocess.CalledProcessError as e:
281
+ joined_cmd = " ".join(final_cmd)
282
+ if not is_using_login_shells():
283
+ raise ProcessRunnerError(
284
+ "Command failed",
285
+ "ssh_command_failed",
286
+ code=e.returncode,
287
+ command=joined_cmd,
288
+ )
289
+
290
+ if exit_on_fail:
291
+ raise click.ClickException(
292
+ "Command failed:\n\n {}\n".format(joined_cmd)
293
+ ) from None
294
+ else:
295
+ fail_msg = "SSH command failed."
296
+ if is_output_redirected():
297
+ fail_msg += " See above for the output from the failure."
298
+ raise click.ClickException(fail_msg) from None
299
+ finally:
300
+ # Do our best to flush output to terminal.
301
+ # See https://github.com/ray-project/ray/pull/19473.
302
+ sys.stdout.flush()
303
+ sys.stderr.flush()
304
+
305
+ def run(
306
+ self,
307
+ cmd,
308
+ timeout=120,
309
+ exit_on_fail=False,
310
+ port_forward=None,
311
+ with_output=False,
312
+ environment_variables: Dict[str, object] = None,
313
+ run_env="auto", # Unused argument.
314
+ ssh_options_override_ssh_key="",
315
+ shutdown_after_run=False,
316
+ silent=False,
317
+ ):
318
+ if shutdown_after_run:
319
+ cmd += "; sudo shutdown -h now"
320
+
321
+ if ssh_options_override_ssh_key:
322
+ if self.ssh_proxy_command:
323
+ ssh_options = SSHOptions(
324
+ ssh_options_override_ssh_key, ProxyCommand=self.ssh_proxy_command
325
+ )
326
+ else:
327
+ ssh_options = SSHOptions(ssh_options_override_ssh_key)
328
+ else:
329
+ ssh_options = self.ssh_options
330
+
331
+ assert isinstance(
332
+ ssh_options, SSHOptions
333
+ ), "ssh_options must be of type SSHOptions, got {}".format(type(ssh_options))
334
+
335
+ self._set_ssh_ip_if_required()
336
+
337
+ if is_using_login_shells():
338
+ ssh = ["ssh", "-tt"]
339
+ else:
340
+ ssh = ["ssh"]
341
+
342
+ if port_forward:
343
+ with cli_logger.group("Forwarding ports"):
344
+ if not isinstance(port_forward, list):
345
+ port_forward = [port_forward]
346
+ for local, remote in port_forward:
347
+ cli_logger.verbose(
348
+ "Forwarding port {} to port {} on localhost.",
349
+ cf.bold(local),
350
+ cf.bold(remote),
351
+ ) # todo: msg
352
+ ssh += ["-L", "{}:localhost:{}".format(local, remote)]
353
+
354
+ final_cmd = (
355
+ ssh
356
+ + ssh_options.to_ssh_options_list(timeout=timeout)
357
+ + ["{}@{}".format(self.ssh_user, self.ssh_ip)]
358
+ )
359
+ if cmd:
360
+ if environment_variables:
361
+ cmd = _with_environment_variables(cmd, environment_variables)
362
+ if is_using_login_shells():
363
+ final_cmd += _with_interactive(cmd)
364
+ else:
365
+ final_cmd += [cmd]
366
+ else:
367
+ # We do this because `-o ControlMaster` causes the `-N` flag to
368
+ # still create an interactive shell in some ssh versions.
369
+ final_cmd.append("while true; do sleep 86400; done")
370
+
371
+ cli_logger.verbose("Running `{}`", cf.bold(cmd))
372
+ with cli_logger.indented():
373
+ cli_logger.very_verbose(
374
+ "Full command is `{}`", cf.bold(" ".join(final_cmd))
375
+ )
376
+
377
+ if cli_logger.verbosity > 0:
378
+ with cli_logger.indented():
379
+ return self._run_helper(
380
+ final_cmd, with_output, exit_on_fail, silent=silent
381
+ )
382
+ else:
383
+ return self._run_helper(final_cmd, with_output, exit_on_fail, silent=silent)
384
+
385
+ def _create_rsync_filter_args(self, options):
386
+ rsync_excludes = options.get("rsync_exclude") or []
387
+ rsync_filters = options.get("rsync_filter") or []
388
+
389
+ exclude_args = [
390
+ ["--exclude", rsync_exclude] for rsync_exclude in rsync_excludes
391
+ ]
392
+ filter_args = [
393
+ ["--filter", "dir-merge,- {}".format(rsync_filter)]
394
+ for rsync_filter in rsync_filters
395
+ ]
396
+
397
+ # Combine and flatten the two lists
398
+ return [arg for args_list in exclude_args + filter_args for arg in args_list]
399
+
400
+ def run_rsync_up(self, source, target, options=None):
401
+ self._set_ssh_ip_if_required()
402
+ options = options or {}
403
+
404
+ command = ["rsync"]
405
+ command += [
406
+ "--rsh",
407
+ subprocess.list2cmdline(
408
+ ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
409
+ ),
410
+ ]
411
+ command += ["-avz"]
412
+ command += self._create_rsync_filter_args(options=options)
413
+ command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)]
414
+ cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
415
+ self._run_helper(command, silent=is_rsync_silent())
416
+
417
+ def run_rsync_down(self, source, target, options=None):
418
+ self._set_ssh_ip_if_required()
419
+
420
+ command = ["rsync"]
421
+ command += [
422
+ "--rsh",
423
+ subprocess.list2cmdline(
424
+ ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120)
425
+ ),
426
+ ]
427
+ command += ["-avz"]
428
+ command += self._create_rsync_filter_args(options=options)
429
+ command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target]
430
+ cli_logger.verbose("Running `{}`", cf.bold(" ".join(command)))
431
+ self._run_helper(command, silent=is_rsync_silent())
432
+
433
+ def remote_shell_command_str(self):
434
+ if self.ssh_private_key:
435
+ return "ssh -o IdentitiesOnly=yes -i {} {}@{}\n".format(
436
+ self.ssh_private_key, self.ssh_user, self.ssh_ip
437
+ )
438
+ else:
439
+ return "ssh -o IdentitiesOnly=yes {}@{}\n".format(
440
+ self.ssh_user, self.ssh_ip
441
+ )
442
+
443
+
444
+ class DockerCommandRunner(CommandRunnerInterface):
445
+ def __init__(self, docker_config, **common_args):
446
+ self.ssh_command_runner = SSHCommandRunner(**common_args)
447
+ self.container_name = docker_config["container_name"]
448
+ self.docker_config = docker_config
449
+ self.home_dir = None
450
+ self.initialized = False
451
+ # Optionally use 'podman' instead of 'docker'
452
+ use_podman = docker_config.get("use_podman", False)
453
+ self.docker_cmd = "podman" if use_podman else "docker"
454
+
455
+ def run(
456
+ self,
457
+ cmd,
458
+ timeout=120,
459
+ exit_on_fail=False,
460
+ port_forward=None,
461
+ with_output=False,
462
+ environment_variables: Dict[str, object] = None,
463
+ run_env="auto",
464
+ ssh_options_override_ssh_key="",
465
+ shutdown_after_run=False,
466
+ ):
467
+ if run_env == "auto":
468
+ run_env = (
469
+ "host"
470
+ if (not bool(cmd) or cmd.find(self.docker_cmd) == 0)
471
+ else self.docker_cmd
472
+ )
473
+
474
+ if environment_variables:
475
+ cmd = _with_environment_variables(cmd, environment_variables)
476
+
477
+ if run_env == "docker":
478
+ cmd = self._docker_expand_user(cmd, any_char=True)
479
+ if is_using_login_shells():
480
+ cmd = " ".join(_with_interactive(cmd))
481
+ cmd = with_docker_exec(
482
+ [cmd],
483
+ container_name=self.container_name,
484
+ with_interactive=is_using_login_shells(),
485
+ docker_cmd=self.docker_cmd,
486
+ )[0]
487
+
488
+ if shutdown_after_run:
489
+ # sudo shutdown should run after `with_docker_exec` command above
490
+ cmd += "; sudo shutdown -h now"
491
+ # Do not pass shutdown_after_run argument to ssh_command_runner.run()
492
+ # since it is handled above.
493
+ return self.ssh_command_runner.run(
494
+ cmd,
495
+ timeout=timeout,
496
+ exit_on_fail=exit_on_fail,
497
+ port_forward=port_forward,
498
+ with_output=with_output,
499
+ ssh_options_override_ssh_key=ssh_options_override_ssh_key,
500
+ )
501
+
502
+ def run_rsync_up(self, source, target, options=None):
503
+ options = options or {}
504
+ host_destination = os.path.join(
505
+ self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
506
+ target.lstrip("/"),
507
+ )
508
+
509
+ host_mount_location = os.path.dirname(host_destination.rstrip("/"))
510
+ self.ssh_command_runner.run(
511
+ f"mkdir -p {host_mount_location} && chown -R "
512
+ f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
513
+ silent=is_rsync_silent(),
514
+ )
515
+
516
+ self.ssh_command_runner.run_rsync_up(source, host_destination, options=options)
517
+ if self._check_container_status() and not options.get(
518
+ "docker_mount_if_possible", False
519
+ ):
520
+ if os.path.isdir(source):
521
+ # Adding a "." means that docker copies the *contents*
522
+ # Without it, docker copies the source *into* the target
523
+ host_destination += "/."
524
+
525
+ # This path may not exist inside the container. This ensures
526
+ # that the path is created!
527
+ prefix = with_docker_exec(
528
+ [
529
+ "mkdir -p {}".format(
530
+ os.path.dirname(self._docker_expand_user(target))
531
+ )
532
+ ],
533
+ container_name=self.container_name,
534
+ with_interactive=is_using_login_shells(),
535
+ docker_cmd=self.docker_cmd,
536
+ )[0]
537
+
538
+ self.ssh_command_runner.run(
539
+ "{} && rsync -e '{} exec -i' -avz {} {}:{}".format(
540
+ prefix,
541
+ self.docker_cmd,
542
+ host_destination,
543
+ self.container_name,
544
+ self._docker_expand_user(target),
545
+ ),
546
+ silent=is_rsync_silent(),
547
+ )
548
+
549
+ def run_rsync_down(self, source, target, options=None):
550
+ options = options or {}
551
+ host_source = os.path.join(
552
+ self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name),
553
+ source.lstrip("/"),
554
+ )
555
+ host_mount_location = os.path.dirname(host_source.rstrip("/"))
556
+ self.ssh_command_runner.run(
557
+ f"mkdir -p {host_mount_location} && chown -R "
558
+ f"{self.ssh_command_runner.ssh_user} {host_mount_location}",
559
+ silent=is_rsync_silent(),
560
+ )
561
+ if source[-1] == "/":
562
+ source += "."
563
+ # Adding a "." means that docker copies the *contents*
564
+ # Without it, docker copies the source *into* the target
565
+ if not options.get("docker_mount_if_possible", False):
566
+ # NOTE: `--delete` is okay here because the container is the source
567
+ # of truth.
568
+ self.ssh_command_runner.run(
569
+ "rsync -e '{} exec -i' -avz --delete {}:{} {}".format(
570
+ self.docker_cmd,
571
+ self.container_name,
572
+ self._docker_expand_user(source),
573
+ host_source,
574
+ ),
575
+ silent=is_rsync_silent(),
576
+ )
577
+ self.ssh_command_runner.run_rsync_down(host_source, target, options=options)
578
+
579
+ def remote_shell_command_str(self):
580
+ inner_str = (
581
+ self.ssh_command_runner.remote_shell_command_str()
582
+ .replace("ssh", "ssh -tt", 1)
583
+ .strip("\n")
584
+ )
585
+ return inner_str + " {} exec -it {} /bin/bash\n".format(
586
+ self.docker_cmd, self.container_name
587
+ )
588
+
589
+ def _check_docker_installed(self):
590
+ no_exist = "NoExist"
591
+ output = self.ssh_command_runner.run(
592
+ f"command -v {self.docker_cmd} || echo '{no_exist}'", with_output=True
593
+ )
594
+ cleaned_output = output.decode().strip()
595
+ if no_exist in cleaned_output or "docker" not in cleaned_output:
596
+ if self.docker_cmd == "docker":
597
+ install_commands = [
598
+ "curl -fsSL https://get.docker.com -o get-docker.sh",
599
+ "sudo sh get-docker.sh",
600
+ "sudo usermod -aG docker $USER",
601
+ "sudo systemctl restart docker -f",
602
+ ]
603
+ else:
604
+ install_commands = [
605
+ "sudo apt-get update",
606
+ "sudo apt-get -y install podman",
607
+ ]
608
+
609
+ logger.error(
610
+ f"{self.docker_cmd.capitalize()} not installed. You can "
611
+ f"install {self.docker_cmd.capitalize()} by adding the "
612
+ "following commands to 'initialization_commands':\n"
613
+ + "\n".join(install_commands)
614
+ )
615
+
616
+ def _check_container_status(self):
617
+ if self.initialized:
618
+ return True
619
+ output = (
620
+ self.ssh_command_runner.run(
621
+ check_docker_running_cmd(self.container_name, self.docker_cmd),
622
+ with_output=True,
623
+ )
624
+ .decode("utf-8")
625
+ .strip()
626
+ )
627
+ # Checks for the false positive where "true" is in the container name
628
+ return "true" in output.lower() and "no such object" not in output.lower()
629
+
630
+ def _docker_expand_user(self, string, any_char=False):
631
+ user_pos = string.find("~")
632
+ if user_pos > -1:
633
+ if self.home_dir is None:
634
+ self.home_dir = (
635
+ self.ssh_command_runner.run(
636
+ f"{self.docker_cmd} exec {self.container_name} "
637
+ "printenv HOME",
638
+ with_output=True,
639
+ )
640
+ .decode("utf-8")
641
+ .strip()
642
+ )
643
+
644
+ if any_char:
645
+ return string.replace("~/", self.home_dir + "/")
646
+
647
+ elif not any_char and user_pos == 0:
648
+ return string.replace("~", self.home_dir, 1)
649
+
650
+ return string
651
+
652
+ def _check_if_container_restart_is_needed(
653
+ self, image: str, cleaned_bind_mounts: Dict[str, str]
654
+ ) -> bool:
655
+ re_init_required = False
656
+ running_image = (
657
+ self.run(
658
+ check_docker_image(self.container_name, self.docker_cmd),
659
+ with_output=True,
660
+ run_env="host",
661
+ )
662
+ .decode("utf-8")
663
+ .strip()
664
+ )
665
+ if running_image != image:
666
+ cli_logger.error(
667
+ "A container with name {} is running image {} instead "
668
+ + "of {} (which was provided in the YAML)",
669
+ self.container_name,
670
+ running_image,
671
+ image,
672
+ )
673
+ mounts = (
674
+ self.run(
675
+ check_bind_mounts_cmd(self.container_name, self.docker_cmd),
676
+ with_output=True,
677
+ run_env="host",
678
+ )
679
+ .decode("utf-8")
680
+ .strip()
681
+ )
682
+ try:
683
+ active_mounts = json.loads(mounts)
684
+ active_remote_mounts = {
685
+ mnt["Destination"].strip("/") for mnt in active_mounts
686
+ }
687
+ # Ignore ray bootstrap files.
688
+ requested_remote_mounts = {
689
+ self._docker_expand_user(remote).strip("/")
690
+ for remote in cleaned_bind_mounts.keys()
691
+ }
692
+ unfulfilled_mounts = requested_remote_mounts - active_remote_mounts
693
+ if unfulfilled_mounts:
694
+ re_init_required = True
695
+ cli_logger.warning(
696
+ "This Docker Container is already running. "
697
+ "Restarting the Docker container on "
698
+ "this node to pick up the following file_mounts {}",
699
+ unfulfilled_mounts,
700
+ )
701
+ except json.JSONDecodeError:
702
+ cli_logger.verbose(
703
+ "Unable to check if file_mounts specified in the YAML "
704
+ "differ from those on the running container."
705
+ )
706
+ return re_init_required
707
+
708
+ def run_init(
709
+ self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
710
+ ):
711
+ BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"]
712
+
713
+ specific_image = self.docker_config.get(
714
+ f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image")
715
+ )
716
+
717
+ self._check_docker_installed()
718
+ if self.docker_config.get("pull_before_run", True):
719
+ assert specific_image, (
720
+ "Image must be included in config if " + "pull_before_run is specified"
721
+ )
722
+ self.run(
723
+ "{} pull {}".format(self.docker_cmd, specific_image), run_env="host"
724
+ )
725
+ else:
726
+
727
+ self.run(
728
+ f"{self.docker_cmd} image inspect {specific_image} "
729
+ "1> /dev/null 2>&1 || "
730
+ f"{self.docker_cmd} pull {specific_image}"
731
+ )
732
+
733
+ # Bootstrap files cannot be bind mounted because docker opens the
734
+ # underlying inode. When the file is switched, docker becomes outdated.
735
+ cleaned_bind_mounts = file_mounts.copy()
736
+ for mnt in BOOTSTRAP_MOUNTS:
737
+ cleaned_bind_mounts.pop(mnt, None)
738
+
739
+ docker_run_executed = False
740
+
741
+ container_running = self._check_container_status()
742
+ requires_re_init = False
743
+ if container_running:
744
+ requires_re_init = self._check_if_container_restart_is_needed(
745
+ specific_image, cleaned_bind_mounts
746
+ )
747
+ if requires_re_init:
748
+ self.run(
749
+ f"{self.docker_cmd} stop {self.container_name}", run_env="host"
750
+ )
751
+
752
+ if (not container_running) or requires_re_init:
753
+ if not sync_run_yet:
754
+ # Do not start the actual image as we need to run file_sync
755
+ # first to ensure that all folders are created with the
756
+ # correct ownership. Docker will create the folders with
757
+ # `root` as the owner.
758
+ return True
759
+ # Get home directory
760
+ image_env = (
761
+ self.ssh_command_runner.run(
762
+ f"{self.docker_cmd} "
763
+ + "inspect -f '{{json .Config.Env}}' "
764
+ + specific_image,
765
+ with_output=True,
766
+ )
767
+ .decode()
768
+ .strip()
769
+ )
770
+ home_directory = "/root"
771
+ try:
772
+ for env_var in json.loads(image_env):
773
+ if env_var.startswith("HOME="):
774
+ home_directory = env_var.split("HOME=")[1]
775
+ break
776
+ except json.JSONDecodeError as e:
777
+ cli_logger.error(
778
+ "Unable to deserialize `image_env` to Python object. "
779
+ f"The `image_env` is:\n{image_env}"
780
+ )
781
+ raise e
782
+
783
+ user_docker_run_options = self.docker_config.get(
784
+ "run_options", []
785
+ ) + self.docker_config.get(
786
+ f"{'head' if as_head else 'worker'}_run_options", []
787
+ )
788
+ start_command = docker_start_cmds(
789
+ self.ssh_command_runner.ssh_user,
790
+ specific_image,
791
+ cleaned_bind_mounts,
792
+ self.container_name,
793
+ self._configure_runtime(
794
+ self._auto_configure_shm(user_docker_run_options)
795
+ ),
796
+ self.ssh_command_runner.cluster_name,
797
+ home_directory,
798
+ self.docker_cmd,
799
+ )
800
+ self.run(start_command, run_env="host")
801
+ docker_run_executed = True
802
+
803
+ # Explicitly copy in ray bootstrap files.
804
+ for mount in BOOTSTRAP_MOUNTS:
805
+ if mount in file_mounts:
806
+ if not sync_run_yet:
807
+ # NOTE(ilr) This rsync is needed because when starting from
808
+ # a stopped instance, /tmp may be deleted and `run_init`
809
+ # is called before the first `file_sync` happens
810
+ self.run_rsync_up(file_mounts[mount], mount)
811
+ self.ssh_command_runner.run(
812
+ "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format(
813
+ cmd=self.docker_cmd,
814
+ src=os.path.join(
815
+ self._get_docker_host_mount_location(
816
+ self.ssh_command_runner.cluster_name
817
+ ),
818
+ mount,
819
+ ),
820
+ container=self.container_name,
821
+ dst=self._docker_expand_user(mount),
822
+ )
823
+ )
824
+ try:
825
+ # Check if the current user has read permission.
826
+ # If they do not, try to change ownership!
827
+ self.run(
828
+ f"cat {mount} >/dev/null 2>&1 || "
829
+ f"sudo chown $(id -u):$(id -g) {mount}"
830
+ )
831
+ except Exception:
832
+ lsl_string = (
833
+ self.run(f"ls -l {mount}", with_output=True)
834
+ .decode("utf-8")
835
+ .strip()
836
+ )
837
+ # The string is of format <Permission> <Links>
838
+ # <Owner> <Group> <Size> <Date> <Name>
839
+ permissions = lsl_string.split(" ")[0]
840
+ owner = lsl_string.split(" ")[2]
841
+ group = lsl_string.split(" ")[3]
842
+ current_user = (
843
+ self.run("whoami", with_output=True).decode("utf-8").strip()
844
+ )
845
+ cli_logger.warning(
846
+ f"File ({mount}) is owned by user:{owner} and group:"
847
+ f"{group} with permissions ({permissions}). The "
848
+ f"current user ({current_user}) does not have "
849
+ "permission to read these files, and Ray may not be "
850
+ "able to autoscale. This can be resolved by "
851
+ "installing `sudo` in your container, or adding a "
852
+ f"command like 'chown {current_user} {mount}' to "
853
+ "your `setup_commands`."
854
+ )
855
+ self.initialized = True
856
+ return docker_run_executed
857
+
858
+ def _configure_runtime(self, run_options: List[str]) -> List[str]:
859
+ if self.docker_config.get("disable_automatic_runtime_detection"):
860
+ return run_options
861
+
862
+ runtime_output = (
863
+ self.ssh_command_runner.run(
864
+ f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", with_output=True
865
+ )
866
+ .decode()
867
+ .strip()
868
+ )
869
+ if "nvidia-container-runtime" in runtime_output:
870
+ try:
871
+ self.ssh_command_runner.run("nvidia-smi", with_output=False)
872
+ return run_options + ["--runtime=nvidia"]
873
+ except Exception as e:
874
+ logger.warning(
875
+ "Nvidia Container Runtime is present, but no GPUs found."
876
+ )
877
+ logger.debug(f"nvidia-smi error: {e}")
878
+ return run_options
879
+
880
+ return run_options
881
+
882
+ def _auto_configure_shm(self, run_options: List[str]) -> List[str]:
883
+ if self.docker_config.get("disable_shm_size_detection"):
884
+ return run_options
885
+ for run_opt in run_options:
886
+ if "--shm-size" in run_opt:
887
+ logger.info(
888
+ "Bypassing automatic SHM-Detection because of "
889
+ f"`run_option`: {run_opt}"
890
+ )
891
+ return run_options
892
+ try:
893
+ shm_output = (
894
+ self.ssh_command_runner.run(
895
+ "cat /proc/meminfo || true", with_output=True
896
+ )
897
+ .decode()
898
+ .strip()
899
+ )
900
+ available_memory = int(
901
+ [ln for ln in shm_output.split("\n") if "MemAvailable" in ln][
902
+ 0
903
+ ].split()[1]
904
+ )
905
+ available_memory_bytes = available_memory * 1024
906
+ # Overestimate SHM size by 10%
907
+ shm_size = min(
908
+ (available_memory_bytes * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1),
909
+ DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES,
910
+ )
911
+ return run_options + [f"--shm-size='{shm_size}b'"]
912
+ except Exception as e:
913
+ logger.warning(f"Received error while trying to auto-compute SHM size {e}")
914
+ return run_options
915
+
916
+ def _get_docker_host_mount_location(self, cluster_name: str) -> str:
917
+ """Return the docker host mount directory location."""
918
+ # Imported here due to circular dependency in imports.
919
+ from ray.autoscaler.sdk import get_docker_host_mount_location
920
+
921
+ return get_docker_host_mount_location(cluster_name)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py ADDED
@@ -0,0 +1,1631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import datetime
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ import os
7
+ import random
8
+ import shutil
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ from types import ModuleType
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import click
18
+ import yaml
19
+
20
+ import ray
21
+ from ray._private.usage import usage_lib
22
+ from ray.autoscaler._private import subprocess_output_util as cmd_output_util
23
+ from ray.autoscaler._private.autoscaler import AutoscalerSummary
24
+ from ray.autoscaler._private.cli_logger import cf, cli_logger
25
+ from ray.autoscaler._private.cluster_dump import (
26
+ Archive,
27
+ GetParameters,
28
+ Node,
29
+ _info_from_params,
30
+ create_archive_for_local_and_remote_nodes,
31
+ create_archive_for_remote_nodes,
32
+ get_all_local_data,
33
+ )
34
+ from ray.autoscaler._private.command_runner import (
35
+ set_rsync_silent,
36
+ set_using_login_shells,
37
+ )
38
+ from ray.autoscaler._private.constants import (
39
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
40
+ MAX_PARALLEL_SHUTDOWN_WORKERS,
41
+ )
42
+ from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system
43
+ from ray.autoscaler._private.log_timer import LogTimer
44
+ from ray.autoscaler._private.node_provider_availability_tracker import (
45
+ NodeAvailabilitySummary,
46
+ )
47
+ from ray.autoscaler._private.providers import (
48
+ _NODE_PROVIDERS,
49
+ _PROVIDER_PRETTY_NAMES,
50
+ _get_node_provider,
51
+ )
52
+ from ray.autoscaler._private.updater import NodeUpdaterThread
53
+ from ray.autoscaler._private.util import (
54
+ LoadMetricsSummary,
55
+ format_info_string,
56
+ hash_launch_conf,
57
+ hash_runtime_conf,
58
+ prepare_config,
59
+ validate_config,
60
+ )
61
+ from ray.autoscaler.node_provider import NodeProvider
62
+ from ray.autoscaler.tags import (
63
+ NODE_KIND_HEAD,
64
+ NODE_KIND_WORKER,
65
+ STATUS_UNINITIALIZED,
66
+ STATUS_UP_TO_DATE,
67
+ TAG_RAY_LAUNCH_CONFIG,
68
+ TAG_RAY_NODE_KIND,
69
+ TAG_RAY_NODE_NAME,
70
+ TAG_RAY_NODE_STATUS,
71
+ TAG_RAY_USER_NODE_TYPE,
72
+ )
73
+ from ray.experimental.internal_kv import _internal_kv_put, internal_kv_get_gcs_client
74
+ from ray.util.debug import log_once
75
+
76
+ try: # py3
77
+ from shlex import quote
78
+ except ImportError: # py2
79
+ from pipes import quote
80
+
81
+
82
+ logger = logging.getLogger(__name__)
83
+
84
+ RUN_ENV_TYPES = ["auto", "host", "docker"]
85
+
86
+ POLL_INTERVAL = 5
87
+
88
+ Port_forward = Union[Tuple[int, int], List[Tuple[int, int]]]
89
+
90
+
91
+ def try_logging_config(config: Dict[str, Any]) -> None:
92
+ if config["provider"]["type"] == "aws":
93
+ from ray.autoscaler._private.aws.config import log_to_cli
94
+
95
+ log_to_cli(config)
96
+
97
+
98
+ def try_get_log_state(provider_config: Dict[str, Any]) -> Optional[dict]:
99
+ if provider_config["type"] == "aws":
100
+ from ray.autoscaler._private.aws.config import get_log_state
101
+
102
+ return get_log_state()
103
+ return None
104
+
105
+
106
+ def try_reload_log_state(provider_config: Dict[str, Any], log_state: dict) -> None:
107
+ if not log_state:
108
+ return
109
+ if provider_config["type"] == "aws":
110
+ from ray.autoscaler._private.aws.config import reload_log_state
111
+
112
+ return reload_log_state(log_state)
113
+
114
+
115
+ def debug_status(
116
+ status, error, verbose: bool = False, address: Optional[str] = None
117
+ ) -> str:
118
+ """
119
+ Return a debug string for the autoscaler.
120
+
121
+ Args:
122
+ status: The autoscaler status string for v1
123
+ error: The autoscaler error string for v1
124
+ verbose: Whether to print verbose information.
125
+ address: The address of the cluster (gcs address).
126
+
127
+ Returns:
128
+ str: A debug string for the cluster's status.
129
+ """
130
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
131
+
132
+ if is_autoscaler_v2():
133
+ from ray.autoscaler.v2.sdk import get_cluster_status
134
+ from ray.autoscaler.v2.utils import ClusterStatusFormatter
135
+
136
+ cluster_status = get_cluster_status(address)
137
+ status = ClusterStatusFormatter.format(cluster_status, verbose=verbose)
138
+ elif status:
139
+ status = status.decode("utf-8")
140
+ status_dict = json.loads(status)
141
+ lm_summary_dict = status_dict.get("load_metrics_report")
142
+ autoscaler_summary_dict = status_dict.get("autoscaler_report")
143
+ timestamp = status_dict.get("time")
144
+ gcs_request_time = status_dict.get("gcs_request_time")
145
+ non_terminated_nodes_time = status_dict.get("non_terminated_nodes_time")
146
+ if lm_summary_dict and autoscaler_summary_dict and timestamp:
147
+ lm_summary = LoadMetricsSummary(**lm_summary_dict)
148
+ node_availability_summary_dict = autoscaler_summary_dict.pop(
149
+ "node_availability_summary", {}
150
+ )
151
+ node_availability_summary = NodeAvailabilitySummary.from_fields(
152
+ **node_availability_summary_dict
153
+ )
154
+ autoscaler_summary = AutoscalerSummary(
155
+ node_availability_summary=node_availability_summary,
156
+ **autoscaler_summary_dict,
157
+ )
158
+ report_time = datetime.datetime.fromtimestamp(timestamp)
159
+ status = format_info_string(
160
+ lm_summary,
161
+ autoscaler_summary,
162
+ time=report_time,
163
+ gcs_request_time=gcs_request_time,
164
+ non_terminated_nodes_time=non_terminated_nodes_time,
165
+ verbose=verbose,
166
+ )
167
+ else:
168
+ status = (
169
+ "No cluster status. It may take a few seconds "
170
+ "for the Ray internal services to start up."
171
+ )
172
+ else:
173
+ status = (
174
+ "No cluster status. It may take a few seconds "
175
+ "for the Ray internal services to start up."
176
+ )
177
+
178
+ if error:
179
+ status += "\n"
180
+ status += error.decode("utf-8")
181
+
182
+ return status
183
+
184
+
185
+ def request_resources(
186
+ num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None
187
+ ) -> None:
188
+ """Remotely request some CPU or GPU resources from the autoscaler.
189
+
190
+ This function is to be called e.g. on a node before submitting a bunch of
191
+ ray.remote calls to ensure that resources rapidly become available.
192
+
193
+ Args:
194
+ num_cpus: Scale the cluster to ensure this number of CPUs are
195
+ available. This request is persistent until another call to
196
+ request_resources() is made.
197
+ bundles (List[ResourceDict]): Scale the cluster to ensure this set of
198
+ resource shapes can fit. This request is persistent until another
199
+ call to request_resources() is made.
200
+ """
201
+ if not ray.is_initialized():
202
+ raise RuntimeError("Ray is not initialized yet")
203
+ to_request = []
204
+ if num_cpus:
205
+ to_request += [{"CPU": 1}] * num_cpus
206
+ if bundles:
207
+ to_request += bundles
208
+ _internal_kv_put(
209
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL, json.dumps(to_request), overwrite=True
210
+ )
211
+
212
+ from ray.autoscaler.v2.utils import is_autoscaler_v2
213
+
214
+ if is_autoscaler_v2():
215
+ from ray.autoscaler.v2.sdk import request_cluster_resources
216
+
217
+ gcs_address = internal_kv_get_gcs_client().address
218
+ request_cluster_resources(gcs_address, to_request)
219
+
220
+
221
+ def create_or_update_cluster(
222
+ config_file: str,
223
+ override_min_workers: Optional[int],
224
+ override_max_workers: Optional[int],
225
+ no_restart: bool,
226
+ restart_only: bool,
227
+ yes: bool,
228
+ override_cluster_name: Optional[str] = None,
229
+ no_config_cache: bool = False,
230
+ redirect_command_output: Optional[bool] = False,
231
+ use_login_shells: bool = True,
232
+ no_monitor_on_head: bool = False,
233
+ ) -> Dict[str, Any]:
234
+ """Creates or updates an autoscaling Ray cluster from a config json."""
235
+ # no_monitor_on_head is an internal flag used by the Ray K8s operator.
236
+ # If True, prevents autoscaling config sync to the Ray head during cluster
237
+ # creation. See https://github.com/ray-project/ray/pull/13720.
238
+ set_using_login_shells(use_login_shells)
239
+ if not use_login_shells:
240
+ cmd_output_util.set_allow_interactive(False)
241
+ if redirect_command_output is None:
242
+ # Do not redirect by default.
243
+ cmd_output_util.set_output_redirected(False)
244
+ else:
245
+ cmd_output_util.set_output_redirected(redirect_command_output)
246
+
247
+ def handle_yaml_error(e):
248
+ cli_logger.error("Cluster config invalid")
249
+ cli_logger.newline()
250
+ cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file)
251
+ cli_logger.newline()
252
+ with cli_logger.verbatim_error_ctx("PyYAML error:"):
253
+ cli_logger.error(e)
254
+ cli_logger.abort()
255
+
256
+ try:
257
+ config = yaml.safe_load(open(config_file).read())
258
+ except FileNotFoundError:
259
+ cli_logger.abort(
260
+ "Provided cluster configuration file ({}) does not exist",
261
+ cf.bold(config_file),
262
+ )
263
+ except yaml.parser.ParserError as e:
264
+ handle_yaml_error(e)
265
+ raise
266
+ except yaml.scanner.ScannerError as e:
267
+ handle_yaml_error(e)
268
+ raise
269
+ global_event_system.execute_callback(
270
+ CreateClusterEvent.up_started, {"cluster_config": config}
271
+ )
272
+
273
+ # todo: validate file_mounts, ssh keys, etc.
274
+
275
+ importer = _NODE_PROVIDERS.get(config["provider"]["type"])
276
+ if not importer:
277
+ cli_logger.abort(
278
+ "Unknown provider type " + cf.bold("{}") + "\n"
279
+ "Available providers are: {}",
280
+ config["provider"]["type"],
281
+ cli_logger.render_list(
282
+ [k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None]
283
+ ),
284
+ )
285
+
286
+ printed_overrides = False
287
+
288
+ def handle_cli_override(key, override):
289
+ if override is not None:
290
+ if key in config:
291
+ nonlocal printed_overrides
292
+ printed_overrides = True
293
+ cli_logger.warning(
294
+ "`{}` override provided on the command line.\n"
295
+ " Using "
296
+ + cf.bold("{}")
297
+ + cf.dimmed(" [configuration file has " + cf.bold("{}") + "]"),
298
+ key,
299
+ override,
300
+ config[key],
301
+ )
302
+ config[key] = override
303
+
304
+ handle_cli_override("min_workers", override_min_workers)
305
+ handle_cli_override("max_workers", override_max_workers)
306
+ handle_cli_override("cluster_name", override_cluster_name)
307
+
308
+ if printed_overrides:
309
+ cli_logger.newline()
310
+
311
+ cli_logger.labeled_value("Cluster", config["cluster_name"])
312
+
313
+ cli_logger.newline()
314
+ config = _bootstrap_config(config, no_config_cache=no_config_cache)
315
+
316
+ try_logging_config(config)
317
+ get_or_create_head_node(
318
+ config,
319
+ config_file,
320
+ no_restart,
321
+ restart_only,
322
+ yes,
323
+ override_cluster_name,
324
+ no_monitor_on_head,
325
+ )
326
+ return config
327
+
328
+
329
+ CONFIG_CACHE_VERSION = 1
330
+
331
+
332
+ def _bootstrap_config(
333
+ config: Dict[str, Any], no_config_cache: bool = False
334
+ ) -> Dict[str, Any]:
335
+ config = prepare_config(config)
336
+ # NOTE: multi-node-type autoscaler is guaranteed to be in use after this.
337
+
338
+ hasher = hashlib.sha1()
339
+ hasher.update(json.dumps([config], sort_keys=True).encode("utf-8"))
340
+ cache_key = os.path.join(
341
+ tempfile.gettempdir(), "ray-config-{}".format(hasher.hexdigest())
342
+ )
343
+
344
+ if os.path.exists(cache_key) and not no_config_cache:
345
+ config_cache = json.loads(open(cache_key).read())
346
+ if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION:
347
+ # todo: is it fine to re-resolve? afaik it should be.
348
+ # we can have migrations otherwise or something
349
+ # but this seems overcomplicated given that resolving is
350
+ # relatively cheap
351
+ try_reload_log_state(
352
+ config_cache["config"]["provider"],
353
+ config_cache.get("provider_log_info"),
354
+ )
355
+
356
+ if log_once("_printed_cached_config_warning"):
357
+ cli_logger.verbose_warning(
358
+ "Loaded cached provider configuration from " + cf.bold("{}"),
359
+ cache_key,
360
+ )
361
+ if cli_logger.verbosity == 0:
362
+ cli_logger.warning("Loaded cached provider configuration")
363
+ cli_logger.warning(
364
+ "If you experience issues with "
365
+ "the cloud provider, try re-running "
366
+ "the command with {}.",
367
+ cf.bold("--no-config-cache"),
368
+ )
369
+
370
+ return config_cache["config"]
371
+ else:
372
+ cli_logger.warning(
373
+ "Found cached cluster config "
374
+ "but the version " + cf.bold("{}") + " "
375
+ "(expected " + cf.bold("{}") + ") does not match.\n"
376
+ "This is normal if cluster launcher was updated.\n"
377
+ "Config will be re-resolved.",
378
+ config_cache.get("_version", "none"),
379
+ CONFIG_CACHE_VERSION,
380
+ )
381
+
382
+ importer = _NODE_PROVIDERS.get(config["provider"]["type"])
383
+ if not importer:
384
+ raise NotImplementedError("Unsupported provider {}".format(config["provider"]))
385
+
386
+ provider_cls = importer(config["provider"])
387
+
388
+ cli_logger.print(
389
+ "Checking {} environment settings",
390
+ _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]),
391
+ )
392
+ try:
393
+ config = provider_cls.fillout_available_node_types_resources(config)
394
+ except Exception as exc:
395
+ if cli_logger.verbosity > 2:
396
+ logger.exception("Failed to autodetect node resources.")
397
+ else:
398
+ cli_logger.warning(
399
+ f"Failed to autodetect node resources: {str(exc)}. "
400
+ "You can see full stack trace with higher verbosity."
401
+ )
402
+
403
+ try:
404
+ # NOTE: if `resources` field is missing, validate_config for providers
405
+ # other than AWS and Kubernetes will fail (the schema error will ask
406
+ # the user to manually fill the resources) as we currently support
407
+ # autofilling resources for AWS and Kubernetes only.
408
+ validate_config(config)
409
+ except (ModuleNotFoundError, ImportError):
410
+ cli_logger.abort(
411
+ "Not all Ray autoscaler dependencies were found. "
412
+ "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
413
+ 'only be usable via `pip install "ray[default]"`. Please '
414
+ "update your install command."
415
+ )
416
+ resolved_config = provider_cls.bootstrap_config(config)
417
+
418
+ if not no_config_cache:
419
+ with open(cache_key, "w") as f:
420
+ config_cache = {
421
+ "_version": CONFIG_CACHE_VERSION,
422
+ "provider_log_info": try_get_log_state(resolved_config["provider"]),
423
+ "config": resolved_config,
424
+ }
425
+ f.write(json.dumps(config_cache))
426
+ return resolved_config
427
+
428
+
429
+ def teardown_cluster(
430
+ config_file: str,
431
+ yes: bool,
432
+ workers_only: bool,
433
+ override_cluster_name: Optional[str],
434
+ keep_min_workers: bool,
435
+ ) -> None:
436
+ """Destroys all nodes of a Ray cluster described by a config json."""
437
+ config = yaml.safe_load(open(config_file).read())
438
+ if override_cluster_name is not None:
439
+ config["cluster_name"] = override_cluster_name
440
+
441
+ config = _bootstrap_config(config)
442
+
443
+ cli_logger.confirm(yes, "Destroying cluster.", _abort=True)
444
+
445
+ if not workers_only:
446
+ try:
447
+ exec_cluster(
448
+ config_file,
449
+ cmd="ray stop",
450
+ run_env="auto",
451
+ screen=False,
452
+ tmux=False,
453
+ stop=False,
454
+ start=False,
455
+ override_cluster_name=override_cluster_name,
456
+ port_forward=None,
457
+ with_output=False,
458
+ )
459
+ except Exception as e:
460
+ # todo: add better exception info
461
+ cli_logger.verbose_error("{}", str(e))
462
+ cli_logger.warning(
463
+ "Exception occurred when stopping the cluster Ray runtime "
464
+ "(use -v to dump teardown exceptions)."
465
+ )
466
+ cli_logger.warning(
467
+ "Ignoring the exception and "
468
+ "attempting to shut down the cluster nodes anyway."
469
+ )
470
+
471
+ provider = _get_node_provider(config["provider"], config["cluster_name"])
472
+
473
+ def remaining_nodes():
474
+ workers = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
475
+
476
+ if keep_min_workers:
477
+ min_workers = config.get("min_workers", 0)
478
+ cli_logger.print(
479
+ "{} random worker nodes will not be shut down. "
480
+ + cf.dimmed("(due to {})"),
481
+ cf.bold(min_workers),
482
+ cf.bold("--keep-min-workers"),
483
+ )
484
+
485
+ workers = random.sample(workers, len(workers) - min_workers)
486
+
487
+ # todo: it's weird to kill the head node but not all workers
488
+ if workers_only:
489
+ cli_logger.print(
490
+ "The head node will not be shut down. " + cf.dimmed("(due to {})"),
491
+ cf.bold("--workers-only"),
492
+ )
493
+
494
+ return workers
495
+
496
+ head = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD})
497
+
498
+ return head + workers
499
+
500
+ def run_docker_stop(node, container_name):
501
+ try:
502
+ updater = NodeUpdaterThread(
503
+ node_id=node,
504
+ provider_config=config["provider"],
505
+ provider=provider,
506
+ auth_config=config["auth"],
507
+ cluster_name=config["cluster_name"],
508
+ file_mounts=config["file_mounts"],
509
+ initialization_commands=[],
510
+ setup_commands=[],
511
+ ray_start_commands=[],
512
+ runtime_hash="",
513
+ file_mounts_contents_hash="",
514
+ is_head_node=False,
515
+ docker_config=config.get("docker"),
516
+ )
517
+
518
+ _exec(
519
+ updater,
520
+ f"docker stop {container_name}",
521
+ with_output=False,
522
+ run_env="host",
523
+ )
524
+ except Exception:
525
+ cli_logger.warning(f"Docker stop failed on {node}")
526
+
527
+ # Loop here to check that both the head and worker nodes are actually
528
+ # really gone
529
+ A = remaining_nodes()
530
+
531
+ container_name = config.get("docker", {}).get("container_name")
532
+ if container_name:
533
+ # This is to ensure that the parallel SSH calls below do not mess with
534
+ # the users terminal.
535
+ output_redir = cmd_output_util.is_output_redirected()
536
+ cmd_output_util.set_output_redirected(True)
537
+ allow_interactive = cmd_output_util.does_allow_interactive()
538
+ cmd_output_util.set_allow_interactive(False)
539
+
540
+ with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor:
541
+ for node in A:
542
+ executor.submit(
543
+ run_docker_stop, node=node, container_name=container_name
544
+ )
545
+ cmd_output_util.set_output_redirected(output_redir)
546
+ cmd_output_util.set_allow_interactive(allow_interactive)
547
+ with LogTimer("teardown_cluster: done."):
548
+ while A:
549
+ provider.terminate_nodes(A)
550
+
551
+ cli_logger.print(
552
+ "Requested {} nodes to shut down.",
553
+ cf.bold(len(A)),
554
+ _tags=dict(interval="1s"),
555
+ )
556
+
557
+ time.sleep(POLL_INTERVAL) # todo: interval should be a variable
558
+ A = remaining_nodes()
559
+ cli_logger.print(
560
+ "{} nodes remaining after {} second(s).", cf.bold(len(A)), POLL_INTERVAL
561
+ )
562
+ cli_logger.success("No nodes remaining.")
563
+
564
+
565
+ def kill_node(
566
+ config_file: str, yes: bool, hard: bool, override_cluster_name: Optional[str]
567
+ ) -> Optional[str]:
568
+ """Kills a random Raylet worker."""
569
+
570
+ config = yaml.safe_load(open(config_file).read())
571
+ if override_cluster_name is not None:
572
+ config["cluster_name"] = override_cluster_name
573
+ config = _bootstrap_config(config)
574
+
575
+ cli_logger.confirm(yes, "A random node will be killed.")
576
+
577
+ provider = _get_node_provider(config["provider"], config["cluster_name"])
578
+ nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
579
+ if not nodes:
580
+ cli_logger.print("No worker nodes detected.")
581
+ return None
582
+ node = random.choice(nodes)
583
+ cli_logger.print("Shutdown " + cf.bold("{}"), node)
584
+ if hard:
585
+ provider.terminate_node(node)
586
+ else:
587
+ updater = NodeUpdaterThread(
588
+ node_id=node,
589
+ provider_config=config["provider"],
590
+ provider=provider,
591
+ auth_config=config["auth"],
592
+ cluster_name=config["cluster_name"],
593
+ file_mounts=config["file_mounts"],
594
+ initialization_commands=[],
595
+ setup_commands=[],
596
+ ray_start_commands=[],
597
+ runtime_hash="",
598
+ file_mounts_contents_hash="",
599
+ is_head_node=False,
600
+ docker_config=config.get("docker"),
601
+ )
602
+
603
+ _exec(updater, "ray stop", False, False)
604
+
605
+ time.sleep(POLL_INTERVAL)
606
+
607
+ if config.get("provider", {}).get("use_internal_ips", False):
608
+ node_ip = provider.internal_ip(node)
609
+ else:
610
+ node_ip = provider.external_ip(node)
611
+
612
+ return node_ip
613
+
614
+
615
+ def monitor_cluster(
616
+ cluster_config_file: str, num_lines: int, override_cluster_name: Optional[str]
617
+ ) -> None:
618
+ """Tails the autoscaler logs of a Ray cluster."""
619
+ cmd = f"tail -n {num_lines} -f /tmp/ray/session_latest/logs/monitor*"
620
+ exec_cluster(
621
+ cluster_config_file,
622
+ cmd=cmd,
623
+ run_env="auto",
624
+ screen=False,
625
+ tmux=False,
626
+ stop=False,
627
+ start=False,
628
+ override_cluster_name=override_cluster_name,
629
+ port_forward=None,
630
+ )
631
+
632
+
633
+ def warn_about_bad_start_command(
634
+ start_commands: List[str], no_monitor_on_head: bool = False
635
+ ) -> None:
636
+ ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
637
+ if len(ray_start_cmd) == 0:
638
+ cli_logger.warning(
639
+ "Ray runtime will not be started because `{}` is not in `{}`.",
640
+ cf.bold("ray start"),
641
+ cf.bold("head_start_ray_commands"),
642
+ )
643
+
644
+ autoscaling_config_in_ray_start_cmd = any(
645
+ "autoscaling-config" in x for x in ray_start_cmd
646
+ )
647
+ if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
648
+ cli_logger.warning(
649
+ "The head node will not launch any workers because "
650
+ "`{}` does not have `{}` set.\n"
651
+ "Potential fix: add `{}` to the `{}` command under `{}`.",
652
+ cf.bold("ray start"),
653
+ cf.bold("--autoscaling-config"),
654
+ cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
655
+ cf.bold("ray start"),
656
+ cf.bold("head_start_ray_commands"),
657
+ )
658
+
659
+
660
+ def get_or_create_head_node(
661
+ config: Dict[str, Any],
662
+ printable_config_file: str,
663
+ no_restart: bool,
664
+ restart_only: bool,
665
+ yes: bool,
666
+ override_cluster_name: Optional[str],
667
+ no_monitor_on_head: bool = False,
668
+ _provider: Optional[NodeProvider] = None,
669
+ _runner: ModuleType = subprocess,
670
+ ) -> None:
671
+ """Create the cluster head node, which in turn creates the workers."""
672
+ global_event_system.execute_callback(CreateClusterEvent.cluster_booting_started)
673
+ provider = _provider or _get_node_provider(
674
+ config["provider"], config["cluster_name"]
675
+ )
676
+
677
+ config = copy.deepcopy(config)
678
+ head_node_tags = {
679
+ TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
680
+ }
681
+ nodes = provider.non_terminated_nodes(head_node_tags)
682
+ if len(nodes) > 0:
683
+ head_node = nodes[0]
684
+ else:
685
+ head_node = None
686
+
687
+ if not head_node:
688
+ cli_logger.confirm(
689
+ yes, "No head node found. Launching a new cluster.", _abort=True
690
+ )
691
+ cli_logger.newline()
692
+ usage_lib.show_usage_stats_prompt(cli=True)
693
+
694
+ if head_node:
695
+ if restart_only:
696
+ cli_logger.confirm(
697
+ yes,
698
+ "Updating cluster configuration and "
699
+ "restarting the cluster Ray runtime. "
700
+ "Setup commands will not be run due to `{}`.\n",
701
+ cf.bold("--restart-only"),
702
+ _abort=True,
703
+ )
704
+ cli_logger.newline()
705
+ usage_lib.show_usage_stats_prompt(cli=True)
706
+ elif no_restart:
707
+ cli_logger.print(
708
+ "Cluster Ray runtime will not be restarted due to `{}`.",
709
+ cf.bold("--no-restart"),
710
+ )
711
+ cli_logger.confirm(
712
+ yes,
713
+ "Updating cluster configuration and running setup commands.",
714
+ _abort=True,
715
+ )
716
+ else:
717
+ cli_logger.print("Updating cluster configuration and running full setup.")
718
+ cli_logger.confirm(
719
+ yes, cf.bold("Cluster Ray runtime will be restarted."), _abort=True
720
+ )
721
+ cli_logger.newline()
722
+ usage_lib.show_usage_stats_prompt(cli=True)
723
+
724
+ cli_logger.newline()
725
+ # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync)
726
+ head_node_config = copy.deepcopy(config.get("head_node", {}))
727
+ # The above `head_node` field is deprecated in favor of per-node-type
728
+ # node_configs. We allow it for backwards-compatibility.
729
+ head_node_resources = None
730
+ head_node_labels = None
731
+ head_node_type = config.get("head_node_type")
732
+ if head_node_type:
733
+ head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type
734
+ head_config = config["available_node_types"][head_node_type]
735
+ head_node_config.update(head_config["node_config"])
736
+
737
+ # Not necessary to keep in sync with node_launcher.py
738
+ # Keep in sync with autoscaler.py _node_resources
739
+ head_node_resources = head_config.get("resources")
740
+ head_node_labels = head_config.get("labels")
741
+
742
+ launch_hash = hash_launch_conf(head_node_config, config["auth"])
743
+ creating_new_head = _should_create_new_head(
744
+ head_node, launch_hash, head_node_type, provider
745
+ )
746
+ if creating_new_head:
747
+ with cli_logger.group("Acquiring an up-to-date head node"):
748
+ global_event_system.execute_callback(
749
+ CreateClusterEvent.acquiring_new_head_node
750
+ )
751
+ if head_node is not None:
752
+ cli_logger.confirm(yes, "Relaunching the head node.", _abort=True)
753
+
754
+ provider.terminate_node(head_node)
755
+ cli_logger.print("Terminated head node {}", head_node)
756
+
757
+ head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash
758
+ head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format(
759
+ config["cluster_name"]
760
+ )
761
+ head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED
762
+ provider.create_node(head_node_config, head_node_tags, 1)
763
+ cli_logger.print("Launched a new head node")
764
+
765
+ start = time.time()
766
+ head_node = None
767
+ with cli_logger.group("Fetching the new head node"):
768
+ while True:
769
+ if time.time() - start > 50:
770
+ cli_logger.abort(
771
+ "Head node fetch timed out. Failed to create head node."
772
+ )
773
+ nodes = provider.non_terminated_nodes(head_node_tags)
774
+ if len(nodes) == 1:
775
+ head_node = nodes[0]
776
+ break
777
+ time.sleep(POLL_INTERVAL)
778
+ cli_logger.newline()
779
+
780
+ global_event_system.execute_callback(CreateClusterEvent.head_node_acquired)
781
+
782
+ with cli_logger.group(
783
+ "Setting up head node",
784
+ _numbered=("<>", 1, 1),
785
+ # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]),
786
+ _tags=dict(),
787
+ ): # add id, ARN to tags?
788
+ # TODO(ekl) right now we always update the head node even if the
789
+ # hash matches.
790
+ # We could prompt the user for what they want to do here.
791
+ # No need to pass in cluster_sync_files because we use this
792
+ # hash to set up the head node
793
+ (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf(
794
+ config["file_mounts"], None, config
795
+ )
796
+
797
+ if not no_monitor_on_head:
798
+ # Return remote_config_file to avoid prematurely closing it.
799
+ config, remote_config_file = _set_up_config_for_head_node(
800
+ config, provider, no_restart
801
+ )
802
+ cli_logger.print("Prepared bootstrap config")
803
+
804
+ if restart_only:
805
+ # Docker may re-launch nodes, requiring setup
806
+ # commands to be rerun.
807
+ if config.get("docker", {}).get("container_name"):
808
+ setup_commands = config["head_setup_commands"]
809
+ else:
810
+ setup_commands = []
811
+ ray_start_commands = config["head_start_ray_commands"]
812
+ # If user passed in --no-restart and we're not creating a new head,
813
+ # omit start commands.
814
+ elif no_restart and not creating_new_head:
815
+ setup_commands = config["head_setup_commands"]
816
+ ray_start_commands = []
817
+ else:
818
+ setup_commands = config["head_setup_commands"]
819
+ ray_start_commands = config["head_start_ray_commands"]
820
+
821
+ if not no_restart:
822
+ warn_about_bad_start_command(ray_start_commands, no_monitor_on_head)
823
+
824
+ updater = NodeUpdaterThread(
825
+ node_id=head_node,
826
+ provider_config=config["provider"],
827
+ provider=provider,
828
+ auth_config=config["auth"],
829
+ cluster_name=config["cluster_name"],
830
+ file_mounts=config["file_mounts"],
831
+ initialization_commands=config["initialization_commands"],
832
+ setup_commands=setup_commands,
833
+ ray_start_commands=ray_start_commands,
834
+ process_runner=_runner,
835
+ runtime_hash=runtime_hash,
836
+ file_mounts_contents_hash=file_mounts_contents_hash,
837
+ is_head_node=True,
838
+ node_resources=head_node_resources,
839
+ node_labels=head_node_labels,
840
+ rsync_options={
841
+ "rsync_exclude": config.get("rsync_exclude"),
842
+ "rsync_filter": config.get("rsync_filter"),
843
+ },
844
+ docker_config=config.get("docker"),
845
+ restart_only=restart_only,
846
+ )
847
+ updater.start()
848
+ updater.join()
849
+
850
+ # Refresh the node cache so we see the external ip if available
851
+ provider.non_terminated_nodes(head_node_tags)
852
+
853
+ if updater.exitcode != 0:
854
+ # todo: this does not follow the mockup and is not good enough
855
+ cli_logger.abort("Failed to setup head node.")
856
+ sys.exit(1)
857
+
858
+ global_event_system.execute_callback(
859
+ CreateClusterEvent.cluster_booting_completed,
860
+ {
861
+ "head_node_id": head_node,
862
+ },
863
+ )
864
+
865
+ monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*"
866
+ if override_cluster_name:
867
+ modifiers = " --cluster-name={}".format(quote(override_cluster_name))
868
+ else:
869
+ modifiers = ""
870
+
871
+ cli_logger.newline()
872
+ with cli_logger.group("Useful commands:"):
873
+ printable_config_file = os.path.abspath(printable_config_file)
874
+
875
+ cli_logger.print("To terminate the cluster:")
876
+ cli_logger.print(cf.bold(f" ray down {printable_config_file}{modifiers}"))
877
+ cli_logger.newline()
878
+
879
+ cli_logger.print("To retrieve the IP address of the cluster head:")
880
+ cli_logger.print(
881
+ cf.bold(f" ray get-head-ip {printable_config_file}{modifiers}")
882
+ )
883
+ cli_logger.newline()
884
+
885
+ cli_logger.print(
886
+ "To port-forward the cluster's Ray Dashboard to the local machine:"
887
+ )
888
+ cli_logger.print(cf.bold(f" ray dashboard {printable_config_file}{modifiers}"))
889
+ cli_logger.newline()
890
+
891
+ cli_logger.print(
892
+ "To submit a job to the cluster, port-forward the "
893
+ "Ray Dashboard in another terminal and run:"
894
+ )
895
+ cli_logger.print(
896
+ cf.bold(
897
+ " ray job submit --address http://localhost:<dashboard-port> "
898
+ "--working-dir . -- python my_script.py"
899
+ )
900
+ )
901
+ cli_logger.newline()
902
+
903
+ cli_logger.print("To connect to a terminal on the cluster head for debugging:")
904
+ cli_logger.print(cf.bold(f" ray attach {printable_config_file}{modifiers}"))
905
+ cli_logger.newline()
906
+
907
+ cli_logger.print("To monitor autoscaling:")
908
+ cli_logger.print(
909
+ cf.bold(
910
+ f" ray exec {printable_config_file}{modifiers} {quote(monitor_str)}"
911
+ )
912
+ )
913
+ cli_logger.newline()
914
+
915
+
916
+ def _should_create_new_head(
917
+ head_node_id: Optional[str],
918
+ new_launch_hash: str,
919
+ new_head_node_type: str,
920
+ provider: NodeProvider,
921
+ ) -> bool:
922
+ """Decides whether a new head node needs to be created.
923
+
924
+ We need a new head if at least one of the following holds:
925
+ (a) There isn't an existing head node
926
+ (b) The user-submitted head node_config differs from the existing head
927
+ node's node_config.
928
+ (c) The user-submitted head node_type key differs from the existing head
929
+ node's node_type.
930
+
931
+ Args:
932
+ head_node_id (Optional[str]): head node id if a head exists, else None
933
+ new_launch_hash: hash of current user-submitted head config
934
+ new_head_node_type: current user-submitted head node-type key
935
+
936
+ Returns:
937
+ bool: True if a new Ray head node should be launched, False otherwise
938
+ """
939
+ if not head_node_id:
940
+ # No head node exists, need to create it.
941
+ return True
942
+
943
+ # Pull existing head's data.
944
+ head_tags = provider.node_tags(head_node_id)
945
+ current_launch_hash = head_tags.get(TAG_RAY_LAUNCH_CONFIG)
946
+ current_head_type = head_tags.get(TAG_RAY_USER_NODE_TYPE)
947
+
948
+ # Compare to current head
949
+ hashes_mismatch = new_launch_hash != current_launch_hash
950
+ types_mismatch = new_head_node_type != current_head_type
951
+
952
+ new_head_required = hashes_mismatch or types_mismatch
953
+
954
+ # Warn user
955
+ if new_head_required:
956
+ with cli_logger.group(
957
+ "Currently running head node is out-of-date with cluster configuration"
958
+ ):
959
+ if hashes_mismatch:
960
+ cli_logger.print(
961
+ "Current hash is {}, expected {}",
962
+ cf.bold(current_launch_hash),
963
+ cf.bold(new_launch_hash),
964
+ )
965
+
966
+ if types_mismatch:
967
+ cli_logger.print(
968
+ "Current head node type is {}, expected {}",
969
+ cf.bold(current_head_type),
970
+ cf.bold(new_head_node_type),
971
+ )
972
+
973
+ return new_head_required
974
+
975
+
976
+ def _set_up_config_for_head_node(
977
+ config: Dict[str, Any], provider: NodeProvider, no_restart: bool
978
+ ) -> Tuple[Dict[str, Any], Any]:
979
+ """Prepares autoscaling config and, if needed, ssh key, to be mounted onto
980
+ the Ray head node for use by the autoscaler.
981
+
982
+ Returns the modified config and the temporary config file that will be
983
+ mounted onto the head node.
984
+ """
985
+ # Rewrite the auth config so that the head
986
+ # node can update the workers
987
+ remote_config = copy.deepcopy(config)
988
+
989
+ # drop proxy options if they exist, otherwise
990
+ # head node won't be able to connect to workers
991
+ remote_config["auth"].pop("ssh_proxy_command", None)
992
+
993
+ # Drop the head_node field if it was introduced. It is technically not a
994
+ # valid field in the config, but it may have been introduced after
995
+ # validation (see _bootstrap_config() call to
996
+ # provider_cls.bootstrap_config(config)). The head node will never try to
997
+ # launch a head node so it doesn't need these defaults.
998
+ remote_config.pop("head_node", None)
999
+
1000
+ if "ssh_private_key" in config["auth"]:
1001
+ remote_key_path = "~/ray_bootstrap_key.pem"
1002
+ remote_config["auth"]["ssh_private_key"] = remote_key_path
1003
+
1004
+ # Adjust for new file locations
1005
+ new_mounts = {}
1006
+ for remote_path in config["file_mounts"]:
1007
+ new_mounts[remote_path] = remote_path
1008
+ remote_config["file_mounts"] = new_mounts
1009
+ remote_config["no_restart"] = no_restart
1010
+
1011
+ remote_config = provider.prepare_for_head_node(remote_config)
1012
+
1013
+ # Now inject the rewritten config and SSH key into the head node
1014
+ remote_config_file = tempfile.NamedTemporaryFile("w", prefix="ray-bootstrap-")
1015
+ remote_config_file.write(json.dumps(remote_config))
1016
+ remote_config_file.flush()
1017
+ config["file_mounts"].update(
1018
+ {"~/ray_bootstrap_config.yaml": remote_config_file.name}
1019
+ )
1020
+
1021
+ if "ssh_private_key" in config["auth"]:
1022
+ config["file_mounts"].update(
1023
+ {
1024
+ remote_key_path: config["auth"]["ssh_private_key"],
1025
+ }
1026
+ )
1027
+
1028
+ return config, remote_config_file
1029
+
1030
+
1031
def attach_cluster(
    config_file: str,
    start: bool,
    use_screen: bool,
    use_tmux: bool,
    override_cluster_name: Optional[str],
    no_config_cache: bool = False,
    new: bool = False,
    port_forward: Optional[Port_forward] = None,
) -> None:
    """Attaches to a screen for the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        start: whether to start the cluster if it isn't up
        use_screen: whether to use screen as multiplexer
        use_tmux: whether to use tmux as multiplexer
        override_cluster_name: set the name of the cluster
        new: whether to force a new screen
        port_forward ( (int,int) or list[(int,int)] ): port(s) to forward
    """
    # Pick the shell command that realizes the requested multiplexer mode.
    if use_tmux:
        cmd = "tmux new" if new else "tmux attach || tmux new"
    elif use_screen:
        cmd = "screen -L" if new else "screen -L -xRR"
    elif new:
        raise ValueError("--new only makes sense if passing --screen or --tmux")
    else:
        cmd = "$SHELL"

    # Run the chosen command interactively on the head node. An
    # uninitialized head is permitted so a broken cluster can be debugged.
    exec_cluster(
        config_file,
        cmd=cmd,
        run_env="auto",
        screen=False,
        tmux=False,
        stop=False,
        start=start,
        override_cluster_name=override_cluster_name,
        no_config_cache=no_config_cache,
        port_forward=port_forward,
        _allow_uninitialized_state=True,
    )
1081
+
1082
+
1083
def exec_cluster(
    config_file: str,
    *,
    cmd: Optional[str] = None,
    run_env: str = "auto",
    screen: bool = False,
    tmux: bool = False,
    stop: bool = False,
    start: bool = False,
    override_cluster_name: Optional[str] = None,
    no_config_cache: bool = False,
    port_forward: Optional[Port_forward] = None,
    with_output: bool = False,
    _allow_uninitialized_state: bool = False,
    extra_screen_args: Optional[str] = None,
) -> str:
    """Runs a command on the specified cluster.

    Arguments:
        config_file: path to the cluster yaml
        cmd: command to run
        run_env: whether to run the command on the host or in a container.
            Select between "auto", "host" and "docker"
        screen: whether to run in a screen
        extra_screen_args: optional custom additional args to screen command
        tmux: whether to run in a tmux session
        stop: whether to stop the cluster after command run
        start: whether to start the cluster if it isn't up
        override_cluster_name: set the name of the cluster
        port_forward ( (int, int) or list[(int, int)] ): port(s) to forward
        _allow_uninitialized_state: whether to execute on an uninitialized head
            node.

    Returns:
        The result of the command execution (output when ``with_output``
        is requested).
    """
    assert not (screen and tmux), "Can specify only one of `screen` or `tmux`."
    assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(RUN_ENV_TYPES)
    # TODO(rliaw): We default this to True to maintain backwards-compat.
    # In the future we would want to support disabling login-shells
    # and interactivity.
    cmd_output_util.set_allow_interactive(True)

    # NOTE(review): open(...).read() leaves the file handle to be closed by
    # garbage collection rather than a context manager.
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # Find (or, when `start` is set, create) the head node to execute on.
    head_node = _get_running_head_node(
        config,
        config_file,
        override_cluster_name,
        create_if_needed=start,
        _allow_uninitialized_state=_allow_uninitialized_state,
    )

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    # The updater is used only as a handle to the node's command runner;
    # no setup/start commands are executed here (all command lists empty).
    updater = NodeUpdaterThread(
        node_id=head_node,
        provider_config=config["provider"],
        provider=provider,
        auth_config=config["auth"],
        cluster_name=config["cluster_name"],
        file_mounts=config["file_mounts"],
        initialization_commands=[],
        setup_commands=[],
        ray_start_commands=[],
        runtime_hash="",
        file_mounts_contents_hash="",
        is_head_node=True,
        rsync_options={
            "rsync_exclude": config.get("rsync_exclude"),
            "rsync_filter": config.get("rsync_filter"),
        },
        docker_config=config.get("docker"),
    )
    if cmd and stop:
        # Append teardown steps: stop Ray, tear down the workers, then shut
        # the head machine itself down.
        cmd = "; ".join(
            [
                cmd,
                "ray stop",
                "ray teardown ~/ray_bootstrap_config.yaml --yes --workers-only",
                "sudo shutdown -h now",
            ]
        )

    result = _exec(
        updater,
        cmd,
        screen,
        tmux,
        port_forward=port_forward,
        with_output=with_output,
        run_env=run_env,
        shutdown_after_run=False,
        extra_screen_args=extra_screen_args,
    )
    if tmux or screen:
        # The command was detached into a multiplexer; print a hint for
        # reattaching to check on it.
        attach_command_parts = ["ray attach", config_file]
        if override_cluster_name is not None:
            attach_command_parts.append(
                "--cluster-name={}".format(override_cluster_name)
            )
        if tmux:
            attach_command_parts.append("--tmux")
        elif screen:
            attach_command_parts.append("--screen")

        attach_command = " ".join(attach_command_parts)
        cli_logger.print("Run `{}` to check command status.", cf.bold(attach_command))
    return result
1191
+
1192
+
1193
+ def _exec(
1194
+ updater: NodeUpdaterThread,
1195
+ cmd: Optional[str] = None,
1196
+ screen: bool = False,
1197
+ tmux: bool = False,
1198
+ port_forward: Optional[Port_forward] = None,
1199
+ with_output: bool = False,
1200
+ run_env: str = "auto",
1201
+ shutdown_after_run: bool = False,
1202
+ extra_screen_args: Optional[str] = None,
1203
+ ) -> str:
1204
+ if cmd:
1205
+ if screen:
1206
+ wrapped_cmd = [
1207
+ "screen",
1208
+ "-L",
1209
+ "-dm",
1210
+ ]
1211
+
1212
+ if extra_screen_args is not None and len(extra_screen_args) > 0:
1213
+ wrapped_cmd += [extra_screen_args]
1214
+
1215
+ wrapped_cmd += [
1216
+ "bash",
1217
+ "-c",
1218
+ quote(cmd + "; exec bash"),
1219
+ ]
1220
+ cmd = " ".join(wrapped_cmd)
1221
+ elif tmux:
1222
+ # TODO: Consider providing named session functionality
1223
+ wrapped_cmd = [
1224
+ "tmux",
1225
+ "new",
1226
+ "-d",
1227
+ "bash",
1228
+ "-c",
1229
+ quote(cmd + "; exec bash"),
1230
+ ]
1231
+ cmd = " ".join(wrapped_cmd)
1232
+ return updater.cmd_runner.run(
1233
+ cmd,
1234
+ exit_on_fail=True,
1235
+ port_forward=port_forward,
1236
+ with_output=with_output,
1237
+ run_env=run_env,
1238
+ shutdown_after_run=shutdown_after_run,
1239
+ )
1240
+
1241
+
1242
def rsync(
    config_file: str,
    source: Optional[str],
    target: Optional[str],
    override_cluster_name: Optional[str],
    down: bool,
    ip_address: Optional[str] = None,
    use_internal_ip: bool = False,
    no_config_cache: bool = False,
    all_nodes: bool = False,
    should_bootstrap: bool = True,
    _runner: ModuleType = subprocess,
) -> None:
    """Rsyncs files.

    Arguments:
        config_file: path to the cluster yaml
        source: source dir
        target: target dir
        override_cluster_name: set the name of the cluster
        down: whether we're syncing remote -> local
        ip_address: Address of node. Raise Exception
            if both ip_address and 'all_nodes' are provided.
        use_internal_ip: Whether the provided ip_address is
            public or private.
        all_nodes: whether to sync worker nodes in addition to the head node
        should_bootstrap: whether to bootstrap cluster config before syncing
        _runner: process-runner module used by the updater (for testing).
    """
    # source and target must be given together or not at all; when neither
    # is given, the configured file_mounts are synced instead.
    if bool(source) != bool(target):
        cli_logger.abort("Expected either both a source and a target, or neither.")

    assert bool(source) == bool(
        target
    ), "Must either provide both or neither source and target."

    if ip_address and all_nodes:
        cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.")

    # NOTE(review): open(...).read() relies on GC to close the handle.
    config = yaml.safe_load(open(config_file).read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name
    if should_bootstrap:
        config = _bootstrap_config(config, no_config_cache=no_config_cache)

    # A sync whose remote-side path lies under a configured file mount is
    # treated specially by the updater (docker bind-mount handling).
    is_file_mount = False
    if source and target:
        for remote_mount in config.get("file_mounts", {}).keys():
            if (source if down else target).startswith(remote_mount):
                is_file_mount = True
                break

    provider = _get_node_provider(config["provider"], config["cluster_name"])

    def rsync_to_node(node_id, is_head_node):
        # Build an updater purely as a handle to rsync_up/rsync_down;
        # no setup or start commands run here (all command lists empty).
        updater = NodeUpdaterThread(
            node_id=node_id,
            provider_config=config["provider"],
            provider=provider,
            auth_config=config["auth"],
            cluster_name=config["cluster_name"],
            file_mounts=config["file_mounts"],
            initialization_commands=[],
            setup_commands=[],
            ray_start_commands=[],
            runtime_hash="",
            use_internal_ip=use_internal_ip,
            process_runner=_runner,
            file_mounts_contents_hash="",
            is_head_node=is_head_node,
            rsync_options={
                "rsync_exclude": config.get("rsync_exclude"),
                "rsync_filter": config.get("rsync_filter"),
            },
            docker_config=config.get("docker"),
        )
        # Direction selects which updater method performs the transfer.
        if down:
            rsync = updater.rsync_down
        else:
            rsync = updater.rsync_up

        if source and target:
            # print rsync progress for single file rsync
            if cli_logger.verbosity > 0:
                cmd_output_util.set_output_redirected(False)
                set_rsync_silent(False)
            rsync(source, target, is_file_mount)
        else:
            # No explicit paths: sync all configured file mounts.
            updater.sync_file_mounts(rsync)

    nodes = []
    head_node = _get_running_head_node(
        config, config_file, override_cluster_name, create_if_needed=False
    )
    if ip_address:
        # Explicit node given: sync only that node.
        nodes = [provider.get_node_id(ip_address, use_internal_ip=use_internal_ip)]
    else:
        nodes = [head_node]
        if all_nodes:
            nodes.extend(_get_worker_nodes(config, override_cluster_name))

    for node_id in nodes:
        rsync_to_node(node_id, is_head_node=(node_id == head_node))
1344
+
1345
+
1346
def get_head_node_ip(
    config_file: str, override_cluster_name: Optional[str] = None
) -> str:
    """Returns head node IP for given configuration file if exists.

    Args:
        config_file: Path to the cluster yaml.
        override_cluster_name: If given, overrides the cluster name from
            the config file.

    Returns:
        The head node's internal IP when the provider uses internal IPs
        (and ``use_external_head_ip`` is not set), otherwise its external IP.
    """
    # Fix: close the config file deterministically instead of relying on
    # garbage collection (previously `open(config_file).read()` leaked the
    # handle until GC).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    head_node = _get_running_head_node(config, config_file, override_cluster_name)
    provider_cfg = config.get("provider", {})
    # Get internal IP if using internal IPs and
    # use_external_head_ip is not specified
    if provider_cfg.get("use_internal_ips", False) and not provider_cfg.get(
        "use_external_head_ip", False
    ):
        head_node_ip = provider.internal_ip(head_node)
    else:
        head_node_ip = provider.external_ip(head_node)

    return head_node_ip
1368
+
1369
+
1370
def get_worker_node_ips(
    config_file: str, override_cluster_name: Optional[str] = None
) -> List[str]:
    """Returns worker node IPs for given configuration file.

    Args:
        config_file: Path to the cluster yaml.
        override_cluster_name: If given, overrides the cluster name from
            the config file.

    Returns:
        Internal IPs when the provider is configured with
        ``use_internal_ips``, external IPs otherwise.
    """
    # Fix: close the config file deterministically instead of relying on
    # garbage collection (previously `open(config_file).read()` leaked the
    # handle until GC).
    with open(config_file) as f:
        config = yaml.safe_load(f.read())
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    provider = _get_node_provider(config["provider"], config["cluster_name"])
    nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER})

    if config.get("provider", {}).get("use_internal_ips", False):
        return [provider.internal_ip(node) for node in nodes]
    else:
        return [provider.external_ip(node) for node in nodes]
1386
+
1387
+
1388
def _get_worker_nodes(
    config: Dict[str, Any], override_cluster_name: Optional[str]
) -> List[str]:
    """Returns worker node ids for given configuration."""
    # todo: technically could be reused in get_worker_node_ips
    if override_cluster_name is not None:
        config["cluster_name"] = override_cluster_name

    node_provider = _get_node_provider(config["provider"], config["cluster_name"])
    worker_filter = {TAG_RAY_NODE_KIND: NODE_KIND_WORKER}
    return node_provider.non_terminated_nodes(worker_filter)
1398
+
1399
+
1400
def _get_running_head_node(
    config: Dict[str, Any],
    printable_config_file: str,
    override_cluster_name: Optional[str],
    create_if_needed: bool = False,
    _provider: Optional[NodeProvider] = None,
    _allow_uninitialized_state: bool = False,
) -> str:
    """Get a valid, running head node.
    Args:
        config (Dict[str, Any]): Cluster Config dictionary
        printable_config_file: Used for printing formatted CLI commands.
        override_cluster_name: Passed to `get_or_create_head_node` to
            override the cluster name present in `config`.
        create_if_needed: Create a head node if one is not present.
        _provider: [For testing], a Node Provider to use.
        _allow_uninitialized_state: Whether to return a head node that
            is not 'UP TO DATE'. This is used to allow `ray attach` and
            `ray exec` to debug a cluster in a bad state.

    Returns:
        The node id of a running head node.

    Raises:
        RuntimeError: If no head node is found, creation was not requested,
            and no not-up-to-date fallback head is usable.
    """
    provider = _provider or _get_node_provider(
        config["provider"], config["cluster_name"]
    )
    head_node_tags = {
        TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
    }
    nodes = provider.non_terminated_nodes(head_node_tags)
    head_node = None
    # A head that exists but is not up-to-date; kept as a fallback that may
    # be returned when _allow_uninitialized_state is set.
    _backup_head_node = None
    for node in nodes:
        node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS)
        if node_state == STATUS_UP_TO_DATE:
            head_node = node
        else:
            _backup_head_node = node
            cli_logger.warning(f"Head node ({node}) is in state {node_state}.")

    if head_node is not None:
        return head_node
    elif create_if_needed:
        get_or_create_head_node(
            config,
            printable_config_file=printable_config_file,
            restart_only=False,
            no_restart=False,
            yes=True,
            override_cluster_name=override_cluster_name,
        )
        # NOTE: `_allow_uninitialized_state` is forced to False if
        # `create_if_needed` is set to True. This is to ensure that the
        # commands executed after creation occur on an actually running
        # cluster.
        return _get_running_head_node(
            config,
            printable_config_file,
            override_cluster_name,
            create_if_needed=False,
            _allow_uninitialized_state=False,
        )
    else:
        if _allow_uninitialized_state and _backup_head_node is not None:
            cli_logger.warning(
                f"The head node being returned: {_backup_head_node} is not "
                "`up-to-date`. If you are not debugging a startup issue "
                "it is recommended to restart this head node with: {}",
                cf.bold(f" ray down {printable_config_file}"),
            )

            return _backup_head_node
        raise RuntimeError(
            "Head node of cluster ({}) not found!".format(config["cluster_name"])
        )
1473
+
1474
+
1475
def get_local_dump_archive(
    stream: bool = False,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Create a debug archive from data collected on the local node.

    Args:
        stream: Write the archive bytes to stdout (fd 1) and remove the
            temporary file. Mutually exclusive with ``output``.
        output: Target path for the archive. Defaults to the archive's
            basename inside the current working directory.
        logs: Collect Ray session log files.
        debug_state: Collect cluster debug state information.
        pip: Collect the list of installed Python packages.
        processes: Collect information on running Ray processes.
        processes_verbose: Collect verbose process information.
        tempfile: Optional path for the intermediate archive file.
            NOTE(review): this parameter shadows the stdlib ``tempfile``
            module inside this function body.

    Returns:
        The path of the created archive, or None when streaming to stdout.

    Raises:
        ValueError: If both ``stream`` and ``output`` are given.
    """
    if stream and output:
        raise ValueError(
            "You can only use either `--output` or `--stream`, but not both."
        )

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        get_all_local_data(archive, parameters)

    tmp = archive.file

    if stream:
        # Dump the raw archive bytes to stdout (fd 1), then clean up.
        with open(tmp, "rb") as fp:
            os.write(1, fp.read())
        os.remove(tmp)
        return None

    target = output or os.path.join(os.getcwd(), os.path.basename(tmp))
    shutil.move(tmp, target)
    cli_logger.print(f"Created local data archive at {target}")

    return target
1514
+
1515
+
1516
def get_cluster_dump_archive(
    cluster_config_file: Optional[str] = None,
    host: Optional[str] = None,
    ssh_user: Optional[str] = None,
    ssh_key: Optional[str] = None,
    docker: Optional[str] = None,
    local: Optional[bool] = None,
    output: Optional[str] = None,
    logs: bool = True,
    debug_state: bool = True,
    pip: bool = True,
    processes: bool = True,
    processes_verbose: bool = False,
    tempfile: Optional[str] = None,
) -> Optional[str]:
    """Create a debug archive by collecting data from cluster nodes.

    Args:
        cluster_config_file: Cluster yaml used to derive hosts/credentials.
        host: Explicit host(s) to collect from, as an alternative to the
            cluster config.
        ssh_user: SSH user for connecting to the nodes.
        ssh_key: SSH key for connecting to the nodes.
        docker: Docker container name on the nodes.
        local: Whether to also collect data from the local machine. If None,
            inferred: local collection is enabled when no cluster config was
            given (see inline comment below).
        output: Target path for the archive; if None, a timestamped filename
            is created in the current working directory.
        logs: Collect Ray session log files.
        debug_state: Collect cluster debug state information.
        pip: Collect the list of installed Python packages.
        processes: Collect information on running Ray processes.
        processes_verbose: Collect verbose process information.
        tempfile: Optional path for the intermediate archive file.
            NOTE(review): this parameter shadows the stdlib ``tempfile``
            module inside this function body.

    Returns:
        The path of the created archive, or None if no nodes were found.
    """
    # Inform the user what kind of logs are collected (before actually
    # collecting, so they can abort)
    content_str = ""
    if logs:
        content_str += (
            " - The logfiles of your Ray session\n"
            "   This usually includes Python outputs (stdout/stderr)\n"
        )

    if debug_state:
        content_str += (
            " - Debug state information on your Ray cluster \n"
            "   e.g. number of workers, drivers, objects, etc.\n"
        )

    if pip:
        content_str += " - Your installed Python packages (`pip freeze`)\n"

    if processes:
        content_str += (
            " - Information on your running Ray processes\n"
            "   This includes command line arguments\n"
        )

    cli_logger.warning(
        "You are about to create a cluster dump. This will collect data from "
        "cluster nodes.\n\n"
        "The dump will contain this information:\n\n"
        f"{content_str}\n"
        f"If you are concerned about leaking private information, extract "
        f"the archive and inspect its contents before sharing it with "
        f"anyone."
    )

    # Parse arguments (e.g. fetch info from cluster config)
    (
        cluster_config_file,
        hosts,
        ssh_user,
        ssh_key,
        docker,
        cluster_name,
    ) = _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker)

    nodes = [
        Node(host=h, ssh_user=ssh_user, ssh_key=ssh_key, docker_container=docker)
        for h in hosts
    ]

    if not nodes:
        cli_logger.error(
            "No nodes found. Specify with `--host` or by passing a ray "
            "cluster config to `--cluster`."
        )
        return None

    # With a cluster config, the first listed host is the head node.
    if cluster_config_file:
        nodes[0].is_head = True

    if local is None:
        # If called with a cluster config, this was probably started
        # from a laptop
        local = not bool(cluster_config_file)

    parameters = GetParameters(
        logs=logs,
        debug_state=debug_state,
        pip=pip,
        processes=processes,
        processes_verbose=processes_verbose,
    )

    with Archive(file=tempfile) as archive:
        if local:
            create_archive_for_local_and_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )
        else:
            create_archive_for_remote_nodes(
                archive, remote_nodes=nodes, parameters=parameters
            )

    # Default output name: "<cluster>_<timestamp>.tar.gz" (or
    # "collected_logs_<timestamp>.tar.gz" when no cluster name is known).
    if not output:
        if cluster_name:
            filename = (
                f"{cluster_name}_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        else:
            filename = (
                f"collected_logs_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz"
            )
        output = os.path.join(os.getcwd(), filename)
    else:
        output = os.path.expanduser(output)

    shutil.move(archive.file, output)
    return output
1628
+
1629
+
1630
def confirm(msg: str, yes: bool) -> Optional[bool]:
    """Prompt the user for confirmation unless ``yes`` was given.

    Args:
        msg: Message shown in the interactive confirmation prompt.
        yes: When True, skip the prompt entirely.

    Returns:
        None when ``yes`` is set; otherwise the result of the click
        confirmation prompt (which aborts on a negative answer).
    """
    if yes:
        return None
    return click.confirm(msg, abort=True)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from ray._private.ray_constants import ( # noqa F401
5
+ AUTOSCALER_RESOURCE_REQUEST_CHANNEL,
6
+ DEFAULT_OBJECT_STORE_MEMORY_PROPORTION,
7
+ LABELS_ENVIRONMENT_VARIABLE,
8
+ LOGGER_FORMAT,
9
+ RESOURCES_ENVIRONMENT_VARIABLE,
10
+ )
11
+
12
+
13
def env_integer(key, default):
    """Read an integer configuration value from the environment.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset.

    Returns:
        ``sys.maxsize`` when the variable is set to the literal "inf",
        otherwise ``int`` of the variable's value, or ``default`` if unset.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    # "inf" is a sentinel meaning "effectively unlimited".
    return sys.maxsize if raw == "inf" else int(raw)
21
+
22
+
23
# Whether autoscaler cluster status logging is enabled. Set to 0 disable.
AUTOSCALER_STATUS_LOG = env_integer("RAY_ENABLE_CLUSTER_STATUS_LOG", 1)

# The name of the environment variable for plugging in a utilization scorer.
AUTOSCALER_UTILIZATION_SCORER_KEY = "RAY_AUTOSCALER_UTILIZATION_SCORER"

# Whether to avoid launching GPU nodes for CPU only tasks.
AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1)

# How long to wait for a node to start and terminate, in seconds.
AUTOSCALER_NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900)
AUTOSCALER_NODE_TERMINATE_WAIT_S = env_integer("AUTOSCALER_NODE_TERMINATE_WAIT_S", 900)

# Interval at which to check if node SSH became available.
AUTOSCALER_NODE_SSH_INTERVAL_S = env_integer("AUTOSCALER_NODE_SSH_INTERVAL_S", 5)

# Abort autoscaling if more than this number of errors are encountered. This
# is a safety feature to prevent e.g. runaway node launches.
AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)

# The maximum number of nodes to launch in a single request.
# Multiple requests may be made for this batch size, up to
# the limit of AUTOSCALER_MAX_CONCURRENT_LAUNCHES.
AUTOSCALER_MAX_LAUNCH_BATCH = env_integer("AUTOSCALER_MAX_LAUNCH_BATCH", 5)

# Max number of nodes to launch at a time.
AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
    "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10
)

# Default upscaling speed for the autoscaler. This specifies how many nodes
# to request at a time, where the desired number to upscale is
# min(1, upscaling_speed * current_num_nodes)
# e.g. 1.0 means to request enough nodes to double
# the cluster size in each round of requests.
# When the upscaling speed is 0.0, the autoscaler will request 1 node.
DEFAULT_UPSCALING_SPEED = 0.0

# Interval at which to perform autoscaling updates.
AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)

# The autoscaler will attempt to restart Ray on nodes it hasn't heard from
# in more than this interval.
AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30)
# The maximum number of nodes (including failed nodes) that the autoscaler will
# track for logging purposes.
AUTOSCALER_MAX_NODES_TRACKED = 1500

# Cap on how many individual failures are shown in status output.
AUTOSCALER_MAX_FAILURES_DISPLAYED = 20

# Maximum age (seconds) of a node-availability record before it is
# considered stale.
AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S = env_integer(
    "AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S", 30 * 60
)

# Whether per-node status is included in autoscaler status reports.
AUTOSCALER_REPORT_PER_NODE_STATUS = (
    env_integer("AUTOSCALER_REPORT_PER_NODE_STATUS", 1) == 1
)

# The maximum allowed resource demand vector size to guarantee the resource
# demand scheduler bin packing algorithm takes a reasonable amount of time
# to run.
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000

# Port that autoscaler prometheus metrics will be exported to
AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217)

# Max number of retries to AWS (default is 5, time increases exponentially)
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
# Max number of retries to create an EC2 node (retry different subnet)
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)

# ray home path in the container image
RAY_HOME = "/home/ray"

# The order of this list matters! `scripts.py` kills the ray processes in order of this
# list. Think twice when you add to this list.
# Invariants:
# RAYLET must be the first in the list.
# GCS SERVER must be the last in the list.
RAY_PROCESSES = [
    # The first element is the substring to filter.
    # The second element, if True, is to filter ps results by command name
    # (only the first 15 charactors of the executable name on Linux);
    # if False, is to filter ps results by command with all its arguments.
    # See STANDARD FORMAT SPECIFIERS section of
    # http://man7.org/linux/man-pages/man1/ps.1.html
    # about comm and args. This can help avoid killing non-ray processes.
    # Format:
    # Keyword to filter, filter by command (True)/filter by args (False)
    ["raylet", True],
    ["plasma_store", True],
    ["monitor.py", False],
    ["ray.util.client.server", False],
    ["default_worker.py", False],  # Python worker.
    ["setup_worker.py", False],  # Python environment setup worker.
    # For mac osx, setproctitle doesn't change the process name returned
    # by psutil but only cmdline.
    [
        "ray::",
        sys.platform != "darwin",
    ],  # Python worker. TODO(mehrdadn): Fix for Windows
    ["io.ray.runtime.runner.worker.DefaultWorker", False],  # Java worker.
    ["log_monitor.py", False],
    ["reporter.py", False],
    [os.path.join("dashboard", "agent.py"), False],
    [os.path.join("dashboard", "dashboard.py"), False],
    [os.path.join("runtime_env", "agent", "main.py"), False],
    ["ray_process_reaper.py", False],
    ["gcs_server", True],
]

# Max Concurrent SSH Calls to stop Docker
MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS", 50)

# Keys used in the provider config to toggle autoscaler behaviors.
DISABLE_NODE_UPDATERS_KEY = "disable_node_updaters"
DISABLE_LAUNCH_CONFIG_CHECK_KEY = "disable_launch_config_check"
FOREGROUND_NODE_LAUNCH_KEY = "foreground_node_launch"
WORKER_LIVENESS_CHECK_KEY = "worker_liveness_check"
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Dict
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+ try: # py3
7
+ from shlex import quote
8
+ except ImportError: # py2
9
+ from pipes import quote
10
+
11
+
12
+ def _check_docker_file_mounts(file_mounts: Dict[str, str]) -> None:
13
+ """Checks if files are passed as file_mounts. This is a problem for Docker
14
+ based clusters because when a file is bind-mounted in Docker, updates to
15
+ the file on the host do not always propagate to the container. Using
16
+ directories is recommended.
17
+ """
18
+ for remote, local in file_mounts.items():
19
+ if Path(local).is_file():
20
+ cli_logger.warning(
21
+ f"File Mount: ({remote}:{local}) refers to a file.\n To ensure"
22
+ " this mount updates properly, please use a directory."
23
+ )
24
+
25
+
26
def validate_docker_config(config: Dict[str, Any]) -> None:
    """Checks whether the Docker configuration is valid.

    A config without a "docker" section is trivially valid. Otherwise,
    whenever either a container name or an image is given, both must be
    given (the image may be a shared one, or separate head/worker images).
    """
    if "docker" not in config:
        return

    _check_docker_file_mounts(config.get("file_mounts", {}))

    docker_section = config["docker"]
    shared_image = docker_section.get("image")
    container_name = docker_section.get("container_name")

    # Head/worker images default to the shared image when unset.
    head_image = docker_section.get("head_image", shared_image)
    worker_image = docker_section.get("worker_image", shared_image)

    have_image = shared_image or (head_image and worker_image)
    if container_name or have_image:
        # Partial configuration is an error: both pieces must be present.
        assert container_name and have_image, "Must provide a container & image name"

    return None
47
+
48
+
49
def with_docker_exec(
    cmds, container_name, docker_cmd, env_vars=None, with_interactive=False
):
    """Wrap each command in ``cmds`` in a ``docker exec`` invocation.

    Args:
        cmds: List of shell command strings to run inside the container.
        container_name: Name of the target container.
        docker_cmd: Container binary to invoke ("docker" or "podman").
        env_vars: Optional iterable of environment variable names to
            forward from the host into the container (``-e NAME=$NAME``).
        with_interactive: If True, pass ``-it`` to allocate a TTY.

    Returns:
        List of fully-formed shell command strings, one per input command.
    """
    assert docker_cmd, "Must provide docker command"
    env_str = ""
    if env_vars:
        env_str = " ".join(["-e {env}=${env}".format(env=env) for env in env_vars])
    return [
        # Use the provided docker_cmd (e.g. "podman") rather than a
        # hard-coded "docker"; previously the parameter was asserted but
        # then ignored in the generated command line.
        "{docker_cmd} exec {interactive} {env} {container} /bin/bash -c {cmd} ".format(
            docker_cmd=docker_cmd,
            interactive="-it" if with_interactive else "",
            env=env_str,
            container=container_name,
            cmd=quote(cmd),
        )
        for cmd in cmds
    ]
65
+
66
+
67
+ def _check_helper(cname, template, docker_cmd):
68
+ return " ".join(
69
+ [docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true"]
70
+ )
71
+
72
+
73
def check_docker_running_cmd(cname, docker_cmd):
    """Shell command printing whether container ``cname`` is running."""
    return _check_helper(cname, ".State.Running", docker_cmd)
75
+
76
+
77
def check_bind_mounts_cmd(cname, docker_cmd):
    """Shell command printing the bind mounts of container ``cname`` as JSON."""
    return _check_helper(cname, "json .Mounts", docker_cmd)
79
+
80
+
81
def check_docker_image(cname, docker_cmd):
    """Shell command printing the image that container ``cname`` runs."""
    return _check_helper(cname, ".Config.Image", docker_cmd)
83
+
84
+
85
def docker_start_cmds(
    user,
    image,
    mount_dict,
    container_name,
    user_options,
    cluster_name,
    home_directory,
    docker_cmd,
):
    """Build the shell command that launches the Ray docker container.

    Mounts every destination in ``mount_dict`` from the cluster's host
    mount location, forwards locale env vars needed by the ray CLI, and
    starts a detached interactive ``bash`` in host networking mode.
    """
    # Imported here due to circular dependency.
    from ray.autoscaler.sdk import get_docker_host_mount_location

    host_prefix = get_docker_host_mount_location(cluster_name)
    # Map host-side mount path -> in-container destination.
    mounts = {f"{host_prefix}/{dst}": dst for dst in mount_dict}

    volume_args = []
    for src, dst in mounts.items():
        # "~/" in the destination is expanded to the container user's home.
        expanded = dst.replace("~/", home_directory + "/")
        volume_args.append("-v {src}:{dest}".format(src=src, dest=expanded))
    mount_flags = " ".join(volume_args)

    # for click, used in ray cli
    env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"}
    env_flags = " ".join(
        "-e {name}={val}".format(name=name, val=val) for name, val in env_vars.items()
    )

    docker_run = [
        docker_cmd,
        "run",
        "--rm",
        "--name {}".format(container_name),
        "-d",
        "-it",
        mount_flags,
        env_flags,
        " ".join(user_options),
        "--net=host",
        image,
        "bash",
    ]
    return " ".join(docker_run)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from threading import RLock
3
+ from typing import Any, Callable, Dict, List
4
+
5
+
6
class EventSummarizer:
    """Utility that aggregates related log messages to reduce log spam."""

    def __init__(self):
        # Aggregated quantity keyed by message template.
        self.events_by_key: Dict[str, int] = {}
        # Messages to send in next summary batch.
        self.messages_to_send: List[str] = []
        # Expiry timestamp per throttle key. While a key is present its
        # message is suppressed; it may be re-sent after the TTL passes.
        self.throttled_messages: Dict[str, float] = {}

        # Shared between the main thread and node launcher child threads.
        self.lock = RLock()

    def add(
        self, template: str, *, quantity: Any, aggregate: Callable[[Any, Any], Any]
    ) -> None:
        """Add a log message, which will be combined by template.

        Args:
            template: Format string with one placeholder for quantity.
            quantity: Quantity to aggregate.
            aggregate: Aggregation function used to combine the
                quantities. The result is inserted into the template to
                produce the final log message.
        """
        with self.lock:
            # Enforce proper sentence structure.
            if not template.endswith("."):
                template += "."
            if template in self.events_by_key:
                existing = self.events_by_key[template]
                self.events_by_key[template] = aggregate(existing, quantity)
            else:
                self.events_by_key[template] = quantity

    def add_once_per_interval(self, message: str, key: str, interval_s: int):
        """Add a log message, which is throttled once per interval by a key.

        Args:
            message: The message to log.
            key: The key to use to deduplicate the message.
            interval_s: Throttling interval in seconds.
        """
        with self.lock:
            if key in self.throttled_messages:
                return
            self.throttled_messages[key] = time.time() + interval_s
            self.messages_to_send.append(message)

    def summary(self) -> List[str]:
        """Generate the aggregated log summary of all added events."""
        with self.lock:
            rendered = [tpl.format(qty) for tpl, qty in self.events_by_key.items()]
            rendered.extend(self.messages_to_send)
            return rendered

    def clear(self) -> None:
        """Clear the events added."""
        with self.lock:
            self.events_by_key.clear()
            self.messages_to_send.clear()
            # Expire any messages that have reached their TTL so they can
            # be sent again.
            now = time.time()
            expired = [k for k, expiry in self.throttled_messages.items() if now > expiry]
            for k in expired:
                del self.throttled_messages[k]
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum, auto
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+
7
class CreateClusterEvent(Enum):
    """Stages of ``ray.autoscaler.sdk.create_or_update_cluster`` that can
    be tracked with callbacks.

    Attributes:
        up_started : Invoked at the beginning of create_or_update_cluster.
        ssh_keypair_downloaded : Invoked when the ssh keypair is downloaded.
        cluster_booting_started : Invoked when the cluster booting starts.
        acquiring_new_head_node : Invoked before the head node is acquired.
        head_node_acquired : Invoked after the head node is acquired.
        ssh_control_acquired : Invoked when the node is being updated.
        run_initialization_cmd : Invoked before all initialization
            commands are called and again before each initialization command.
        run_setup_cmd : Invoked before all setup commands are
            called and again before each setup command.
        start_ray_runtime : Invoked before ray start commands are run.
        start_ray_runtime_completed : Invoked after ray start commands
            are run.
        cluster_booting_completed : Invoked after cluster booting
            is completed.
    """

    up_started = auto()
    ssh_keypair_downloaded = auto()
    cluster_booting_started = auto()
    acquiring_new_head_node = auto()
    head_node_acquired = auto()
    ssh_control_acquired = auto()
    run_initialization_cmd = auto()
    run_setup_cmd = auto()
    start_ray_runtime = auto()
    start_ray_runtime_completed = auto()
    cluster_booting_completed = auto()
+
40
+
41
class _EventSystem:
    """Event system that handles storing and calling callbacks for events.

    Attributes:
        callback_map (Dict[str, List[Callable]]) : Stores list of callbacks
            for events when registered.
    """

    def __init__(self):
        self.callback_map = {}

    def add_callback_handler(
        self,
        event: str,
        callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]],
    ):
        """Stores callback handler for event.

        Args:
            event: Event that callback should be called on. See
                CreateClusterEvent for details on the events available to be
                registered against.
            callback (Callable[[Dict], None]): Callable object (or list of
                callables) invoked when the specified event occurs.
        """
        if event not in CreateClusterEvent.__members__.values():
            # Unknown events are registered anyway, but the user is warned.
            cli_logger.warning(
                f"{event} is not currently tracked, and this"
                " callback will not be invoked."
            )

        handlers = callback if type(callback) is list else [callback]
        self.callback_map.setdefault(event, []).extend(handlers)

    def execute_callback(
        self, event: CreateClusterEvent, event_data: Optional[Dict[str, Any]] = None
    ):
        """Executes all callbacks for event.

        Args:
            event: Event that is invoked. See CreateClusterEvent
                for details on the available events.
            event_data (Dict[str, Any]): Argument that is passed to each
                callable object stored for this particular event.
        """
        if event_data is None:
            event_data = {}

        # NOTE: the caller-supplied dict is mutated to carry the event name.
        event_data["event_name"] = event
        for callback in self.callback_map.get(event, []):
            callback(event_data)

    def clear_callbacks_for_event(self, event: str):
        """Clears stored callable objects for event.

        Args:
            event: Event that has callable objects stored in map.
                See CreateClusterEvent for details on the available events.
        """
        self.callback_map.pop(event, None)


# Module-level singleton used throughout the autoscaler SDK.
global_event_system = _EventSystem()
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from typing import Dict, List, Tuple
4
+
5
+ from ray.autoscaler._private.docker import with_docker_exec
6
+ from ray.autoscaler.command_runner import CommandRunnerInterface
7
+
8
+
9
class FakeDockerCommandRunner(CommandRunnerInterface):
    """Command runner for the fake docker multinode cluster.

    This command runner uses ``docker exec`` and ``docker cp`` to
    run commands and copy files, respectively.

    The regular ``DockerCommandRunner`` is made for use in SSH settings
    where Docker runs on a remote host. In contrast, this command runner
    does not wrap the docker commands in ssh calls.
    """

    def __init__(self, docker_config, **common_args):
        self.container_name = docker_config["container_name"]
        self.docker_config = docker_config
        # Lazily-resolved home directory inside the container
        # (see ``_docker_expand_user``).
        self.home_dir = None
        self.initialized = False
        # Optionally use 'podman' instead of 'docker'
        use_podman = docker_config.get("use_podman", False)
        self.docker_cmd = "podman" if use_podman else "docker"

    def _run_shell(self, cmd: str, timeout: int = 120) -> str:
        """Run ``cmd`` through the host shell and return its stdout."""
        return subprocess.check_output(
            cmd, shell=True, timeout=timeout, encoding="utf-8"
        )

    def run(
        self,
        cmd: str = None,
        timeout: int = 120,
        exit_on_fail: bool = False,
        port_forward: List[Tuple[int, int]] = None,
        with_output: bool = False,
        environment_variables: Dict[str, object] = None,
        run_env: str = "auto",
        ssh_options_override_ssh_key: str = "",
        shutdown_after_run: bool = False,
    ) -> str:
        """Run ``cmd`` inside the container via ``docker exec``."""
        prefix = with_docker_exec(
            [cmd],
            container_name=self.container_name,
            with_interactive=False,
            docker_cmd=self.docker_cmd,
        )[0]
        return self._run_shell(prefix)

    def run_init(
        self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool
    ):
        # No container initialization is needed for the fake cluster.
        pass

    def remote_shell_command_str(self):
        """Return a command that opens an interactive container shell."""
        return "{} exec -it {} bash".format(self.docker_cmd, self.container_name)

    def run_rsync_down(self, source, target, options=None):
        """Copy ``source`` from the container down to ``target`` on the host."""
        docker_dir = os.path.dirname(self._docker_expand_user(source))

        # Fix: honor self.docker_cmd (e.g. podman) instead of a
        # hard-coded "docker" binary, consistent with run()/remote_shell.
        self._run_shell(
            f"{self.docker_cmd} cp {self.container_name}:{docker_dir} {target}"
        )

    def run_rsync_up(self, source, target, options=None):
        """Copy ``source`` from the host up into ``target`` in the container."""
        docker_dir = os.path.dirname(self._docker_expand_user(target))
        self.run(cmd=f"mkdir -p {docker_dir}")

        # Fix: honor self.docker_cmd instead of a hard-coded "docker".
        self._run_shell(
            f"{self.docker_cmd} cp {source} {self.container_name}:{docker_dir}"
        )

    def _docker_expand_user(self, string, any_char=False):
        """Expand a leading "~" using the container user's home directory.

        The home directory is resolved once via ``printenv HOME`` inside
        the container and cached on the instance.
        """
        user_pos = string.find("~")
        if user_pos > -1:
            if self.home_dir is None:
                # Fix: with_docker_exec returns a *list* of command strings,
                # while _run_shell expects a single command; take the first.
                self.home_dir = self._run_shell(
                    with_docker_exec(
                        ["printenv HOME"],
                        container_name=self.container_name,
                        docker_cmd=self.docker_cmd,
                    )[0]
                ).strip()

            if any_char:
                return string.replace("~/", self.home_dir + "/")

            elif not any_char and user_pos == 0:
                return string.replace("~", self.home_dir, 1)

        return string
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fake multinode docker monitoring script.
2
+
3
+ This script is the "docker compose server" for the fake_multinode
4
+ provider using Docker compose. It should be started before running
5
+ `RAY_FAKE_CLUSTER=1 ray up <cluster_config>`.
6
+
7
+ This script reads the volume directory from a supplied fake multinode
8
+ docker cluster config file.
9
+ It then waits until a docker-compose.yaml file is created in the same
10
+ directory, which is done by the `ray up` command.
11
+
12
+ It then watches for changes in the docker-compose.yaml file and runs
13
+ `docker compose up` whenever changes are detected. This will start docker
14
+ containers as requested by the autoscaler.
15
+
16
+ Generally, the docker-compose.yaml will be mounted in the head node of the
17
+ cluster, which will then continue to change it according to the autoscaler
18
+ requirements.
19
+
20
+ Additionally, this script monitors the docker container status using
21
+ `docker status` and writes it into a `status.json`. This information is
22
+ again used by the autoscaler to determine if any nodes have died.
23
+ """
24
+ import argparse
25
+ import json
26
+ import os
27
+ import shutil
28
+ import subprocess
29
+ import time
30
+ from typing import Any, Dict, List, Optional
31
+
32
+ import yaml
33
+
34
+
35
+ def _read_yaml(path: str):
36
+ with open(path, "rt") as f:
37
+ return yaml.safe_load(f)
38
+
39
+
40
def _update_docker_compose(
    docker_compose_path: str, project_name: str, status: Optional[Dict[str, Any]]
) -> bool:
    """Apply the current compose file to the cluster via ``docker compose``.

    Runs ``docker compose up -d`` for the given compose file/project, or
    ``docker compose down`` when the file no longer lists any services.

    Args:
        docker_compose_path: Path to the docker-compose.yaml file.
        project_name: Docker compose project name (``-p``).
        status: Last known container status map. When non-empty,
            ``--no-recreate`` is passed so running containers are kept.

    Returns:
        True if the cluster was shut down (``down`` was executed),
        False otherwise (including when the compose file is empty).
    """
    docker_compose_config = _read_yaml(docker_compose_path)

    if not docker_compose_config:
        print("Docker compose currently empty")
        return False

    cmd = ["up", "-d"]
    if status and len(status) > 0:
        # Containers already exist; don't recreate them for config changes.
        cmd += ["--no-recreate"]

    shutdown = False
    if not docker_compose_config["services"]:
        # If no more nodes, run `down` instead of `up`
        print("Shutting down nodes")
        cmd = ["down"]
        shutdown = True
    try:
        subprocess.check_call(
            ["docker", "compose", "-f", docker_compose_path, "-p", project_name]
            + cmd
            + [
                "--remove-orphans",
            ]
        )
    except Exception as e:
        # Best effort: a transient compose failure is retried on the next
        # config change; the monitoring loop must keep running.
        print(f"Ran into error when updating docker compose: {e}")
        # Ignore error

    return shutdown
72
+
73
+
74
def _get_ip(
    project_name: str,
    container_name: str,
    override_network: Optional[str] = None,
    retry_times: int = 3,
) -> Optional[str]:
    """Return the IP address of ``container_name`` on the compose network.

    Uses ``docker inspect`` and retries up to ``retry_times`` times with a
    one second pause between attempts (the container may still be coming up).

    Args:
        project_name: Compose project name; the default network is
            ``<project_name>_ray_local``.
        container_name: Docker container to inspect.
        override_network: Use this network name instead of the default.
        retry_times: Number of inspect attempts before giving up.

    Returns:
        The IP address string, or None if it could not be determined.
    """
    network = override_network or f"{project_name}_ray_local"

    cmd = [
        "docker",
        "inspect",
        "-f",
        '"{{ .NetworkSettings.Networks' f".{network}.IPAddress" ' }}"',
        f"{container_name}",
    ]
    for i in range(retry_times):
        try:
            ip_address = subprocess.check_output(cmd, encoding="utf-8")
        except Exception:
            # Inspect can fail while the container is starting; retry.
            time.sleep(1)
        else:
            # Strip whitespace and the surrounding quotes emitted by the
            # inspect template.
            return ip_address.strip().strip('"').strip('\\"')
    return None
97
+
98
+
99
def _update_docker_status(
    docker_compose_path: str, project_name: str, docker_status_path: str
):
    """Fetch container states via ``docker compose ps`` and persist them.

    The per-service status (including the container IP for running
    containers) is written as JSON to ``docker_status_path``, where the
    autoscaler reads it to detect dead nodes.

    Returns:
        The status dict keyed by compose service name, or None on error.
    """
    data_str = ""
    try:
        # `ps --format json` emits one JSON object per line.
        data_str = (
            subprocess.check_output(
                [
                    "docker",
                    "compose",
                    "-f",
                    docker_compose_path,
                    "-p",
                    project_name,
                    "ps",
                    "--format",
                    "json",
                ]
            )
            .decode("utf-8")
            .strip()
            .split("\n")
        )
        data: List[Dict[str, str]] = []
        for line in data_str:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    except Exception as e:
        print(f"Ran into error when fetching status: {e}")
        print(f"docker compose ps output: {data_str}")
        return None

    status = {}
    for container in data:
        node_id = container["Service"]
        container_name = container["Name"]
        if container["State"] == "running":
            ip = _get_ip(project_name, container_name)
        else:
            # Stopped containers report an empty IP.
            ip = ""
        container["IP"] = ip
        status[node_id] = container

    with open(docker_status_path, "wt") as f:
        json.dump(status, f)

    return status
147
+
148
+
149
def monitor_docker(
    docker_compose_path: str,
    status_path: str,
    project_name: str,
    update_interval: float = 1.0,
):
    """Watch the compose file and keep the cluster and status file in sync.

    Blocks until ``docker_compose_path`` exists, then loops: re-applies the
    compose file whenever its contents change, and refreshes the container
    status file every ``update_interval`` seconds. Returns once the cluster
    has been shut down (compose file lists no services).
    """
    while not os.path.exists(docker_compose_path):
        # Wait until cluster is created
        time.sleep(0.5)

    print("Docker compose config detected, starting status monitoring")

    # Make sure this is always writeable from inside the containers
    os.chmod(docker_compose_path, 0o777)

    # Sentinel that never equals a real compose config, forcing an initial
    # `docker compose up` on the first loop iteration.
    docker_config = {"force_update": True}

    # Force update
    next_update = time.monotonic() - 1.0
    shutdown = False
    status = None

    # Loop:
    # If the config changed, update cluster.
    # Every `update_interval` seconds, update docker status.
    while not shutdown:
        new_docker_config = _read_yaml(docker_compose_path)
        if new_docker_config != docker_config:
            # Update cluster
            shutdown = _update_docker_compose(docker_compose_path, project_name, status)

            # Force status update
            next_update = time.monotonic() - 1.0

        if time.monotonic() > next_update:
            # Update docker status
            status = _update_docker_status(
                docker_compose_path, project_name, status_path
            )
            next_update = time.monotonic() + update_interval

        docker_config = new_docker_config
        time.sleep(0.1)

    print("Cluster shut down, terminating monitoring script.")
194
+
195
+
196
def start_monitor(config_file: str):
    """Prepare the shared volume directory and run the monitoring loop.

    Reads the fake multinode docker cluster config, copies it into the
    shared volume directory as the bootstrap config, removes any stale
    docker-compose/status files left over from previous runs, and then
    blocks inside ``monitor_docker``.

    Args:
        config_file: Path to a cluster config whose provider type is
            ``fake_multinode_docker``.
    """
    cluster_config = _read_yaml(config_file)

    provider_config = cluster_config["provider"]
    assert provider_config["type"] == "fake_multinode_docker", (
        f"The docker monitor only works with providers of type "
        f"`fake_multinode_docker`, got `{provider_config['type']}`"
    )

    project_name = provider_config["project_name"]

    volume_dir = provider_config["shared_volume_dir"]
    os.makedirs(volume_dir, mode=0o755, exist_ok=True)

    # Create bootstrap config
    bootstrap_config_path = os.path.join(volume_dir, "bootstrap_config.yaml")
    shutil.copy(config_file, bootstrap_config_path)

    # These two files usually don't exist, yet
    docker_compose_config_path = os.path.join(volume_dir, "docker-compose.yaml")

    docker_status_path = os.path.join(volume_dir, "status.json")

    if os.path.exists(docker_compose_config_path):
        # We wait until this file exists, so remove it if it exists
        # from a previous run.
        os.remove(docker_compose_config_path)

    if os.path.exists(docker_status_path):
        os.remove(docker_status_path)
    # Create empty file so it can be mounted
    with open(docker_status_path, "wt") as f:
        f.write("{}")

    print(
        f"Starting monitor process. Please start Ray cluster with:\n"
        f" RAY_FAKE_CLUSTER=1 ray up {config_file}"
    )
    monitor_docker(docker_compose_config_path, docker_status_path, project_name)
235
+
236
+
237
if __name__ == "__main__":
    # CLI entry point: takes the cluster config path as the sole argument
    # and blocks in the monitoring loop until the cluster shuts down.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "config_file",
        help="Path to cluster config file containing a fake docker "
        "cluster configuration.",
    )
    args = parser.parse_args()

    start_monitor(args.config_file)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/test_utils.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ import tempfile
9
+ import threading
10
+ import time
11
+ from typing import Any, Dict, Optional
12
+
13
+ import yaml
14
+
15
+ import ray
16
+ from ray._private.dict import deep_update
17
+ from ray.autoscaler._private.fake_multi_node.node_provider import (
18
+ FAKE_DOCKER_DEFAULT_CLIENT_PORT,
19
+ FAKE_DOCKER_DEFAULT_GCS_PORT,
20
+ )
21
+ from ray.util.queue import Empty, Queue
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ DEFAULT_DOCKER_IMAGE = "rayproject/ray:nightly-py{major}{minor}-cpu"
26
+
27
+
28
+ class ResourcesNotReadyError(RuntimeError):
29
+ pass
30
+
31
+
32
class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self._base_config_file = os.path.join(
            os.path.dirname(__file__), "example_docker.yaml"
        )
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(
            os.path.dirname(__file__), "docker_monitor.py"
        )
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        """Path to the written cluster.yaml config file."""
        return self._config_file

    @property
    def cluster_config(self):
        """The fully merged cluster config dict."""
        return self._cluster_config

    @property
    def cluster_dir(self):
        """Temporary directory shared with the cluster containers."""
        return self._tempdir

    @property
    def gcs_port(self):
        """Host port forwarded to the head node GCS."""
        return self._cluster_config.get("provider", {}).get(
            "host_gcs_port", FAKE_DOCKER_DEFAULT_GCS_PORT
        )

    @property
    def client_port(self):
        """Host port forwarded to the head node Ray client server."""
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT
        )

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.

        Raises:
            RuntimeError: If the cluster could not be reached in time.
        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                # Cluster still booting; retry until the deadline.
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            # Host-side worker loop: consume commands from the shared queue
            # until the cluster is stopped (see ``stop()``).
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue

                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()

        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        Raises:
            ResourcesNotReadyError: If the resources did not become
                available within the timeout.
        """
        # Use a separate name for the deadline instead of shadowing the
        # `timeout` parameter.
        deadline = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > deadline:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}"
                )
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current
                config with.
        """
        assert self._tempdir, "Call setup() first"

        config = config or {}

        if config:
            self._partial_config = config

        if not config.get("provider", {}).get("image"):
            # No image specified, trying to parse from buildkite
            docker_image = os.environ.get("RAY_DOCKER_IMAGE", None)

            if not docker_image:
                # If still no docker image, use one according to Python version
                mj = sys.version_info.major
                mi = sys.version_info.minor

                docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi)

            self._docker_image = docker_image

        with open(self._base_config_file, "rt") as f:
            cluster_config = yaml.safe_load(f)

        if self._partial_config:
            deep_update(cluster_config, self._partial_config, new_keys_allowed=True)

        if self._docker_image:
            cluster_config["provider"]["image"] = self._docker_image

        cluster_config["provider"]["shared_volume_dir"] = self._tempdir

        self._cluster_config = cluster_config

        with open(self._config_file, "wt") as f:
            yaml.safe_dump(self._cluster_config, f)

        logging.info(f"Updated cluster config to: {self._cluster_config}")

    def maybe_pull_image(self):
        """Pull the configured docker image if it is not present locally."""
        if self._docker_image:
            try:
                images_str = subprocess.check_output(
                    f"docker image inspect {self._docker_image}", shell=True
                )
                images = json.loads(images_str)
            except Exception as e:
                logger.error(f"Error inspecting image {self._docker_image}: {e}")
                return

            if not images:
                try:
                    subprocess.check_call(
                        f"docker pull {self._docker_image}", shell=True
                    )
                except Exception as e:
                    logger.error(f"Error pulling image {self._docker_image}: {e}")

    def setup(self):
        """Setup docker compose cluster environment.

        Creates the temporary directory, writes the initial config file,
        and pulls the docker image, if required.
        """
        self._tempdir = tempfile.mkdtemp(dir=os.environ.get("RAY_TEMPDIR", None))
        # Must be writeable from inside the containers.
        os.chmod(self._tempdir, 0o777)
        self._config_file = os.path.join(self._tempdir, "cluster.yaml")
        self._nodes_file = os.path.join(self._tempdir, "nodes.json")
        self._status_file = os.path.join(self._tempdir, "status.json")
        self.update_config()
        self.maybe_pull_image()

    def teardown(self, keep_dir: bool = False):
        """Tear down docker compose cluster environment.

        Args:
            keep_dir: If True, cluster directory
                will not be removed after termination.
        """
        if not keep_dir:
            shutil.rmtree(self._tempdir)
        self._tempdir = None
        self._config_file = None

    def _start_monitor(self):
        # Launch the docker compose monitor script in the background; it
        # reacts to docker-compose.yaml changes written by `ray up`.
        self._monitor_process = subprocess.Popen(
            [sys.executable, self._monitor_script, self.config_file]
        )
        time.sleep(2)

    def _stop_monitor(self):
        if self._monitor_process:
            try:
                self._monitor_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                # Fix: Popen.wait() raises on timeout instead of returning,
                # so the previous poll()-then-terminate() path was
                # unreachable; terminate explicitly when the monitor did not
                # exit on its own.
                self._monitor_process.terminate()
            self._monitor_process = None

    def start(self):
        """Start docker compose cluster.

        Starts the monitor process and runs ``ray up``.
        """
        self._start_monitor()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True
        )

    def stop(self):
        """Stop docker compose cluster.

        Runs ``ray down`` and stops the monitor process.
        """
        # Fix: ``ray.is_initialized`` is a function; referencing it without
        # calling made this condition always truthy.
        if ray.is_initialized():
            ray.shutdown()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True
        )

        self._stop_monitor()
        self._execution_event.set()

    def _update_nodes(self):
        # Refresh the node map written by the fake multinode provider.
        with open(self._nodes_file, "rt") as f:
            self._nodes = json.load(f)

    def _update_status(self):
        # Refresh the container status map written by the docker monitor.
        with open(self._status_file, "rt") as f:
            self._status = json.load(f)

    def _get_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ) -> str:
        """Resolve exactly one of ``node_id``/``num``/``rand`` to a node ID."""
        self._update_nodes()
        if node_id:
            assert (
                not num and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
        elif num:
            assert (
                not node_id and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
            # Fake node IDs are a fixed prefix plus a zero-padded counter.
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (
                not node_id and not num
            ), "Only provide either `node_id`, `num`, or `random`."
            assert rand in [
                "worker",
                "any",
            ], "`random` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                # Exclude the head node (counter 00000).
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000"
                )
            # Else: any
            node_id = random.choice(choices)

        assert node_id in self._nodes, f"Node with ID {node_id} is not in active nodes."
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        """Return the docker container name for ``node_id``, if known."""
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None

        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct node_id from this number, and kill
        that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a random
        node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_call(f"docker kill {container}", shell=True)
+
372
+
373
class RemoteAPI:
    """Remote API to control cluster state from within cluster tasks.

    This API uses a Ray queue to interact with an execution thread on the
    host machine that will execute commands passed to the queue.

    Instances of this class can be serialized and passed to Ray remote actors
    to interact with cluster state (but they can also be used outside actors).

    The API subset is limited to specific commands.

    Args:
        queue: Ray queue to push command instructions to.
    """

    def __init__(self, queue: Queue):
        self._queue = queue

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Request that the host kill a node (see DockerCluster.kill_node)."""
        payload = {"node_id": node_id, "num": num, "rand": rand}
        self._queue.put(("kill_node", payload))
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/legacy_info_string.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS_LEGACY
4
+ from ray.experimental.internal_kv import _internal_kv_initialized, _internal_kv_put
5
+
6
+ """This file provides legacy support for the old info string in order to
7
+ ensure the dashboard's `api/cluster_status` does not break backwards
8
+ compatibility.
9
+ """
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def legacy_log_info_string(autoscaler, nodes):
    """Assemble the legacy cluster-status string, persist it, and log it.

    The string is written to the internal KV store under
    ``DEBUG_AUTOSCALING_STATUS_LEGACY`` (consumed by the dashboard's
    ``api/cluster_status``) and emitted at DEBUG level.
    """
    parts = [
        "Cluster status: ",
        info_string(autoscaler, nodes),
        "\n",
        autoscaler.load_metrics.info_string(),
        "\n",
        autoscaler.resource_demand_scheduler.debug_string(
            nodes,
            autoscaler.pending_launches.breakdown(),
            autoscaler.load_metrics.get_resource_utilization(),
        ),
    ]
    status = "".join(parts)
    if _internal_kv_initialized():
        _internal_kv_put(DEBUG_AUTOSCALING_STATUS_LEGACY, status, overwrite=True)
    logger.debug(status)
28
+
29
+
30
def info_string(autoscaler, nodes):
    """Return a short node-count summary, e.g. ``"3 nodes (1 updating)"``."""
    annotations = []
    if autoscaler.updaters:
        annotations.append(" ({} updating)".format(len(autoscaler.updaters)))
    if autoscaler.num_failed_updates:
        annotations.append(
            " ({} failed to update)".format(len(autoscaler.num_failed_updates))
        )
    return "{} nodes{}".format(len(nodes), "".join(annotations))
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from collections import Counter
4
+ from functools import reduce
5
+ from typing import Dict, List
6
+
7
+ from ray._private.gcs_utils import PlacementGroupTableData
8
+ from ray.autoscaler._private.constants import (
9
+ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
10
+ AUTOSCALER_REPORT_PER_NODE_STATUS,
11
+ )
12
+ from ray.autoscaler._private.util import (
13
+ DictCount,
14
+ LoadMetricsSummary,
15
+ NodeIP,
16
+ ResourceDict,
17
+ )
18
+ from ray.core.generated.common_pb2 import PlacementStrategy
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
def add_resources(dict1: Dict[str, float], dict2: Dict[str, float]) -> Dict[str, float]:
    """Sum two resource dictionaries key-wise.

    Keys present in only one input keep that input's value.

    Returns:
        dict: A new dictionary (both inputs remain unmodified).
    """
    total = dict(dict1)
    for key, amount in dict2.items():
        total[key] = total.get(key, 0) + amount
    return total
33
+
34
+
35
def freq_of_dicts(dicts: List[Dict], serializer=None, deserializer=dict) -> DictCount:
    """Count occurrences in a list of dictionaries (or other unhashables).

    Mutable structures cannot be Counter keys directly, so each item is
    first mapped through ``serializer`` to something hashable, counted,
    then mapped back through ``deserializer``.

    Args:
        dicts (List[D]): Items to count.
        serializer (D -> S): Produces a hashable key; defaults to a
            frozenset of the dict's KV pairs.
        deserializer (S -> U): Inverse mapping for the output; for the
            default serializer, ``dict`` restores the original mapping.

    Returns:
        List[Tuple[U, int]]: One (unique item, frequency) pair per
        distinct entry of ``dicts``.
    """
    if serializer is None:

        def serializer(d):  # noqa: E731 equivalent, as a nested def
            return frozenset(d.items())

    counts = Counter(serializer(d) for d in dicts)
    return [(deserializer(key), n) for key, n in counts.items()]
62
+
63
+
64
class LoadMetrics:
    """Container for cluster load metrics.

    Metrics here are updated from raylet heartbeats. The autoscaler
    queries these metrics to determine when to scale up, and which nodes
    can be removed.
    """

    def __init__(self):
        # ip -> time.time() of the most recent heartbeat (or setup).
        self.last_heartbeat_time_by_ip = {}
        # ip -> configured/total resources, e.g. {"CPU": 4, "GPU": 1}.
        self.static_resources_by_ip = {}
        # ip -> currently available resources (filled in update()).
        self.dynamic_resources_by_ip = {}
        # ip -> raylet id (bytes) as reported to update().
        self.raylet_id_by_ip = {}
        # Resource bundles queued but schedulable on some node type.
        self.waiting_bundles = []
        # Resource bundles that fit no node type.
        self.infeasible_bundles = []
        # Pending placement group table entries.
        self.pending_placement_groups = []
        # Explicit demands set via set_resource_requests().
        self.resource_requests = []
        # Set from the most recent update() call.
        self.cluster_full_of_actors_detected = False
        # ip -> last time.time() the node was observed non-idle.
        self.ray_nodes_last_used_time_by_ip = {}

    def __bool__(self):
        """A load metrics instance is Falsey iff the autoscaler process
        has not received a resource message from the GCS.
        """
        return bool(self.raylet_id_by_ip)

    def update(
        self,
        ip: str,
        raylet_id: bytes,
        static_resources: Dict[str, Dict],
        dynamic_resources: Dict[str, Dict],
        node_idle_duration_s: float,
        waiting_bundles: List[Dict[str, float]] = None,
        infeasible_bundles: List[Dict[str, float]] = None,
        pending_placement_groups: List[PlacementGroupTableData] = None,
        cluster_full_of_actors_detected: bool = False,
    ):
        """Record one resource report for the node at ``ip``.

        Per-node maps are updated for ``ip``; the bundle/placement-group
        lists are cluster-wide and replaced wholesale on every call.
        """
        self.static_resources_by_ip[ip] = static_resources
        self.raylet_id_by_ip[ip] = raylet_id
        self.cluster_full_of_actors_detected = cluster_full_of_actors_detected

        # Normalize None (and other falsey) inputs to fresh empty lists.
        if not waiting_bundles:
            waiting_bundles = []
        if not infeasible_bundles:
            infeasible_bundles = []
        if not pending_placement_groups:
            pending_placement_groups = []

        # We are not guaranteed to have a corresponding dynamic resource
        # for every static resource because dynamic resources are based on
        # the available resources in the heartbeat, which does not exist
        # if it is zero. Thus, we have to update dynamic resources here.
        dynamic_resources_update = dynamic_resources.copy()
        for resource_name, capacity in self.static_resources_by_ip[ip].items():
            if resource_name not in dynamic_resources_update:
                dynamic_resources_update[resource_name] = 0.0
        self.dynamic_resources_by_ip[ip] = dynamic_resources_update

        now = time.time()
        # Back-date "last used" by the reported idle duration.
        self.ray_nodes_last_used_time_by_ip[ip] = now - node_idle_duration_s
        self.last_heartbeat_time_by_ip[ip] = now
        self.waiting_bundles = waiting_bundles
        self.infeasible_bundles = infeasible_bundles
        self.pending_placement_groups = pending_placement_groups

    def mark_active(self, ip):
        """Treat a freshly-set-up node as having just heartbeated."""
        assert ip is not None, "IP should be known at this time"
        logger.debug("Node {} is newly setup, treating as active".format(ip))
        self.last_heartbeat_time_by_ip[ip] = time.time()

    def is_active(self, ip):
        """True iff a heartbeat has been recorded for ``ip``."""
        return ip in self.last_heartbeat_time_by_ip

    def prune_active_ips(self, active_ips: List[str]):
        """The Raylet ips stored by LoadMetrics are obtained by polling
        the GCS in Monitor.update_load_metrics().

        On the other hand, the autoscaler gets a list of node ips from
        its NodeProvider.

        This method removes from LoadMetrics the ips unknown to the autoscaler.

        Args:
            active_ips (List[str]): The node ips known to the autoscaler.
        """
        active_ips = set(active_ips)

        def prune(mapping, should_log):
            # Drop every entry whose ip is not in active_ips.
            unwanted_ips = set(mapping) - active_ips
            for unwanted_ip in unwanted_ips:
                if should_log:
                    logger.info("LoadMetrics: " f"Removed ip: {unwanted_ip}.")
                del mapping[unwanted_ip]
            if unwanted_ips and should_log:
                logger.info(
                    "LoadMetrics: "
                    "Removed {} stale ip mappings: {} not in {}".format(
                        len(unwanted_ips), unwanted_ips, active_ips
                    )
                )
            # Sanity check: everything flagged was actually removed.
            assert not (unwanted_ips & set(mapping))

        # Only the first map logs, to avoid repeating the same message
        # for each of the parallel per-ip maps.
        prune(self.ray_nodes_last_used_time_by_ip, should_log=True)
        prune(self.static_resources_by_ip, should_log=False)
        prune(self.raylet_id_by_ip, should_log=False)
        prune(self.dynamic_resources_by_ip, should_log=False)
        prune(self.last_heartbeat_time_by_ip, should_log=False)

    def get_node_resources(self):
        """Return a list of node resources (static resource sizes).

        Example:
            >>> from ray.autoscaler._private.load_metrics import LoadMetrics
            >>> metrics = LoadMetrics(...) # doctest: +SKIP
            >>> metrics.get_node_resources() # doctest: +SKIP
            [{"CPU": 1}, {"CPU": 4, "GPU": 8}] # for two different nodes
        """
        return self.static_resources_by_ip.values()

    def get_static_node_resources_by_ip(self) -> Dict[NodeIP, ResourceDict]:
        """Return a dict of node resources for every node ip.

        Example:
            >>> from ray.autoscaler._private.load_metrics import LoadMetrics
            >>> metrics = LoadMetrics(...) # doctest: +SKIP
            >>> metrics.get_static_node_resources_by_ip() # doctest: +SKIP
            {127.0.0.1: {"CPU": 1}, 127.0.0.2: {"CPU": 4, "GPU": 8}}
        """
        return self.static_resources_by_ip

    def get_resource_utilization(self):
        """Return the ip -> available-resources map."""
        return self.dynamic_resources_by_ip

    def _get_resource_usage(self):
        """Aggregate (used, total) per resource across all nodes.

        Returns:
            Tuple of two dicts: resource name -> used amount, and
            resource name -> total amount.
        """
        resources_used = {}
        resources_total = {}
        for ip, max_resources in self.static_resources_by_ip.items():
            avail_resources = self.dynamic_resources_by_ip[ip]
            for resource_id, amount in max_resources.items():
                used = amount - avail_resources[resource_id]
                if resource_id not in resources_used:
                    resources_used[resource_id] = 0.0
                    resources_total[resource_id] = 0.0
                resources_used[resource_id] += used
                resources_total[resource_id] += amount
                # NOTE(review): this reassignment has no effect here — `used`
                # is not read again after the accumulation above. Possibly it
                # was meant to clamp `used` *before* adding; confirm intent.
                used = max(0, used)

        return resources_used, resources_total

    def get_resource_demand_vector(self, clip=True):
        """Return the queued + infeasible resource bundles.

        Args:
            clip: If True, bound each list to
                AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE entries.
        """
        if clip:
            # Bound the total number of bundles to
            # 2xMAX_RESOURCE_DEMAND_VECTOR_SIZE. This guarantees the resource
            # demand scheduler bin packing algorithm takes a reasonable amount
            # of time to run.
            return (
                self.waiting_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
                + self.infeasible_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE]
            )
        else:
            return self.waiting_bundles + self.infeasible_bundles

    def get_resource_requests(self):
        """Return demands set via set_resource_requests()."""
        return self.resource_requests

    def get_pending_placement_groups(self):
        """Return the pending placement group table entries."""
        return self.pending_placement_groups

    def resources_avail_summary(self) -> str:
        """Return a concise string of cluster size to report to event logs.

        For example, "3 CPUs, 4 GPUs".
        """
        total_resources = (
            reduce(add_resources, self.static_resources_by_ip.values())
            if self.static_resources_by_ip
            else {}
        )
        out = "{} CPUs".format(int(total_resources.get("CPU", 0)))
        if "GPU" in total_resources:
            out += ", {} GPUs".format(int(total_resources["GPU"]))
        if "TPU" in total_resources:
            out += ", {} TPUs".format(int(total_resources["TPU"]))
        return out

    def summary(self):
        """Build a LoadMetricsSummary snapshot of usage and demand."""
        available_resources = (
            reduce(add_resources, self.dynamic_resources_by_ip.values())
            if self.dynamic_resources_by_ip
            else {}
        )
        total_resources = (
            reduce(add_resources, self.static_resources_by_ip.values())
            if self.static_resources_by_ip
            else {}
        )
        usage_dict = {}
        for key in total_resources:
            if key in ["memory", "object_store_memory"]:
                total = total_resources[key]
                # NOTE(review): direct indexing assumes memory keys always
                # appear in available_resources when present in totals —
                # confirm against the heartbeat producer.
                available = available_resources[key]
                usage_dict[key] = (total - available, total)
            else:
                total = total_resources[key]
                usage_dict[key] = (total - available_resources[key], total)

        summarized_demand_vector = freq_of_dicts(
            self.get_resource_demand_vector(clip=False)
        )
        summarized_resource_requests = freq_of_dicts(self.get_resource_requests())

        def placement_group_serializer(pg):
            # Hashable key: per-bundle resource sets plus the PG strategy.
            bundles = tuple(
                frozenset(bundle.unit_resources.items()) for bundle in pg.bundles
            )
            return (bundles, pg.strategy)

        def placement_group_deserializer(pg_tuple):
            # We marshal this as a dictionary so that we can easily json.dumps
            # it later.
            # TODO (Alex): Would there be a benefit to properly
            # marshalling this (into a protobuf)?
            bundles = list(map(dict, pg_tuple[0]))
            return {
                "bundles": freq_of_dicts(bundles),
                "strategy": PlacementStrategy.Name(pg_tuple[1]),
            }

        summarized_placement_groups = freq_of_dicts(
            self.get_pending_placement_groups(),
            serializer=placement_group_serializer,
            deserializer=placement_group_deserializer,
        )
        nodes_summary = freq_of_dicts(self.static_resources_by_ip.values())

        usage_by_node = None
        if AUTOSCALER_REPORT_PER_NODE_STATUS:
            # Optional per-node (used, total) breakdown.
            usage_by_node = {}
            for ip, totals in self.static_resources_by_ip.items():
                available = self.dynamic_resources_by_ip.get(ip, {})
                usage_by_node[ip] = {}
                for resource, total in totals.items():
                    usage_by_node[ip][resource] = (
                        total - available.get(resource, 0),
                        total,
                    )

        return LoadMetricsSummary(
            usage=usage_dict,
            resource_demand=summarized_demand_vector,
            pg_demand=summarized_placement_groups,
            request_demand=summarized_resource_requests,
            node_types=nodes_summary,
            usage_by_node=usage_by_node,
        )

    def set_resource_requests(self, requested_resources):
        """Replace the explicit resource requests, dropping empty dicts."""
        if requested_resources is not None:
            assert isinstance(requested_resources, list), requested_resources
            self.resource_requests = [
                request for request in requested_resources if len(request) > 0
            ]

    def info_string(self):
        """Return a multi-line, human-readable dump of _info()."""
        return " - " + "\n - ".join(
            ["{}: {}".format(k, v) for k, v in sorted(self._info().items())]
        )

    def _info(self):
        """Collect usage, idleness, and heartbeat-lag stats as a dict."""
        resources_used, resources_total = self._get_resource_usage()

        now = time.time()
        idle_times = [now - t for t in self.ray_nodes_last_used_time_by_ip.values()]
        heartbeat_times = [now - t for t in self.last_heartbeat_time_by_ip.values()]
        # Up to five nodes with the oldest heartbeats, as ip -> lag seconds.
        most_delayed_heartbeats = sorted(
            self.last_heartbeat_time_by_ip.items(), key=lambda pair: pair[1]
        )[:5]
        most_delayed_heartbeats = {ip: (now - t) for ip, t in most_delayed_heartbeats}

        def format_resource(key, value):
            # Memory-like resources are reported in GiB; others as raw counts.
            if key in ["object_store_memory", "memory"]:
                return "{} GiB".format(round(value / (1024 * 1024 * 1024), 2))
            else:
                return round(value, 2)

        return {
            "ResourceUsage": ", ".join(
                [
                    "{}/{} {}".format(
                        format_resource(rid, resources_used[rid]),
                        format_resource(rid, resources_total[rid]),
                        rid,
                    )
                    for rid in sorted(resources_used)
                    if not rid.startswith("node:")
                ]
            ),
            "NodeIdleSeconds": "Min={} Mean={} Max={}".format(
                int(min(idle_times)) if idle_times else -1,
                int(float(sum(idle_times)) / len(idle_times)) if idle_times else -1,
                int(max(idle_times)) if idle_times else -1,
            ),
            "TimeSinceLastHeartbeat": "Min={} Mean={} Max={}".format(
                int(min(heartbeat_times)) if heartbeat_times else -1,
                int(float(sum(heartbeat_times)) / len(heartbeat_times))
                if heartbeat_times
                else -1,
                int(max(heartbeat_times)) if heartbeat_times else -1,
            ),
            "MostDelayedHeartbeats": most_delayed_heartbeats,
        }
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/loader.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+
3
+
4
def load_function_or_class(path):
    """Load a function or class at runtime given a full path.

    Example of the path: mypkg.mysubpkg.myclass
    """
    parts = path.split(".")
    if len(parts) < 2:
        raise ValueError("You need to pass a valid path like mymodule.provider_class")
    module = importlib.import_module(".".join(parts[:-1]))
    return getattr(module, parts[-1])
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/config.cpython-311.pyc ADDED
Binary file (5.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc ADDED
Binary file (5.85 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc ADDED
Binary file (17.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+ from ray._private.utils import get_ray_temp_dir
6
+ from ray.autoscaler._private.cli_logger import cli_logger
7
+
8
+ unsupported_field_message = "The field {} is not supported for on-premise clusters."
9
+
10
+ LOCAL_CLUSTER_NODE_TYPE = "local.cluster.node"
11
+
12
+
13
def prepare_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """Prepare a local (on-premise) cluster config for the cluster
    launcher and autoscaler.

    Rejects fields that have no meaning on-prem, installs the single
    local node type, then delegates to the coordinator or manual path.
    """
    config = copy.deepcopy(config)
    # These fields are meaningless for on-premise clusters; abort if set.
    for unsupported in ("head_node", "worker_nodes", "available_node_types"):
        if config.get(unsupported):
            cli_logger.abort(unsupported_field_message.format(unsupported))
    # We use a config with a single node type for on-prem clusters.
    # Resources internally detected by Ray are not overridden by the
    # autoscaler (see NodeProvider.do_update).
    config["available_node_types"] = {
        LOCAL_CLUSTER_NODE_TYPE: {"node_config": {}, "resources": {}}
    }
    config["head_node_type"] = LOCAL_CLUSTER_NODE_TYPE
    # Coordinator-managed vs. manually managed clusters diverge here.
    preparer = (
        prepare_coordinator
        if "coordinator_address" in config["provider"]
        else prepare_manual
    )
    return preparer(config)
35
+
36
+
37
def prepare_coordinator(config: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a coordinator-managed on-prem cluster config.

    Requires an explicit `max_workers` and moves the (now node-type
    scoped) `min_workers` into the single local node type.
    """
    config = copy.deepcopy(config)
    # The coordinator allocates nodes on demand, so the user must cap it.
    if "max_workers" not in config:
        cli_logger.abort(
            "The field `max_workers` is required when using an "
            "automatically managed on-premise cluster."
        )
    local_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # The autoscaler no longer uses a global `min_workers`; relocate it
    # into the node type (defaulting to 0).
    local_type["min_workers"] = config.pop("min_workers", 0)
    local_type["max_workers"] = config["max_workers"]
    return config
52
+
53
+
54
def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and normalize configs of manually managed on-prem clusters.

    - Requires `head_ip` and `worker_ips` in the provider section.
    - Defaults min and max workers to the number of `worker_ips`.
    - Caps min and max workers at the number of `worker_ips`, warning
      when a user-supplied value is adjusted.
    - Records the final min/max in the single local worker node type.
    """
    config = copy.deepcopy(config)
    if ("worker_ips" not in config["provider"]) or (
        "head_ip" not in config["provider"]
    ):
        cli_logger.abort(
            "Please supply a `head_ip` and list of `worker_ips`. "
            "Alternatively, supply a `coordinator_address`."
        )
    num_ips = len(config["provider"]["worker_ips"])
    node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE]
    # Default to keeping all provided ips in the cluster.
    config.setdefault("max_workers", num_ips)

    # The autoscaler no longer uses a global `min_workers`; it lives on
    # the node type from here on.
    min_workers = config.pop("min_workers", num_ips)
    max_workers = config["max_workers"]

    if min_workers > num_ips:
        cli_logger.warning(
            f"The value of `min_workers` supplied ({min_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `min_workers={num_ips}`."
        )
    node_type["min_workers"] = min(min_workers, num_ips)

    if max_workers > num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is greater"
            f" than the number of available worker ips ({num_ips})."
            f" Setting `max_workers={num_ips}`."
        )
        config["max_workers"] = num_ips
    node_type["max_workers"] = min(max_workers, num_ips)

    if max_workers < num_ips:
        cli_logger.warning(
            f"The value of `max_workers` supplied ({max_workers}) is less"
            f" than the number of available worker ips ({num_ips})."
            f" At most {max_workers} Ray worker nodes will connect to the cluster."
        )

    return config
110
+
111
+
112
def get_lock_path(cluster_name: str) -> str:
    """Path of the inter-process lock file for this cluster's state."""
    return os.path.join(get_ray_temp_dir(), f"cluster-{cluster_name}.lock")
114
+
115
+
116
def get_state_path(cluster_name: str) -> str:
    """Path of the JSON state file recording this cluster's nodes."""
    return os.path.join(get_ray_temp_dir(), f"cluster-{cluster_name}.state")
118
+
119
+
120
def bootstrap_local(config: Dict[str, Any]) -> Dict[str, Any]:
    """No-op bootstrap hook: local clusters need no extra config filling."""
    return config
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from http.client import RemoteDisconnected
4
+
5
+ from ray.autoscaler.node_provider import NodeProvider
6
+ from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class CoordinatorSenderNodeProvider(NodeProvider):
    """NodeProvider for automatically managed private/local clusters.

    The cluster management is handled by a remote coordinating server.
    The server listens on <coordinator_address>, therefore, the address
    should be provided in the provider section in the cluster config.
    The server receives HTTP requests from this class and uses
    LocalNodeProvider to get their responses.
    """

    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        # "host:port" of the coordinator server that executes node ops.
        self.coordinator_address = provider_config["coordinator_address"]

    def _get_http_response(self, request):
        """Send one JSON RPC to the coordinator and return the decoded JSON.

        ``request`` is a dict of the form {"type": <method name>,
        "args": <positional args tuple>}.
        """
        headers = {
            "Content-Type": "application/json",
        }
        request_message = json.dumps(request).encode()
        http_coordinator_address = "http://" + self.coordinator_address

        try:
            import requests  # `requests` is not part of stdlib.
            from requests.exceptions import ConnectionError

            # NOTE: a GET carrying a body — the coordinator server reads
            # the JSON payload regardless of HTTP verb.
            r = requests.get(
                http_coordinator_address,
                data=request_message,
                headers=headers,
                timeout=None,
            )
        except (RemoteDisconnected, ConnectionError):
            logger.exception(
                "Could not connect to: "
                + http_coordinator_address
                + ". Did you run python coordinator_server.py"
                + " --ips <list_of_node_ips> --port <PORT>?"
            )
            raise
        except ImportError:
            logger.exception(
                "Not all Ray Autoscaler dependencies were found. "
                "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will "
                'only be usable via `pip install "ray[default]"`. Please '
                "update your install command."
            )
            raise

        response = r.json()
        return response

    def non_terminated_nodes(self, tag_filters):
        """Return ids of live nodes matching ``tag_filters`` in this cluster."""
        # Only get the non terminated nodes associated with this cluster name.
        tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {"type": "non_terminated_nodes", "args": (tag_filters,)}
        return self._get_http_response(request)

    def is_running(self, node_id):
        """Return whether ``node_id`` is running, per the coordinator."""
        request = {"type": "is_running", "args": (node_id,)}
        return self._get_http_response(request)

    def is_terminated(self, node_id):
        """Return whether ``node_id`` is terminated, per the coordinator."""
        request = {"type": "is_terminated", "args": (node_id,)}
        return self._get_http_response(request)

    def node_tags(self, node_id):
        """Return the tag dict of ``node_id``."""
        request = {"type": "node_tags", "args": (node_id,)}
        return self._get_http_response(request)

    def external_ip(self, node_id):
        """Return the external ip of ``node_id``."""
        request = {"type": "external_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def internal_ip(self, node_id):
        """Return the internal ip of ``node_id``."""
        request = {"type": "internal_ip", "args": (node_id,)}
        response = self._get_http_response(request)
        return response

    def create_node(self, node_config, tags, count):
        """Ask the coordinator to allocate ``count`` nodes (returns None)."""
        # Tag the newly created node with this cluster name. Helps to get
        # the right nodes when calling non_terminated_nodes.
        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        request = {
            "type": "create_node",
            "args": (node_config, tags, count),
        }
        self._get_http_response(request)

    def set_node_tags(self, node_id, tags):
        """Update the tags of ``node_id`` on the coordinator."""
        request = {"type": "set_node_tags", "args": (node_id, tags)}
        self._get_http_response(request)

    def terminate_node(self, node_id):
        """Release ``node_id`` back to the coordinator's pool."""
        request = {"type": "terminate_node", "args": (node_id,)}
        self._get_http_response(request)

    def terminate_nodes(self, node_ids):
        """Release multiple nodes back to the coordinator's pool."""
        request = {"type": "terminate_nodes", "args": (node_ids,)}
        self._get_http_response(request)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import socket
5
+ from threading import RLock
6
+
7
+ from filelock import FileLock
8
+
9
+ from ray.autoscaler._private.local.config import (
10
+ LOCAL_CLUSTER_NODE_TYPE,
11
+ bootstrap_local,
12
+ get_lock_path,
13
+ get_state_path,
14
+ )
15
+ from ray.autoscaler.node_provider import NodeProvider
16
+ from ray.autoscaler.tags import (
17
+ NODE_KIND_HEAD,
18
+ NODE_KIND_WORKER,
19
+ STATUS_UP_TO_DATE,
20
+ TAG_RAY_NODE_KIND,
21
+ TAG_RAY_NODE_NAME,
22
+ TAG_RAY_NODE_STATUS,
23
+ TAG_RAY_USER_NODE_TYPE,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ filelock_logger = logging.getLogger("filelock")
29
+ filelock_logger.setLevel(logging.WARNING)
30
+
31
+
32
class ClusterState:
    """Cluster-specific node state persisted as JSON on disk.

    The state maps node ip -> {"tags": ..., "state": ...} and is guarded
    by an in-process RLock plus a cross-process FileLock, since multiple
    autoscaler/CLI processes may touch the same cluster.
    """

    def __init__(self, lock_path, save_path, provider_config):
        # Thread-level guard; the FileLock below guards across processes.
        self.lock = RLock()
        os.makedirs(os.path.dirname(lock_path), exist_ok=True)
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                if os.path.exists(self.save_path):
                    # NOTE(review): file handle from open() is not explicitly
                    # closed here (relies on GC); same in get() below.
                    workers = json.loads(open(self.save_path).read())
                    head_config = workers.get(provider_config["head_ip"])
                    # If the configured head ip no longer matches the
                    # recorded head node, the saved state is stale.
                    if (
                        not head_config
                        or head_config.get("tags", {}).get(TAG_RAY_NODE_KIND)
                        != NODE_KIND_HEAD
                    ):
                        workers = {}
                        logger.info("Head IP changed - recreating cluster.")
                else:
                    workers = {}
                logger.info(
                    "ClusterState: Loaded cluster state: {}".format(list(workers))
                )
                # Ensure every configured worker ip has an entry; new ips
                # start out "terminated".
                for worker_ip in provider_config["worker_ips"]:
                    if worker_ip not in workers:
                        workers[worker_ip] = {
                            "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER},
                            "state": "terminated",
                        }
                    else:
                        assert (
                            workers[worker_ip]["tags"][TAG_RAY_NODE_KIND]
                            == NODE_KIND_WORKER
                        )
                if provider_config["head_ip"] not in workers:
                    workers[provider_config["head_ip"]] = {
                        "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD},
                        "state": "terminated",
                    }
                else:
                    assert (
                        workers[provider_config["head_ip"]]["tags"][TAG_RAY_NODE_KIND]
                        == NODE_KIND_HEAD
                    )
                # Relevant when a user reduces the number of workers
                # without changing the headnode.
                list_of_node_ips = list(provider_config["worker_ips"])
                list_of_node_ips.append(provider_config["head_ip"])
                for worker_ip in list(workers):
                    if worker_ip not in list_of_node_ips:
                        del workers[worker_ip]

                # Set external head ip, if provided by user.
                # Necessary if calling `ray up` from outside the network.
                # Refer to LocalNodeProvider.external_ip function.
                external_head_ip = provider_config.get("external_head_ip")
                if external_head_ip:
                    head = workers[provider_config["head_ip"]]
                    head["external_ip"] = external_head_ip

                # Exactly the configured workers plus the head remain.
                assert len(workers) == len(provider_config["worker_ips"]) + 1
                with open(self.save_path, "w") as f:
                    logger.debug(
                        "ClusterState: Writing cluster state: {}".format(workers)
                    )
                    f.write(json.dumps(workers))

    def get(self):
        """Return the current {ip: info} mapping, re-read from disk."""
        with self.lock:
            with self.file_lock:
                workers = json.loads(open(self.save_path).read())
                return workers

    def put(self, worker_id, info):
        """Persist ``info`` (must contain "tags" and "state") for one node."""
        assert "tags" in info
        assert "state" in info
        with self.lock:
            with self.file_lock:
                workers = self.get()
                workers[worker_id] = info
                with open(self.save_path, "w") as f:
                    logger.info(
                        "ClusterState: "
                        "Writing cluster state: {}".format(list(workers))
                    )
                    f.write(json.dumps(workers))
119
+
120
+
121
class OnPremCoordinatorState(ClusterState):
    """Generates & updates the state file of CoordinatorSenderNodeProvider.

    Unlike ClusterState, which generates a cluster specific file with
    predefined head and worker ips, OnPremCoordinatorState overwrites
    ClusterState's __init__ function to generate and manage a unified
    file of the status of all the nodes for multiple clusters.
    """

    def __init__(self, lock_path, save_path, list_of_node_ips):
        # Same locking scheme as ClusterState: thread RLock + FileLock.
        self.lock = RLock()
        self.file_lock = FileLock(lock_path)
        self.save_path = save_path

        with self.lock:
            with self.file_lock:
                if os.path.exists(self.save_path):
                    # NOTE(review): handle from open() is not explicitly
                    # closed (relies on GC), mirroring ClusterState.
                    nodes = json.loads(open(self.save_path).read())
                else:
                    nodes = {}
                logger.info(
                    "OnPremCoordinatorState: "
                    "Loaded on prem coordinator state: {}".format(nodes)
                )

                # Filter removed node ips.
                for node_ip in list(nodes):
                    if node_ip not in list_of_node_ips:
                        del nodes[node_ip]

                # New ips enter the pool untagged and "terminated"
                # (i.e. free for any cluster to claim).
                for node_ip in list_of_node_ips:
                    if node_ip not in nodes:
                        nodes[node_ip] = {
                            "tags": {},
                            "state": "terminated",
                        }
                assert len(nodes) == len(list_of_node_ips)
                with open(self.save_path, "w") as f:
                    logger.info(
                        "OnPremCoordinatorState: "
                        "Writing on prem coordinator state: {}".format(nodes)
                    )
                    f.write(json.dumps(nodes))
164
+
165
+
166
+ class LocalNodeProvider(NodeProvider):
167
+ """NodeProvider for private/local clusters.
168
+
169
+ `node_id` is overloaded to also be `node_ip` in this class.
170
+
171
+ When `cluster_name` is provided, it manages a single cluster in a cluster
172
+ specific state file. But when `cluster_name` is None, it manages multiple
173
+ clusters in a unified state file that requires each node to be tagged with
174
+ TAG_RAY_CLUSTER_NAME in create and non_terminated_nodes function calls to
175
+ associate each node with the right cluster.
176
+
177
+ The current use case of managing multiple clusters is by
178
+ OnPremCoordinatorServer which receives node provider HTTP requests
179
+ from CoordinatorSenderNodeProvider and uses LocalNodeProvider to get
180
+ the responses.
181
+ """
182
+
183
+ def __init__(self, provider_config, cluster_name):
184
+ NodeProvider.__init__(self, provider_config, cluster_name)
185
+
186
+ if cluster_name:
187
+ lock_path = get_lock_path(cluster_name)
188
+ state_path = get_state_path(cluster_name)
189
+ self.state = ClusterState(
190
+ lock_path,
191
+ state_path,
192
+ provider_config,
193
+ )
194
+ self.use_coordinator = False
195
+ else:
196
+ # LocalNodeProvider with a coordinator server.
197
+ self.state = OnPremCoordinatorState(
198
+ "/tmp/coordinator.lock",
199
+ "/tmp/coordinator.state",
200
+ provider_config["list_of_node_ips"],
201
+ )
202
+ self.use_coordinator = True
203
+
204
+ def non_terminated_nodes(self, tag_filters):
205
+ workers = self.state.get()
206
+ matching_ips = []
207
+ for worker_ip, info in workers.items():
208
+ if info["state"] == "terminated":
209
+ continue
210
+ ok = True
211
+ for k, v in tag_filters.items():
212
+ if info["tags"].get(k) != v:
213
+ ok = False
214
+ break
215
+ if ok:
216
+ matching_ips.append(worker_ip)
217
+ return matching_ips
218
+
219
+ def is_running(self, node_id):
220
+ return self.state.get()[node_id]["state"] == "running"
221
+
222
+ def is_terminated(self, node_id):
223
+ return not self.is_running(node_id)
224
+
225
+ def node_tags(self, node_id):
226
+ return self.state.get()[node_id]["tags"]
227
+
228
+ def external_ip(self, node_id):
229
+ """Returns an external ip if the user has supplied one.
230
+ Otherwise, use the same logic as internal_ip below.
231
+
232
+ This can be used to call ray up from outside the network, for example
233
+ if the Ray cluster exists in an AWS VPC and we're interacting with
234
+ the cluster from a laptop (where using an internal_ip will not work).
235
+
236
+ Useful for debugging the local node provider with cloud VMs."""
237
+
238
+ node_state = self.state.get()[node_id]
239
+ ext_ip = node_state.get("external_ip")
240
+ if ext_ip:
241
+ return ext_ip
242
+ else:
243
+ return socket.gethostbyname(node_id)
244
+
245
+ def internal_ip(self, node_id):
246
+ return socket.gethostbyname(node_id)
247
+
248
+ def set_node_tags(self, node_id, tags):
249
+ with self.state.file_lock:
250
+ info = self.state.get()[node_id]
251
+ info["tags"].update(tags)
252
+ self.state.put(node_id, info)
253
+
254
+ def create_node(self, node_config, tags, count):
255
+ """Creates min(count, currently available) nodes."""
256
+ node_type = tags[TAG_RAY_NODE_KIND]
257
+ with self.state.file_lock:
258
+ workers = self.state.get()
259
+ for node_id, info in workers.items():
260
+ if info["state"] == "terminated" and (
261
+ self.use_coordinator or info["tags"][TAG_RAY_NODE_KIND] == node_type
262
+ ):
263
+ info["tags"] = tags
264
+ info["state"] = "running"
265
+ self.state.put(node_id, info)
266
+ count = count - 1
267
+ if count == 0:
268
+ return
269
+
270
+ def terminate_node(self, node_id):
271
+ workers = self.state.get()
272
+ info = workers[node_id]
273
+ info["state"] = "terminated"
274
+ self.state.put(node_id, info)
275
+
276
+ @staticmethod
277
+ def bootstrap_config(cluster_config):
278
+ return bootstrap_local(cluster_config)
279
+
280
+
281
+ def record_local_head_state_if_needed(local_provider: LocalNodeProvider) -> None:
282
+ """This function is called on the Ray head from StandardAutoscaler.reset
283
+ to record the head node's own existence in the cluster state file.
284
+
285
+ This is necessary because `provider.create_node` in
286
+ `commands.get_or_create_head_node` records the head state on the
287
+ cluster-launching machine but not on the head.
288
+ """
289
+ head_ip = local_provider.provider_config["head_ip"]
290
+ cluster_name = local_provider.cluster_name
291
+ # If the head node is not marked as created in the cluster state file,
292
+ if head_ip not in local_provider.non_terminated_nodes({}):
293
+ # These tags are based on the ones in commands.get_or_create_head_node;
294
+ # keep in sync.
295
+ head_tags = {
296
+ TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
297
+ TAG_RAY_USER_NODE_TYPE: LOCAL_CLUSTER_NODE_TYPE,
298
+ TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name),
299
+ TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE,
300
+ }
301
+ # Mark the head node as created in the cluster state file.
302
+ local_provider.create_node(node_config={}, tags=head_tags, count=1)
303
+
304
+ assert head_ip in local_provider.non_terminated_nodes({})
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+
4
+ from ray.autoscaler._private.cli_logger import cli_logger
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class LogTimer:
10
+ def __init__(self, message, show_status=False):
11
+ self._message = message
12
+ self._show_status = show_status
13
+
14
+ def __enter__(self):
15
+ self._start_time = datetime.datetime.utcnow()
16
+
17
+ def __exit__(self, *error_vals):
18
+ if cli_logger.log_style != "record":
19
+ return
20
+
21
+ td = datetime.datetime.utcnow() - self._start_time
22
+ status = ""
23
+ if self._show_status:
24
+ status = "failed" if any(error_vals) else "succeeded"
25
+ cli_logger.print(
26
+ " ".join(
27
+ [
28
+ self._message,
29
+ status,
30
+ "[LogTimer={:.0f}ms]".format(td.total_seconds() * 1000),
31
+ ]
32
+ )
33
+ )
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py ADDED
@@ -0,0 +1,719 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Autoscaler monitoring loop daemon."""
2
+
3
+ import argparse
4
+ import json
5
+ import logging
6
+ import os
7
+ import signal
8
+ import sys
9
+ import time
10
+ import traceback
11
+ from collections import Counter
12
+ from dataclasses import asdict
13
+ from typing import Any, Callable, Dict, Optional, Union
14
+
15
+ import ray
16
+ import ray._private.ray_constants as ray_constants
17
+ import ray._private.utils
18
+ from ray._private.event.event_logger import get_event_logger
19
+ from ray._private.ray_logging import setup_component_logger
20
+ from ray._raylet import GcsClient
21
+ from ray.autoscaler._private.autoscaler import StandardAutoscaler
22
+ from ray.autoscaler._private.commands import teardown_cluster
23
+ from ray.autoscaler._private.constants import (
24
+ AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE,
25
+ AUTOSCALER_METRIC_PORT,
26
+ AUTOSCALER_UPDATE_INTERVAL_S,
27
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY,
28
+ )
29
+ from ray.autoscaler._private.event_summarizer import EventSummarizer
30
+ from ray.autoscaler._private.load_metrics import LoadMetrics
31
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
32
+ from ray.autoscaler._private.util import format_readonly_node_type
33
+ from ray.autoscaler.v2.sdk import get_cluster_resource_state
34
+ from ray.core.generated import gcs_pb2
35
+ from ray.core.generated.event_pb2 import Event as RayEvent
36
+ from ray.experimental.internal_kv import (
37
+ _initialize_internal_kv,
38
+ _internal_kv_del,
39
+ _internal_kv_get,
40
+ _internal_kv_initialized,
41
+ _internal_kv_put,
42
+ )
43
+
44
+ try:
45
+ import prometheus_client
46
+ except ImportError:
47
+ prometheus_client = None
48
+
49
+
50
+ logger = logging.getLogger(__name__)
51
+
52
+
53
+ def parse_resource_demands(resource_load_by_shape):
54
+ """Handle the message.resource_load_by_shape protobuf for the demand
55
+ based autoscaling. Catch and log all exceptions so this doesn't
56
+ interfere with the utilization based autoscaler until we're confident
57
+ this is stable. Worker queue backlogs are added to the appropriate
58
+ resource demand vector.
59
+
60
+ Args:
61
+ resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands
62
+ in protobuf form or None.
63
+
64
+ Returns:
65
+ List[ResourceDict]: Waiting bundles (ready and feasible).
66
+ List[ResourceDict]: Infeasible bundles.
67
+ """
68
+ waiting_bundles, infeasible_bundles = [], []
69
+ try:
70
+ for resource_demand_pb in list(resource_load_by_shape.resource_demands):
71
+ request_shape = dict(resource_demand_pb.shape)
72
+ for _ in range(resource_demand_pb.num_ready_requests_queued):
73
+ waiting_bundles.append(request_shape)
74
+ for _ in range(resource_demand_pb.num_infeasible_requests_queued):
75
+ infeasible_bundles.append(request_shape)
76
+
77
+ # Infeasible and ready states for tasks are (logically)
78
+ # mutually exclusive.
79
+ if resource_demand_pb.num_infeasible_requests_queued > 0:
80
+ backlog_queue = infeasible_bundles
81
+ else:
82
+ backlog_queue = waiting_bundles
83
+ for _ in range(resource_demand_pb.backlog_size):
84
+ backlog_queue.append(request_shape)
85
+ if (
86
+ len(waiting_bundles + infeasible_bundles)
87
+ > AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE
88
+ ):
89
+ break
90
+ except Exception:
91
+ logger.exception("Failed to parse resource demands.")
92
+
93
+ return waiting_bundles, infeasible_bundles
94
+
95
+
96
+ # Readonly provider config (e.g., for laptop mode, manually setup clusters).
97
+ BASE_READONLY_CONFIG = {
98
+ "cluster_name": "default",
99
+ "max_workers": 0,
100
+ "upscaling_speed": 1.0,
101
+ "docker": {},
102
+ "idle_timeout_minutes": 0,
103
+ "provider": {
104
+ "type": "readonly",
105
+ "use_node_id_as_ip": True, # For emulated multi-node on laptop.
106
+ DISABLE_LAUNCH_CONFIG_CHECK_KEY: True, # No launch check.
107
+ },
108
+ "auth": {},
109
+ "available_node_types": {
110
+ "ray.head.default": {"resources": {}, "node_config": {}, "max_workers": 0}
111
+ },
112
+ "head_node_type": "ray.head.default",
113
+ "file_mounts": {},
114
+ "cluster_synced_files": [],
115
+ "file_mounts_sync_continuously": False,
116
+ "rsync_exclude": [],
117
+ "rsync_filter": [],
118
+ "initialization_commands": [],
119
+ "setup_commands": [],
120
+ "head_setup_commands": [],
121
+ "worker_setup_commands": [],
122
+ "head_start_ray_commands": [],
123
+ "worker_start_ray_commands": [],
124
+ }
125
+
126
+
127
+ class Monitor:
128
+ """Autoscaling monitor.
129
+
130
+ This process periodically collects stats from the GCS and triggers
131
+ autoscaler updates.
132
+ """
133
+
134
+ def __init__(
135
+ self,
136
+ address: str,
137
+ autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
138
+ log_dir: str = None,
139
+ prefix_cluster_info: bool = False,
140
+ monitor_ip: Optional[str] = None,
141
+ retry_on_failure: bool = True,
142
+ ):
143
+ self.gcs_address = address
144
+ worker = ray._private.worker.global_worker
145
+ # TODO: eventually plumb ClusterID through to here
146
+ self.gcs_client = GcsClient(address=self.gcs_address)
147
+
148
+ if monitor_ip:
149
+ monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
150
+ self.gcs_client.internal_kv_put(
151
+ b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
152
+ )
153
+ _initialize_internal_kv(self.gcs_client)
154
+ if monitor_ip:
155
+ monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
156
+ self.gcs_client.internal_kv_put(
157
+ b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
158
+ )
159
+ self._session_name = self.get_session_name(self.gcs_client)
160
+ logger.info(f"session_name: {self._session_name}")
161
+ worker.mode = 0
162
+ head_node_ip = self.gcs_address.split(":")[0]
163
+
164
+ self.load_metrics = LoadMetrics()
165
+ self.last_avail_resources = None
166
+ self.event_summarizer = EventSummarizer()
167
+ self.prefix_cluster_info = prefix_cluster_info
168
+ self.retry_on_failure = retry_on_failure
169
+ self.autoscaling_config = autoscaling_config
170
+ self.autoscaler = None
171
+ # If set, we are in a manually created cluster (non-autoscaling) and
172
+ # simply mirroring what the GCS tells us the cluster node types are.
173
+ self.readonly_config = None
174
+
175
+ if log_dir:
176
+ try:
177
+ self.event_logger = get_event_logger(
178
+ RayEvent.SourceType.AUTOSCALER, log_dir
179
+ )
180
+ except Exception:
181
+ self.event_logger = None
182
+ else:
183
+ self.event_logger = None
184
+
185
+ self.prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name)
186
+
187
+ if monitor_ip and prometheus_client:
188
+ # If monitor_ip wasn't passed in, then don't attempt to start the
189
+ # metric server to keep behavior identical to before metrics were
190
+ # introduced
191
+ try:
192
+ logger.info(
193
+ "Starting autoscaler metrics server on port {}".format(
194
+ AUTOSCALER_METRIC_PORT
195
+ )
196
+ )
197
+ kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {}
198
+ prometheus_client.start_http_server(
199
+ port=AUTOSCALER_METRIC_PORT,
200
+ registry=self.prom_metrics.registry,
201
+ **kwargs,
202
+ )
203
+
204
+ # Reset some gauges, since we don't know which labels have
205
+ # leaked if the autoscaler was restarted.
206
+ self.prom_metrics.pending_nodes.clear()
207
+ self.prom_metrics.active_nodes.clear()
208
+ except Exception:
209
+ logger.exception(
210
+ "An exception occurred while starting the metrics server."
211
+ )
212
+ elif not prometheus_client:
213
+ logger.warning(
214
+ "`prometheus_client` not found, so metrics will not be exported."
215
+ )
216
+
217
+ logger.info("Monitor: Started")
218
+
219
+ def _initialize_autoscaler(self):
220
+ if self.autoscaling_config:
221
+ autoscaling_config = self.autoscaling_config
222
+ else:
223
+ # This config mirrors the current setup of the manually created
224
+ # cluster. Each node gets its own unique node type.
225
+ self.readonly_config = BASE_READONLY_CONFIG
226
+
227
+ # Note that the "available_node_types" of the config can change.
228
+ def get_latest_readonly_config():
229
+ return self.readonly_config
230
+
231
+ autoscaling_config = get_latest_readonly_config
232
+ self.autoscaler = StandardAutoscaler(
233
+ autoscaling_config,
234
+ self.load_metrics,
235
+ self.gcs_client,
236
+ self._session_name,
237
+ prefix_cluster_info=self.prefix_cluster_info,
238
+ event_summarizer=self.event_summarizer,
239
+ prom_metrics=self.prom_metrics,
240
+ )
241
+
242
+ def update_load_metrics(self):
243
+ """Fetches resource usage data from GCS and updates load metrics."""
244
+
245
+ response = self.gcs_client.get_all_resource_usage(timeout=60)
246
+ resources_batch_data = response.resource_usage_data
247
+ log_resource_batch_data_if_desired(resources_batch_data)
248
+
249
+ # This is a workaround to get correct idle_duration_ms
250
+ # from "get_cluster_resource_state"
251
+ # ref: https://github.com/ray-project/ray/pull/48519#issuecomment-2481659346
252
+ cluster_resource_state = get_cluster_resource_state(self.gcs_client)
253
+ ray_node_states = cluster_resource_state.node_states
254
+ ray_nodes_idle_duration_ms_by_id = {
255
+ node.node_id: node.idle_duration_ms for node in ray_node_states
256
+ }
257
+
258
+ # Tell the readonly node provider what nodes to report.
259
+ if self.readonly_config:
260
+ new_nodes = []
261
+ for msg in list(resources_batch_data.batch):
262
+ node_id = msg.node_id.hex()
263
+ new_nodes.append((node_id, msg.node_manager_address))
264
+ self.autoscaler.provider._set_nodes(new_nodes)
265
+
266
+ mirror_node_types = {}
267
+ cluster_full = False
268
+ if (
269
+ hasattr(response, "cluster_full_of_actors_detected_by_gcs")
270
+ and response.cluster_full_of_actors_detected_by_gcs
271
+ ):
272
+ # GCS has detected the cluster full of actors.
273
+ cluster_full = True
274
+ for resource_message in resources_batch_data.batch:
275
+ node_id = resource_message.node_id
276
+ # Generate node type config based on GCS reported node list.
277
+ if self.readonly_config:
278
+ # Keep prefix in sync with ReadonlyNodeProvider.
279
+ node_type = format_readonly_node_type(node_id.hex())
280
+ resources = {}
281
+ for k, v in resource_message.resources_total.items():
282
+ resources[k] = v
283
+ mirror_node_types[node_type] = {
284
+ "resources": resources,
285
+ "node_config": {},
286
+ "max_workers": 1,
287
+ }
288
+ if (
289
+ hasattr(resource_message, "cluster_full_of_actors_detected")
290
+ and resource_message.cluster_full_of_actors_detected
291
+ ):
292
+ # A worker node has detected the cluster full of actors.
293
+ cluster_full = True
294
+ total_resources = dict(resource_message.resources_total)
295
+ available_resources = dict(resource_message.resources_available)
296
+
297
+ waiting_bundles, infeasible_bundles = parse_resource_demands(
298
+ resources_batch_data.resource_load_by_shape
299
+ )
300
+
301
+ pending_placement_groups = list(
302
+ resources_batch_data.placement_group_load.placement_group_data
303
+ )
304
+
305
+ use_node_id_as_ip = self.autoscaler is not None and self.autoscaler.config[
306
+ "provider"
307
+ ].get("use_node_id_as_ip", False)
308
+
309
+ # "use_node_id_as_ip" is a hack meant to address situations in
310
+ # which there's more than one Ray node residing at a given ip.
311
+ # TODO (Dmitri): Stop using ips as node identifiers.
312
+ # https://github.com/ray-project/ray/issues/19086
313
+ if use_node_id_as_ip:
314
+ peloton_id = total_resources.get("NODE_ID_AS_RESOURCE")
315
+ # Legacy support https://github.com/ray-project/ray/pull/17312
316
+ if peloton_id is not None:
317
+ ip = str(int(peloton_id))
318
+ else:
319
+ ip = node_id.hex()
320
+ else:
321
+ ip = resource_message.node_manager_address
322
+
323
+ idle_duration_s = 0.0
324
+ if node_id in ray_nodes_idle_duration_ms_by_id:
325
+ idle_duration_s = ray_nodes_idle_duration_ms_by_id[node_id] / 1000
326
+ else:
327
+ logger.warning(
328
+ f"node_id {node_id} not found in ray_nodes_idle_duration_ms_by_id"
329
+ )
330
+
331
+ self.load_metrics.update(
332
+ ip,
333
+ node_id,
334
+ total_resources,
335
+ available_resources,
336
+ idle_duration_s,
337
+ waiting_bundles,
338
+ infeasible_bundles,
339
+ pending_placement_groups,
340
+ cluster_full,
341
+ )
342
+ if self.readonly_config:
343
+ self.readonly_config["available_node_types"].update(mirror_node_types)
344
+
345
+ def get_session_name(self, gcs_client: GcsClient) -> Optional[str]:
346
+ """Obtain the session name from the GCS.
347
+
348
+ If the GCS doesn't respond, session name is considered None.
349
+ In this case, the metrics reported from the monitor won't have
350
+ the correct session name.
351
+ """
352
+ if not _internal_kv_initialized():
353
+ return None
354
+
355
+ session_name = gcs_client.internal_kv_get(
356
+ b"session_name",
357
+ ray_constants.KV_NAMESPACE_SESSION,
358
+ timeout=10,
359
+ )
360
+
361
+ if session_name:
362
+ session_name = session_name.decode()
363
+
364
+ return session_name
365
+
366
+ def update_resource_requests(self):
367
+ """Fetches resource requests from the internal KV and updates load."""
368
+ if not _internal_kv_initialized():
369
+ return
370
+ data = _internal_kv_get(
371
+ ray._private.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL
372
+ )
373
+ if data:
374
+ try:
375
+ resource_request = json.loads(data)
376
+ self.load_metrics.set_resource_requests(resource_request)
377
+ except Exception:
378
+ logger.exception("Error parsing resource requests")
379
+
380
+ def _run(self):
381
+ """Run the monitor loop."""
382
+
383
+ while True:
384
+ try:
385
+ gcs_request_start_time = time.time()
386
+ self.update_load_metrics()
387
+ gcs_request_time = time.time() - gcs_request_start_time
388
+ self.update_resource_requests()
389
+ self.update_event_summary()
390
+ load_metrics_summary = self.load_metrics.summary()
391
+ status = {
392
+ "gcs_request_time": gcs_request_time,
393
+ "time": time.time(),
394
+ "monitor_pid": os.getpid(),
395
+ }
396
+
397
+ if self.autoscaler and not self.load_metrics:
398
+ # load_metrics is Falsey iff we haven't collected any
399
+ # resource messages from the GCS, which can happen at startup if
400
+ # the GCS hasn't yet received data from the Raylets.
401
+ # In this case, do not do an autoscaler update.
402
+ # Wait to get load metrics.
403
+ logger.info(
404
+ "Autoscaler has not yet received load metrics. Waiting."
405
+ )
406
+ elif self.autoscaler:
407
+ # Process autoscaling actions
408
+ update_start_time = time.time()
409
+ self.autoscaler.update()
410
+ status["autoscaler_update_time"] = time.time() - update_start_time
411
+ autoscaler_summary = self.autoscaler.summary()
412
+ try:
413
+ self.emit_metrics(
414
+ load_metrics_summary,
415
+ autoscaler_summary,
416
+ self.autoscaler.all_node_types,
417
+ )
418
+ except Exception:
419
+ logger.exception("Error emitting metrics")
420
+
421
+ if autoscaler_summary:
422
+ status["autoscaler_report"] = asdict(autoscaler_summary)
423
+ status[
424
+ "non_terminated_nodes_time"
425
+ ] = (
426
+ self.autoscaler.non_terminated_nodes.non_terminated_nodes_time # noqa: E501
427
+ )
428
+
429
+ for msg in self.event_summarizer.summary():
430
+ # Need to prefix each line of the message for the lines to
431
+ # get pushed to the driver logs.
432
+ for line in msg.split("\n"):
433
+ logger.info(
434
+ "{}{}".format(
435
+ ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
436
+ )
437
+ )
438
+ if self.event_logger:
439
+ self.event_logger.info(line)
440
+
441
+ self.event_summarizer.clear()
442
+
443
+ status["load_metrics_report"] = asdict(load_metrics_summary)
444
+ as_json = json.dumps(status)
445
+ if _internal_kv_initialized():
446
+ _internal_kv_put(
447
+ ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True
448
+ )
449
+ except Exception:
450
+ # By default, do not exit the monitor on failure.
451
+ if self.retry_on_failure:
452
+ logger.exception("Monitor: Execution exception. Trying again...")
453
+ else:
454
+ raise
455
+
456
+ # Wait for a autoscaler update interval before processing the next
457
+ # round of messages.
458
+ time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
459
+
460
+ def emit_metrics(self, load_metrics_summary, autoscaler_summary, node_types):
461
+ if autoscaler_summary is None:
462
+ return None
463
+
464
+ for resource_name in ["CPU", "GPU", "TPU"]:
465
+ _, total = load_metrics_summary.usage.get(resource_name, (0, 0))
466
+ pending = autoscaler_summary.pending_resources.get(resource_name, 0)
467
+ self.prom_metrics.cluster_resources.labels(
468
+ resource=resource_name,
469
+ SessionName=self.prom_metrics.session_name,
470
+ ).set(total)
471
+ self.prom_metrics.pending_resources.labels(
472
+ resource=resource_name,
473
+ SessionName=self.prom_metrics.session_name,
474
+ ).set(pending)
475
+
476
+ pending_node_count = Counter()
477
+ for _, node_type, _ in autoscaler_summary.pending_nodes:
478
+ pending_node_count[node_type] += 1
479
+
480
+ for node_type, count in autoscaler_summary.pending_launches.items():
481
+ pending_node_count[node_type] += count
482
+
483
+ for node_type in node_types:
484
+ count = pending_node_count[node_type]
485
+ self.prom_metrics.pending_nodes.labels(
486
+ SessionName=self.prom_metrics.session_name,
487
+ NodeType=node_type,
488
+ ).set(count)
489
+
490
+ for node_type in node_types:
491
+ count = autoscaler_summary.active_nodes.get(node_type, 0)
492
+ self.prom_metrics.active_nodes.labels(
493
+ SessionName=self.prom_metrics.session_name,
494
+ NodeType=node_type,
495
+ ).set(count)
496
+
497
+ failed_node_counts = Counter()
498
+ for _, node_type in autoscaler_summary.failed_nodes:
499
+ failed_node_counts[node_type] += 1
500
+
501
+ # NOTE: This metric isn't reset with monitor resets. This means it will
502
+ # only be updated when the autoscaler' node tracker remembers failed
503
+ # nodes. If the node type failure is evicted from the autoscaler, the
504
+ # metric may not update for a while.
505
+ for node_type, count in failed_node_counts.items():
506
+ self.prom_metrics.recently_failed_nodes.labels(
507
+ SessionName=self.prom_metrics.session_name,
508
+ NodeType=node_type,
509
+ ).set(count)
510
+
511
+ def update_event_summary(self):
512
+ """Report the current size of the cluster.
513
+
514
+ To avoid log spam, only cluster size changes (CPU, GPU or TPU count change)
515
+ are reported to the event summarizer. The event summarizer will report
516
+ only the latest cluster size per batch.
517
+ """
518
+ avail_resources = self.load_metrics.resources_avail_summary()
519
+ if not self.readonly_config and avail_resources != self.last_avail_resources:
520
+ self.event_summarizer.add(
521
+ "Resized to {}.", # e.g., Resized to 100 CPUs, 4 GPUs, 4 TPUs.
522
+ quantity=avail_resources,
523
+ aggregate=lambda old, new: new,
524
+ )
525
+ self.last_avail_resources = avail_resources
526
+
527
+ def destroy_autoscaler_workers(self):
528
+ """Cleanup the autoscaler, in case of an exception in the run() method.
529
+
530
+ We kill the worker nodes, but retain the head node in order to keep
531
+ logs around, keeping costs minimal. This monitor process runs on the
532
+ head node anyway, so this is more reliable."""
533
+
534
+ if self.autoscaler is None:
535
+ return # Nothing to clean up.
536
+
537
+ if self.autoscaling_config is None:
538
+ # This is a logic error in the program. Can't do anything.
539
+ logger.error("Monitor: Cleanup failed due to lack of autoscaler config.")
540
+ return
541
+
542
+ logger.info("Monitor: Exception caught. Taking down workers...")
543
+ clean = False
544
+ while not clean:
545
+ try:
546
+ teardown_cluster(
547
+ config_file=self.autoscaling_config,
548
+ yes=True, # Non-interactive.
549
+ workers_only=True, # Retain head node for logs.
550
+ override_cluster_name=None,
551
+ keep_min_workers=True, # Retain minimal amount of workers.
552
+ )
553
+ clean = True
554
+ logger.info("Monitor: Workers taken down.")
555
+ except Exception:
556
+ logger.error("Monitor: Cleanup exception. Trying again...")
557
+ time.sleep(2)
558
+
559
+ def _handle_failure(self, error):
560
+ if (
561
+ self.autoscaler is not None
562
+ and os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1"
563
+ ):
564
+ self.autoscaler.kill_workers()
565
+ # Take down autoscaler workers if necessary.
566
+ self.destroy_autoscaler_workers()
567
+
568
+ # Something went wrong, so push an error to all current and future
569
+ # drivers.
570
+ message = f"The autoscaler failed with the following error:\n{error}"
571
+ if _internal_kv_initialized():
572
+ _internal_kv_put(
573
+ ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True
574
+ )
575
+ gcs_publisher = ray._raylet.GcsPublisher(address=self.gcs_address)
576
+ from ray._private.utils import publish_error_to_driver
577
+
578
+ publish_error_to_driver(
579
+ ray_constants.MONITOR_DIED_ERROR,
580
+ message,
581
+ gcs_publisher=gcs_publisher,
582
+ )
583
+
584
+ def _signal_handler(self, sig, frame):
585
+ try:
586
+ self._handle_failure(
587
+ f"Terminated with signal {sig}\n"
588
+ + "".join(traceback.format_stack(frame))
589
+ )
590
+ except Exception:
591
+ logger.exception("Monitor: Failure in signal handler.")
592
+ sys.exit(sig + 128)
593
+
594
+ def run(self):
595
+ # Register signal handlers for autoscaler termination.
596
+ # Signals will not be received on windows
597
+ signal.signal(signal.SIGINT, self._signal_handler)
598
+ signal.signal(signal.SIGTERM, self._signal_handler)
599
+ try:
600
+ if _internal_kv_initialized():
601
+ # Delete any previous autoscaling errors.
602
+ _internal_kv_del(ray_constants.DEBUG_AUTOSCALING_ERROR)
603
+ self._initialize_autoscaler()
604
+ self._run()
605
+ except Exception:
606
+ logger.exception("Error in monitor loop")
607
+ self._handle_failure(traceback.format_exc())
608
+ raise
609
+
610
+
611
+ def log_resource_batch_data_if_desired(
612
+ resources_batch_data: gcs_pb2.ResourceUsageBatchData,
613
+ ) -> None:
614
+ if os.getenv("AUTOSCALER_LOG_RESOURCE_BATCH_DATA") == "1":
615
+ logger.info("Logging raw resource message pulled from GCS.")
616
+ logger.info(resources_batch_data)
617
+ logger.info("Done logging raw resource message.")
618
+
619
+
620
+ if __name__ == "__main__":
621
+ parser = argparse.ArgumentParser(
622
+ description=("Parse GCS server for the monitor to connect to.")
623
+ )
624
+ parser.add_argument(
625
+ "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
626
+ )
627
+ parser.add_argument(
628
+ "--autoscaling-config",
629
+ required=False,
630
+ type=str,
631
+ help="the path to the autoscaling config file",
632
+ )
633
+ parser.add_argument(
634
+ "--logging-level",
635
+ required=False,
636
+ type=str,
637
+ default=ray_constants.LOGGER_LEVEL,
638
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
639
+ help=ray_constants.LOGGER_LEVEL_HELP,
640
+ )
641
+ parser.add_argument(
642
+ "--logging-format",
643
+ required=False,
644
+ type=str,
645
+ default=ray_constants.LOGGER_FORMAT,
646
+ help=ray_constants.LOGGER_FORMAT_HELP,
647
+ )
648
+ parser.add_argument(
649
+ "--logging-filename",
650
+ required=False,
651
+ type=str,
652
+ default=ray_constants.MONITOR_LOG_FILE_NAME,
653
+ help="Specify the name of log file, "
654
+ "log to stdout if set empty, default is "
655
+ f'"{ray_constants.MONITOR_LOG_FILE_NAME}"',
656
+ )
657
+ parser.add_argument(
658
+ "--logs-dir",
659
+ required=True,
660
+ type=str,
661
+ help="Specify the path of the temporary directory used by Ray processes.",
662
+ )
663
+ parser.add_argument(
664
+ "--logging-rotate-bytes",
665
+ required=False,
666
+ type=int,
667
+ default=ray_constants.LOGGING_ROTATE_BYTES,
668
+ help="Specify the max bytes for rotating "
669
+ "log file, default is "
670
+ f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.",
671
+ )
672
+ parser.add_argument(
673
+ "--logging-rotate-backup-count",
674
+ required=False,
675
+ type=int,
676
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
677
+ help="Specify the backup count of rotated log file, default is "
678
+ f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.",
679
+ )
680
+ parser.add_argument(
681
+ "--monitor-ip",
682
+ required=False,
683
+ type=str,
684
+ default=None,
685
+ help="The IP address of the machine hosting the monitor process.",
686
+ )
687
+
688
+ args = parser.parse_args()
689
+ setup_component_logger(
690
+ logging_level=args.logging_level,
691
+ logging_format=args.logging_format,
692
+ log_dir=args.logs_dir,
693
+ filename=args.logging_filename,
694
+ max_bytes=args.logging_rotate_bytes,
695
+ backup_count=args.logging_rotate_backup_count,
696
+ )
697
+
698
+ logger.info(f"Starting monitor using ray installation: {ray.__file__}")
699
+ logger.info(f"Ray version: {ray.__version__}")
700
+ logger.info(f"Ray commit: {ray.__commit__}")
701
+ logger.info(f"Monitor started with command: {sys.argv}")
702
+
703
+ if args.autoscaling_config:
704
+ autoscaling_config = os.path.expanduser(args.autoscaling_config)
705
+ else:
706
+ autoscaling_config = None
707
+
708
+ bootstrap_address = args.gcs_address
709
+ if bootstrap_address is None:
710
+ raise ValueError("--gcs-address must be set!")
711
+
712
+ monitor = Monitor(
713
+ bootstrap_address,
714
+ autoscaling_config,
715
+ log_dir=args.logs_dir,
716
+ monitor_ip=args.monitor_ip,
717
+ )
718
+
719
+ monitor.run()
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import operator
4
+ import threading
5
+ import time
6
+ import traceback
7
+ from typing import Any, Dict, Optional
8
+
9
+ from ray.autoscaler._private.node_provider_availability_tracker import (
10
+ NodeProviderAvailabilityTracker,
11
+ )
12
+ from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
13
+ from ray.autoscaler._private.util import hash_launch_conf
14
+ from ray.autoscaler.node_launch_exception import NodeLaunchException
15
+ from ray.autoscaler.tags import (
16
+ NODE_KIND_WORKER,
17
+ STATUS_UNINITIALIZED,
18
+ TAG_RAY_LAUNCH_CONFIG,
19
+ TAG_RAY_NODE_KIND,
20
+ TAG_RAY_NODE_NAME,
21
+ TAG_RAY_NODE_STATUS,
22
+ TAG_RAY_USER_NODE_TYPE,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class BaseNodeLauncher:
    """Launches Ray nodes in the main thread using
    `BaseNodeLauncher.launch_node()`.

    This is a superclass of NodeLauncher, which launches nodes asynchronously
    in the background.

    By default, the subclass NodeLauncher is used to launch nodes in subthreads.
    That behavior can be flagged off in the provider config by setting
    `foreground_node_launch: True`; the autoscaler will then make blocking calls to
    BaseNodeLauncher.launch_node() in the main thread.
    """

    def __init__(
        self,
        provider,
        pending,
        event_summarizer,
        node_provider_availability_tracker: NodeProviderAvailabilityTracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *args,
        **kwargs,
    ):
        # Shared counter of in-flight launches; decremented in launch_node()
        # after every attempt, whether it succeeded or failed.
        self.pending = pending
        self.event_summarizer = event_summarizer
        self.node_provider_availability_tracker = node_provider_availability_tracker
        # Fall back to a fresh metrics object when none is injected.
        self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics(
            session_name=session_name
        )
        self.provider = provider
        self.node_types = node_types
        # Used only as a suffix in log prefixes (e.g. "NodeLauncher0:").
        self.index = str(index) if index is not None else ""

    def launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Launch `count` nodes of `node_type`; return the provider's
        created-node mapping.

        The pending counter is decremented unconditionally: _launch_node
        records failures internally instead of raising.
        """
        self.log("Got {} nodes to launch.".format(count))
        created_nodes = self._launch_node(config, count, node_type)
        self.pending.dec(node_type, count)
        return created_nodes

    def _launch_node(
        self, config: Dict[str, Any], count: int, node_type: str
    ) -> Optional[Dict]:
        """Build node config/tags for `node_type`, call the provider, and
        record metrics, availability, and event-summary entries for the
        outcome. Returns the provider's created-node mapping ({} on failure).
        """
        if self.node_types:
            assert node_type, node_type

        # The `worker_nodes` field is deprecated in favor of per-node-type
        # node_configs. We allow it for backwards-compatibility.
        launch_config = copy.deepcopy(config.get("worker_nodes", {}))
        # NOTE(review): `resources`/`labels` are only bound when `node_type`
        # is truthy but are referenced unconditionally in the provider call
        # below — callers are presumably expected to always pass a node type.
        if node_type:
            launch_config.update(
                config["available_node_types"][node_type]["node_config"]
            )
            resources = copy.deepcopy(
                config["available_node_types"][node_type]["resources"]
            )
            labels = copy.deepcopy(
                config["available_node_types"][node_type].get("labels", {})
            )
        # Hash of launch config + auth; stored as a tag so outdated nodes
        # can be detected later.
        launch_hash = hash_launch_conf(launch_config, config["auth"])
        node_config = copy.deepcopy(config.get("worker_nodes", {}))
        node_tags = {
            TAG_RAY_NODE_NAME: "ray-{}-worker".format(config["cluster_name"]),
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER,
            TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED,
            TAG_RAY_LAUNCH_CONFIG: launch_hash,
        }
        # A custom node type is specified; set the tag in this case, and also
        # merge the configs. We merge the configs instead of overriding, so
        # that the bootstrapped per-cloud properties are preserved.
        # TODO(ekl) this logic is duplicated in commands.py (keep in sync)
        if node_type:
            node_tags[TAG_RAY_USER_NODE_TYPE] = node_type
            node_config.update(launch_config)

        node_launch_start_time = time.time()

        error_msg = None
        full_exception = None
        created_nodes = {}
        try:
            created_nodes = self.provider.create_node_with_resources_and_labels(
                node_config, node_tags, count, resources, labels
            )
        except NodeLaunchException as node_launch_exception:
            # Structured failure: record it so per-node-type launch problems
            # can be surfaced via the availability tracker.
            self.node_provider_availability_tracker.update_node_availability(
                node_type, int(node_launch_start_time), node_launch_exception
            )

            if node_launch_exception.src_exc_info is not None:
                full_exception = "\n".join(
                    traceback.format_exception(*node_launch_exception.src_exc_info)
                )

            # The literal `{}` is intentional: the event summarizer fills in
            # the aggregated quantity when rendering the message.
            error_msg = (
                f"Failed to launch {{}} node(s) of type {node_type}. "
                f"({node_launch_exception.category}): "
                f"{node_launch_exception.description}"
            )
        except Exception:
            error_msg = f"Failed to launch {{}} node(s) of type {node_type}."
            full_exception = traceback.format_exc()
        else:
            # Record some metrics/observability information when a node is launched.
            launch_time = time.time() - node_launch_start_time
            for _ in range(count):
                # Note: when launching multiple nodes we observe the time it
                # took all nodes to launch for each node. For example, if 4
                # nodes were created in 25 seconds, we would observe the 25
                # second create time 4 times.
                self.prom_metrics.worker_create_node_time.observe(launch_time)
            self.prom_metrics.started_nodes.inc(count)
            self.node_provider_availability_tracker.update_node_availability(
                node_type=node_type,
                timestamp=int(node_launch_start_time),
                node_launch_exception=None,
            )

        if error_msg is not None:
            self.event_summarizer.add(
                error_msg,
                quantity=count,
                aggregate=operator.add,
            )
            self.log(error_msg)
            self.prom_metrics.node_launch_exceptions.inc()
            self.prom_metrics.failed_create_nodes.inc(count)
        else:
            self.log("Launching {} nodes, type {}.".format(count, node_type))
            self.event_summarizer.add(
                "Adding {} node(s) of type " + str(node_type) + ".",
                quantity=count,
                aggregate=operator.add,
            )

        if full_exception is not None:
            self.log(full_exception)

        return created_nodes

    def log(self, statement):
        """Log `statement` prefixed with the launcher class name and index."""
        # launcher_class is "BaseNodeLauncher", or "NodeLauncher" if called
        # from that subclass.
        launcher_class: str = type(self).__name__
        prefix = "{}{}:".format(launcher_class, self.index)
        logger.info(prefix + " {}".format(statement))
178
+
179
+
180
class NodeLauncher(BaseNodeLauncher, threading.Thread):
    """Background-thread variant of BaseNodeLauncher.

    Consumes (config, count, node_type) launch requests from a queue and
    executes them via the inherited, blocking launch_node().
    """

    def __init__(
        self,
        provider,
        queue,
        pending,
        event_summarizer,
        node_provider_availability_tracker,
        session_name: Optional[str] = None,
        prom_metrics=None,
        node_types=None,
        index=None,
        *thread_args,
        **thread_kwargs,
    ):
        # Queue of pending launch requests, fed by StandardAutoscaler.
        self.queue = queue
        # Initialize both bases explicitly (neither chains super().__init__).
        BaseNodeLauncher.__init__(
            self,
            provider=provider,
            pending=pending,
            event_summarizer=event_summarizer,
            node_provider_availability_tracker=node_provider_availability_tracker,
            session_name=session_name,
            prom_metrics=prom_metrics,
            node_types=node_types,
            index=index,
        )
        threading.Thread.__init__(self, *thread_args, **thread_kwargs)

    def run(self):
        """Collects launch data from queue populated by StandardAutoscaler.
        Launches nodes in a background thread.

        Overrides threading.Thread.run().
        NodeLauncher.start() executes this loop in a background thread.
        """
        while True:
            launch_request = self.queue.get()
            config, count, node_type = launch_request
            # launch_node is implemented in BaseNodeLauncher.
            self.launch_node(config, count, node_type)
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import time
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Dict, Optional, Tuple
5
+
6
+ from ray.autoscaler._private.constants import (
7
+ AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
8
+ )
9
+ from ray.autoscaler.node_launch_exception import NodeLaunchException
10
+
11
+
12
@dataclass
class UnavailableNodeInformation:
    """Why a node type could not be launched.

    Both fields are copied from a NodeLaunchException by
    NodeProviderAvailabilityTracker (see below in this file).
    """

    # Failure category reported by the node provider.
    category: str
    # Human-readable description of the failure.
    description: str
16
+
17
+
18
@dataclass
class NodeAvailabilityRecord:
    """Availability status of a single node type at a point in time."""

    # The node type this record describes.
    node_type: str
    # True iff the most recent launch attempt for this node type succeeded.
    is_available: bool
    # Timestamp (seconds) at which this information was recorded as accurate.
    last_checked_timestamp: float
    # Failure details; populated only when is_available is False.
    unavailable_node_information: Optional[UnavailableNodeInformation]
24
+
25
+
26
@dataclass
class NodeAvailabilitySummary:
    """Snapshot of availability records, keyed by node type name."""

    node_availabilities: Dict[
        str, NodeAvailabilityRecord
    ]  # Mapping from node type to node availability record.

    @classmethod
    def from_fields(cls, **fields) -> "NodeAvailabilitySummary":
        """Implement marshalling from nested fields. pydantic isn't a core dependency
        so we're implementing this by hand instead.

        Fix vs. original: the input record dicts are no longer mutated
        (the original `.pop()`ed the nested key out of the caller's data).
        """
        parsed = {}

        node_availabilities_dict = fields.get("node_availabilities", {})

        for node_type, record_dict in node_availabilities_dict.items():
            # Read without popping so the caller's dict is left untouched.
            info_dict = record_dict.get("unavailable_node_information")
            unavailable_information = None
            if info_dict is not None:
                unavailable_information = UnavailableNodeInformation(**info_dict)

            # Forward every other field to the record constructor.
            remaining_fields = {
                key: value
                for key, value in record_dict.items()
                if key != "unavailable_node_information"
            }
            parsed[node_type] = NodeAvailabilityRecord(
                unavailable_node_information=unavailable_information,
                **remaining_fields,
            )

        return NodeAvailabilitySummary(node_availabilities=parsed)

    def __eq__(self, other: "NodeAvailabilitySummary"):
        """Summaries compare equal iff their record mappings are equal."""
        return self.node_availabilities == other.node_availabilities

    def __bool__(self) -> bool:
        """Truthy iff at least one node type has a recorded availability."""
        return bool(self.node_availabilities)
62
+
63
+
64
class NodeProviderAvailabilityTracker:
    """A thread safe, TTL cache of node provider availability. We don't use
    cachetools.TTLCache because it always sets the expiration time relative to
    insertion time, but in our case, we want entries to expire relative to when
    the node creation was attempted (and entries aren't necessarily added in
    order). We want the entries to expire because the information grows stale
    over time.
    """

    def __init__(
        self,
        timer: Callable[[], float] = time.time,
        ttl: float = AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S,
    ):
        """A cache that tracks the availability of nodes and throws away
        entries which have grown too stale.

        Args:
            timer: A function that returns the current time in seconds.
            ttl: The ttl from the insertion timestamp of an entry.
        """
        self.timer = timer
        self.ttl = ttl
        # Mapping from node type to (eviction_time, record)
        self.store: Dict[str, Tuple[float, NodeAvailabilityRecord]] = {}
        # A global lock to simplify thread safety handling.
        self.lock = threading.RLock()

    def _update_node_availability_requires_lock(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        # Caller must hold self.lock (see update_node_availability).
        # A missing exception means the launch succeeded.
        if node_launch_exception is None:
            record = NodeAvailabilityRecord(
                node_type=node_type,
                is_available=True,
                last_checked_timestamp=timestamp,
                unavailable_node_information=None,
            )
        else:
            info = UnavailableNodeInformation(
                category=node_launch_exception.category,
                description=node_launch_exception.description,
            )
            record = NodeAvailabilityRecord(
                node_type=node_type,
                is_available=False,
                last_checked_timestamp=timestamp,
                unavailable_node_information=info,
            )

        # Expiration is relative to the event timestamp, not insertion time.
        expiration_time = timestamp + self.ttl

        # TODO (Alex): In theory it would be nice to make this dictionary
        # ordered by expiration time, unfortunately that's a bit difficult
        # since `update_node_availability` can be called with out of order
        # timestamps.
        self.store[node_type] = (expiration_time, record)

        self._remove_old_entries()

    def update_node_availability(
        self,
        node_type: str,
        timestamp: int,
        node_launch_exception: Optional[NodeLaunchException],
    ) -> None:
        """
        Update the availability and details of a single node type.

        Args:
            node_type: The node type.
            timestamp: The timestamp that this information is accurate as of.
            node_launch_exception: Details about why the node launch failed. If
                empty, the node type will be considered available."""
        with self.lock:
            self._update_node_availability_requires_lock(
                node_type, timestamp, node_launch_exception
            )

    def summary(self) -> NodeAvailabilitySummary:
        """
        Returns a summary of node availabilities and their staleness.

        Returns
            A summary of node availabilities and their staleness.
        """
        with self.lock:
            # Drop stale entries first so the summary only contains fresh data.
            self._remove_old_entries()
            return NodeAvailabilitySummary(
                {node_type: record for node_type, (_, record) in self.store.items()}
            )

    def _remove_old_entries(self) -> None:
        """Remove any expired entries from the cache."""
        cur_time = self.timer()
        # Lock is reentrant, so this is safe even when called under the
        # public methods that already hold it.
        with self.lock:
            # Snapshot items() so we can delete while iterating.
            for key, (expiration_time, _) in list(self.store.items()):
                if expiration_time < cur_time:
                    del self.store[key]
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Set, Tuple
2
+
3
+ from ray.autoscaler._private import constants
4
+
5
+
6
class NodeTracker:
    """Map nodes to their corresponding logs.

    We need to be a little careful here. At any given point in time, node_id <->
    ip can be interchangeably used, but the node_id -> ip relation is not
    bijective _across time_ since IP addresses can be reused. Therefore, we
    should treat node_id as the only unique identifier.
    """

    def __init__(self):
        # Mapping from node_id -> (ip, node type). (The original comment also
        # mentioned stdout_path/process runner, but track() only ever stores
        # an (ip, node_type) pair.)
        self.node_mapping = {}

        # Insertion order of tracked node ids; the head is evicted first.
        # A quick, inefficient FIFO cache implementation.
        self.lru_order = []

    def _add_node_mapping(self, node_id: str, value: Tuple[str, str]):
        """Insert node_id -> value, evicting the oldest tracked node when at
        capacity. No-op if node_id is already tracked."""
        if node_id in self.node_mapping:
            return

        assert len(self.lru_order) == len(self.node_mapping)
        if len(self.lru_order) >= constants.AUTOSCALER_MAX_NODES_TRACKED:
            # The FIFO eviction case. Bug fix: the original rebound `node_id`
            # to the evicted id here, so the *new* node's value was then
            # stored under the *evicted* node's id below.
            evicted_id = self.lru_order.pop(0)
            del self.node_mapping[evicted_id]

        self.node_mapping[node_id] = value
        self.lru_order.append(node_id)

    def track(self, node_id: str, ip: str, node_type: str):
        """
        Begin to track a new node.

        Args:
            node_id: The node id.
            ip: The node ip address.
            node_type: The node type.
        """
        if node_id not in self.node_mapping:
            self._add_node_mapping(node_id, (ip, node_type))

    def untrack(self, node_id: str):
        """Gracefully stop tracking a node. If a node is intentionally removed from
        the cluster, we should stop tracking it so we don't mistakenly mark it
        as failed.

        Args:
            node_id: The node id to stop tracking.
        """
        if node_id in self.node_mapping:
            self.lru_order.remove(node_id)
            del self.node_mapping[node_id]

    def get_all_failed_node_info(
        self, non_failed_ids: Set[str]
    ) -> List[Tuple[str, str]]:
        """Get the information about all failed nodes. A failed node is any node which
        we began to track that is not pending or alive (i.e. not failed).

        Args:
            non_failed_ids: Nodes are failed unless they are in this set.

        Returns:
            List[Tuple[str, str]]: A list of tuples. Each tuple is the ip
            address and type of a failed node.
        """
        failed_nodes = self.node_mapping.keys() - non_failed_ids
        failed_info = []
        # Returning the list in order is important for display purposes.
        for node_id in filter(lambda node_id: node_id in failed_nodes, self.lru_order):
            failed_info.append(self.node_mapping[node_id])
        return failed_info
.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+
4
class NullMetric:
    """Mock metric class to be used in case of prometheus_client import error."""

    def set(self, *_args, **_kwargs):
        """Discard the value; a null metric records nothing."""
        return None

    def observe(self, *_args, **_kwargs):
        """Discard the observation."""
        return None

    def inc(self, *_args, **_kwargs):
        """Discard the increment."""
        return None

    def labels(self, *_args, **_kwargs):
        """Return self so chained `.labels(...).inc()` style calls still work."""
        return self

    def clear(self):
        """Nothing to clear on a null metric."""
        return None
21
+
22
+
23
try:

    from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram

    # The metrics in this class should be kept in sync with
    # python/ray/tests/test_metrics_agent.py
    class AutoscalerPrometheusMetrics:
        """All Prometheus metrics emitted by the autoscaler.

        Metrics are registered on a single CollectorRegistry. Session-wide
        metrics are pre-bound to the session name via `.labels(...)`; the
        per-NodeType/per-resource gauges are left unbound so callers attach
        labels at use time.
        """

        def __init__(
            self, session_name: str = None, registry: Optional[CollectorRegistry] = None
        ):
            # Use the caller's registry when provided, otherwise a private one.
            self.registry: CollectorRegistry = registry or CollectorRegistry(
                auto_describe=True
            )
            self._session_name = session_name
            # Buckets: 5 seconds, 10 seconds, 20 seconds, 30 seconds,
            # 45 seconds, 1 minute, 1.5 minutes, 2 minutes,
            # 3 minutes, 4 minutes, 5 minutes, 6 minutes,
            # 8 minutes, 10 minutes, 12 minutes, 15 minutes
            # 20 minutes, 25 minutes, 30 minutes
            # used for both worker launch time and worker update time
            histogram_buckets = [
                5,
                10,
                20,
                30,
                45,
                60,
                90,
                120,
                180,
                240,
                300,
                360,
                480,
                600,
                720,
                900,
                1200,
                1500,
                1800,
            ]
            # Buckets: .01 seconds to 1000 seconds.
            # Used for autoscaler update time.
            update_time_buckets = [0.01, 0.1, 1, 10, 100, 1000]
            self.worker_create_node_time: Histogram = Histogram(
                "worker_create_node_time_seconds",
                "Worker launch time. This is the time it takes for a call to "
                "a node provider's create_node method to return. Note that "
                "when nodes are launched in batches, the launch time for that "
                "batch will be observed once for *each* node in that batch. "
                "For example, if 8 nodes are launched in 3 minutes, a launch "
                "time of 3 minutes will be observed 8 times.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.worker_update_time: Histogram = Histogram(
                "worker_update_time_seconds",
                "Worker update time. This is the time between when an updater "
                "thread begins executing and when it exits successfully. This "
                "metric only observes times for successful updates.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=histogram_buckets,
            ).labels(SessionName=session_name)
            self.update_time: Histogram = Histogram(
                "update_time",
                "Autoscaler update time. This is the time for an autoscaler "
                "update iteration to complete.",
                labelnames=("SessionName",),
                unit="seconds",
                namespace="autoscaler",
                registry=self.registry,
                buckets=update_time_buckets,
            ).labels(SessionName=session_name)
            # Per-NodeType gauges: not pre-bound to the session label here.
            self.pending_nodes: Gauge = Gauge(
                "pending_nodes",
                "Number of nodes pending to be started.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.active_nodes: Gauge = Gauge(
                "active_nodes",
                "Number of nodes in the cluster.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.recently_failed_nodes: Gauge = Gauge(
                "recently_failed_nodes",
                "The number of recently failed nodes. This count could reset "
                "at undefined times.",
                labelnames=(
                    "NodeType",
                    "SessionName",
                ),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            )
            self.started_nodes: Counter = Counter(
                "started_nodes",
                "Number of nodes started.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.stopped_nodes: Counter = Counter(
                "stopped_nodes",
                "Number of nodes stopped.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.updating_nodes: Gauge = Gauge(
                "updating_nodes",
                "Number of nodes in the process of updating.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.recovering_nodes: Gauge = Gauge(
                "recovering_nodes",
                "Number of nodes in the process of recovering.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.running_workers: Gauge = Gauge(
                "running_workers",
                "Number of worker nodes running.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_create_nodes: Counter = Counter(
                "failed_create_nodes",
                "Number of nodes that failed to be created due to an "
                "exception in the node provider's create_node method.",
                labelnames=("SessionName",),
                unit="nodes",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_updates: Counter = Counter(
                "failed_updates",
                "Number of failed worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_updates: Counter = Counter(
                "successful_updates",
                "Number of succesfful worker node updates.",
                labelnames=("SessionName",),
                unit="updates",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.failed_recoveries: Counter = Counter(
                "failed_recoveries",
                "Number of failed node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.successful_recoveries: Counter = Counter(
                "successful_recoveries",
                "Number of successful node recoveries.",
                labelnames=("SessionName",),
                unit="recoveries",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.update_loop_exceptions: Counter = Counter(
                "update_loop_exceptions",
                "Number of exceptions raised in the update loop of the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.node_launch_exceptions: Counter = Counter(
                "node_launch_exceptions",
                "Number of exceptions raised while launching nodes.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.reset_exceptions: Counter = Counter(
                "reset_exceptions",
                "Number of exceptions raised while resetting the autoscaler.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.config_validation_exceptions: Counter = Counter(
                "config_validation_exceptions",
                "Number of exceptions raised while validating the config "
                "during a reset.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            self.drain_node_exceptions: Counter = Counter(
                "drain_node_exceptions",
                "Number of exceptions raised when making a DrainNode rpc"
                "prior to node termination.",
                labelnames=("SessionName",),
                unit="exceptions",
                namespace="autoscaler",
                registry=self.registry,
            ).labels(SessionName=session_name)
            # This represents the autoscaler's view of essentially
            # `ray.cluster_resources()`, it may be slightly different from the
            # core metric from an eventual consistency perspective.
            self.cluster_resources: Gauge = Gauge(
                "cluster_resources",
                "Total logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )
            # This represents the pending launches + nodes being set up for the
            # autoscaler.
            self.pending_resources: Gauge = Gauge(
                "pending_resources",
                "Pending logical resources in the cluster.",
                labelnames=("resource", "SessionName"),
                unit="resources",
                namespace="autoscaler",
                registry=self.registry,
            )

        @property
        def session_name(self):
            """The session name these metrics were bound to at construction."""
            return self._session_name

except ImportError:

    # prometheus_client is optional: fall back to an object whose every
    # attribute access yields a no-op NullMetric, so callers need no guards.
    class AutoscalerPrometheusMetrics(object):
        def __init__(self, session_name: str = None):
            pass

        def __getattr__(self, attr):
            # Any metric name resolves to a fresh no-op metric.
            return NullMetric()