diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c758688cd005fa69ba8ea37b37223d73d5884e1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91eab69568bf5d4237ddd9c093eff071e1092f50 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/air/_internal/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa62e35a4e45bb40b5cf9500f3ea6ec2a13fc13 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/__init__.py @@ -0,0 +1,8 @@ +import os +from pathlib import Path + +from ray.autoscaler import sdk + +__all__ = ["sdk"] + +AUTOSCALER_DIR_PATH = Path(os.path.abspath(os.path.dirname(__file__))) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..491a4b53601656a40976f19120c9a37308045bb2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34e2ac888ad6b9aefcd7f6609ced8d817cab23c6 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/batching_node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9cde9051303562ee4608da15b589785f3ec7cad Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/command_runner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d6746aa728366c7df828f45fb02820e29fdc112 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/launch_and_verify_cluster.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d0970084f9fc817b2ac3a7799f9da6a018c98b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_launch_exception.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..771ef7367a0889c3af99beec5b66ad7b101e7228 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afd9389d138c80e68f3bd97ef470405b460b72ee Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/__pycache__/tags.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..907fde51062a2ad3fd695a3a3871290885822e57 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d90f1ada6680277ca5a0517849fa41f2cf9dba2 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/config.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba704946f004b3342ce16627739fdb488e0b1ca1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json new file mode 100644 index 0000000000000000000000000000000000000000..bedb063d4447aaf0ce6745044840cc23808454a9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-config-template.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "clusterId": { + "type": "string", + "metadata": { + "description": "Unique string appended to resource names to isolate resources from different ray clusters." + } + }, + "subnet": { + "type": "string", + "metadata": { + "description": "Subnet parameters." + } + }, + "msiName": { + "type": "string", + "metadata": { + "description": "Managed service identity." + } + }, + "msiResourceGroup": { + "type": "string", + "metadata": { + "description": "Managed service identity resource group." 
+ } + }, + "createMsi": { + "type": "bool", + "defaultValue": "true" + } + }, + "variables": { + "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]", + "location": "[resourceGroup().location]", + "roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]", + "nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]", + "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]", + "vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]", + "subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]" + }, + "resources": [ + { + "condition": "[parameters('createMsi')]", + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2018-11-30", + "location": "[variables('location')]", + "name": "[parameters('msiName')]" + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2020-08-01-preview", + "name": "[guid(variables('roleAssignmentName'))]", + "properties": { + "principalId": "[reference(resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName')), '2018-11-30').principalId]", + "roleDefinitionId": "[variables('contributor')]", + "scope": "[resourceGroup().id]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[parameters('msiName')]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups", + "apiVersion": "2019-02-01", + "name": "[variables('nsgName')]", + "location": "[variables('location')]", + "properties": { + "securityRules": [ + { + "name": "SSH", + "properties": { + "priority": 1000, + "protocol": "TCP", + "access": "Allow", + "direction": "Inbound", + "sourceAddressPrefix": "*", + "sourcePortRange": "*", + "destinationAddressPrefix": "*", + "destinationPortRange": "22" + } + } + ] + } + }, + { + "type": "Microsoft.Network/virtualNetworks", + "apiVersion": "2019-11-01", + "name": "[variables('vnetName')]", 
+ "location": "[variables('location')]", + "properties": { + "addressSpace": { + "addressPrefixes": [ + "[parameters('subnet')]" + ] + }, + "subnets": [ + { + "name": "[variables('subnetName')]", + "properties": { + "addressPrefix": "[parameters('subnet')]", + "networkSecurityGroup": { + "id": "[variables('nsg')]" + } + } + } + ] + }, + "dependsOn": [ + "[variables('nsg')]" + ] + } + ], + "outputs": { + "subnet": { + "type": "string", + "value": "[resourceId('Microsoft.Network/virtualNetworks/subnets', variables('vnetName'), variables('subnetName'))]" + }, + "nsg": { + "type": "string", + "value": "[variables('nsg')]" + }, + "msi": { + "type": "string", + "value": "[resourceId(parameters('msiResourceGroup'), 'Microsoft.ManagedIdentity/userAssignedIdentities', parameters('msiName'))]" + } + } +} \ No newline at end of file diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json new file mode 100644 index 0000000000000000000000000000000000000000..25c00797bb240c05ec35701d64333e8225f42d5d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/azure-vm-template.json @@ -0,0 +1,294 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "vmName": { + "type": "string", + "metadata": { + "description": "The name of you Virtual Machine." + } + }, + "adminUsername": { + "type": "string", + "metadata": { + "description": "Username for the Virtual Machine." 
+ } + }, + "publicKey": { + "type": "securestring", + "metadata": { + "description": "SSH Key for the Virtual Machine" + } + }, + "imagePublisher": { + "type": "string", + "metadata": { + "description": "The publisher of the VM image" + } + }, + "imageOffer": { + "type": "string", + "metadata": { + "description": "The offer of the VM image" + } + }, + "imageSku": { + "type": "string", + "metadata": { + "description": "The sku of the VM image" + } + }, + "imageVersion": { + "type": "string", + "metadata": { + "description": "The version of the VM image" + } + }, + "vmSize": { + "type": "string", + "metadata": { + "description": "The size of the VM" + } + }, + "vmTags": { + "type": "object", + "metadata": { + "description": "Tags for the VM" + } + }, + "vmCount": { + "type": "int", + "metadata": { + "description": "Number of VMs to deploy" + } + }, + "provisionPublicIp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "If true creates a public ip" + } + }, + "priority": { + "type": "string", + "defaultValue": "Regular", + "metadata": { + "description": "Specifies the priority for the virtual machine." + } + }, + "evictionPolicy": { + "type": "string", + "defaultValue": "Delete", + "metadata": { + "description": "Specifies the eviction policy for the virtual machine." + } + }, + "billingProfile": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Specifies the maximum price to pay for Azure Spot VM." + } + }, + "msi": { + "type": "string", + "metadata": { + "description": "Managed service identity resource id." + } + }, + "nsg": { + "type": "string", + "metadata": { + "description": "Network security group resource id." + } + }, + "subnet": { + "type": "string", + "metadata": { + "descriptions": "Subnet resource id." + } + }, + "enableAcceleratedNetworking": { + "type": "bool", + "defaultValue": false, + "metadata": { + "descriptions": "Whether to enable accelerated networking." 
+ } + } + }, + "variables": { + "location": "[resourceGroup().location]", + "networkInterfaceNamePrivate": "[concat(parameters('vmName'), '-nic')]", + "networkInterfaceNamePublic": "[concat(parameters('vmName'), '-nic-public')]", + "networkInterfaceName": "[if(parameters('provisionPublicIp'), variables('networkInterfaceNamePublic'), variables('networkInterfaceNamePrivate'))]", + "networkIpConfig": "[guid(resourceGroup().id, parameters('vmName'))]", + "osDiskType": "Standard_LRS", + "publicIpAddressName": "[concat(parameters('vmName'), '-ip')]" + }, + "resources": [ + { + "type": "Microsoft.Network/networkInterfaces", + "apiVersion": "2020-06-01", + "name": "[concat(variables('networkInterfaceNamePublic'), copyIndex())]", + "location": "[variables('location')]", + "dependsOn": [ + "[resourceId('Microsoft.Network/publicIpAddresses/', concat(variables('publicIpAddressName'), copyIndex()))]" + ], + "copy": { + "name": "NICPublicCopy", + "count": "[parameters('vmCount')]" + }, + "properties": { + "ipConfigurations": [ + { + "name": "[variables('networkIpConfig')]", + "properties": { + "subnet": { + "id": "[parameters('subnet')]" + }, + "privateIPAllocationMethod": "Dynamic", + "publicIpAddress": { + "id": "[resourceId('Microsoft.Network/publicIPAddresses', concat(variables('publicIPAddressName'), copyIndex()))]" + } + } + } + ], + "networkSecurityGroup": { + "id": "[parameters('nsg')]" + }, + "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]" + }, + "condition": "[parameters('provisionPublicIp')]" + }, + { + "type": "Microsoft.Network/networkInterfaces", + "apiVersion": "2020-06-01", + "name": "[concat(variables('networkInterfaceNamePrivate'), copyIndex())]", + "location": "[variables('location')]", + "copy": { + "name": "NICPrivateCopy", + "count": "[parameters('vmCount')]" + }, + "properties": { + "ipConfigurations": [ + { + "name": "[variables('networkIpConfig')]", + "properties": { + "subnet": { + "id": "[parameters('subnet')]" + }, + 
"privateIPAllocationMethod": "Dynamic" + } + } + ], + "networkSecurityGroup": { + "id": "[parameters('nsg')]" + }, + "enableAcceleratedNetworking": "[parameters('enableAcceleratedNetworking')]" + }, + "condition": "[not(parameters('provisionPublicIp'))]" + }, + { + "type": "Microsoft.Network/publicIpAddresses", + "apiVersion": "2019-02-01", + "name": "[concat(variables('publicIpAddressName'), copyIndex())]", + "location": "[variables('location')]", + "properties": { + "publicIpAllocationMethod": "Static", + "publicIPAddressVersion": "IPv4" + }, + "copy": { + "name": "PublicIpCopy", + "count": "[parameters('vmCount')]" + }, + "sku": { + "name": "Basic", + "tier": "Regional" + }, + "condition": "[parameters('provisionPublicIp')]" + }, + { + "type": "Microsoft.Compute/virtualMachines", + "apiVersion": "2019-03-01", + "name": "[concat(parameters('vmName'), copyIndex())]", + "location": "[variables('location')]", + "dependsOn": [ + "[resourceId('Microsoft.Network/networkInterfaces/', concat(variables('networkInterfaceName'), copyIndex()))]" + ], + "copy": { + "name": "VmCopy", + "count": "[parameters('vmCount')]" + }, + "tags": "[parameters('vmTags')]", + "properties": { + "hardwareProfile": { + "vmSize": "[parameters('vmSize')]" + }, + "storageProfile": { + "osDisk": { + "createOption": "fromImage", + "managedDisk": { + "storageAccountType": "[variables('osDiskType')]" + } + }, + "imageReference": { + "publisher": "[parameters('imagePublisher')]", + "offer": "[parameters('imageOffer')]", + "sku": "[parameters('imageSku')]", + "version": "[parameters('imageVersion')]" + } + }, + "networkProfile": { + "networkInterfaces": [ + { + "id": "[resourceId('Microsoft.Network/networkInterfaces', concat(variables('networkInterfaceName'), copyIndex()))]" + } + ] + }, + "osProfile": { + "computerName": "[concat(parameters('vmName'), copyIndex())]", + "adminUsername": "[parameters('adminUsername')]", + "adminPassword": "[parameters('publicKey')]", + "linuxConfiguration": { + 
"disablePasswordAuthentication": true, + "ssh": { + "publicKeys": [ + { + "path": "[concat('/home/', parameters('adminUsername'), '/.ssh/authorized_keys')]", + "keyData": "[parameters('publicKey')]" + } + ] + } + } + }, + "priority": "[parameters('priority')]", + "evictionPolicy": "[if(equals(parameters('priority'), 'Spot'), parameters('evictionPolicy'), '')]", + "billingProfile": "[parameters('billingProfile')]" + }, + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[parameters('msi')]": { + } + } + } + } + ], + "outputs": { + "publicIp": { + "type": "array", + "copy": { + "count": "[parameters('vmCount')]", + "input": "[reference(concat(variables('publicIpAddressName'), copyIndex())).ipAddress]" + }, + "condition": "[parameters('provisionPublicIp')]" + }, + "privateIp": { + "type": "array", + "copy": { + "count": "[parameters('vmCount')]", + "input": "[reference(concat(variables('networkInterfaceName'), copyIndex())).ipConfigurations[0].properties.privateIPAddress]" + } + } + } +} diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d49875ab327d7d50ab010c0f8bfe9b38966264be --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/config.py @@ -0,0 +1,208 @@ +import json +import logging +import random +from hashlib import sha256 +from pathlib import Path +from typing import Any, Callable + +from azure.common.credentials import get_cli_profile +from azure.identity import AzureCliCredential +from azure.mgmt.resource import ResourceManagementClient +from azure.mgmt.resource.resources.models import DeploymentMode + +UNIQUE_ID_LEN = 4 + +logger = logging.getLogger(__name__) + + +def get_azure_sdk_function(client: Any, function_name: str) -> Callable: + """Retrieve a callable function from Azure SDK client object. 
+ + Newer versions of the various client SDKs renamed function names to + have a begin_ prefix. This function supports both the old and new + versions of the SDK by first trying the old name and falling back to + the prefixed new name. + """ + func = getattr( + client, function_name, getattr(client, f"begin_{function_name}", None) + ) + if func is None: + raise AttributeError( + "'{obj}' object has no {func} or begin_{func} attribute".format( + obj={client.__name__}, func=function_name + ) + ) + return func + + +def bootstrap_azure(config): + config = _configure_key_pair(config) + config = _configure_resource_group(config) + return config + + +def _configure_resource_group(config): + # TODO: look at availability sets + # https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets + subscription_id = config["provider"].get("subscription_id") + if subscription_id is None: + subscription_id = get_cli_profile().get_subscription_id() + resource_client = ResourceManagementClient(AzureCliCredential(), subscription_id) + config["provider"]["subscription_id"] = subscription_id + logger.info("Using subscription id: %s", subscription_id) + + assert ( + "resource_group" in config["provider"] + ), "Provider config must include resource_group field" + resource_group = config["provider"]["resource_group"] + + assert ( + "location" in config["provider"] + ), "Provider config must include location field" + params = {"location": config["provider"]["location"]} + + if "tags" in config["provider"]: + params["tags"] = config["provider"]["tags"] + + logger.info("Creating/Updating resource group: %s", resource_group) + rg_create_or_update = get_azure_sdk_function( + client=resource_client.resource_groups, function_name="create_or_update" + ) + rg_create_or_update(resource_group_name=resource_group, parameters=params) + + # load the template file + current_path = Path(__file__).parent + template_path = current_path.joinpath("azure-config-template.json") + 
with open(template_path, "r") as template_fp: + template = json.load(template_fp) + + logger.info("Using cluster name: %s", config["cluster_name"]) + + # set unique id for resources in this cluster + unique_id = config["provider"].get("unique_id") + if unique_id is None: + hasher = sha256() + hasher.update(config["provider"]["resource_group"].encode("utf-8")) + unique_id = hasher.hexdigest()[:UNIQUE_ID_LEN] + else: + unique_id = str(unique_id) + config["provider"]["unique_id"] = unique_id + logger.info("Using unique id: %s", unique_id) + cluster_id = "{}-{}".format(config["cluster_name"], unique_id) + + subnet_mask = config["provider"].get("subnet_mask") + if subnet_mask is None: + # choose a random subnet, skipping most common value of 0 + random.seed(unique_id) + subnet_mask = "10.{}.0.0/16".format(random.randint(1, 254)) + logger.info("Using subnet mask: %s", subnet_mask) + + # Copy over properties from existing subnet. + # Addresses issue (https://github.com/Azure/azure-quickstart-templates/issues/2786) + # where existing subnet properties will get overwritten unless explicitly specified + # during multiple deployments even if vnet/subnet do not change. 
+ # May eventually be fixed by passing empty subnet list if they already exist: + # https://techcommunity.microsoft.com/t5/azure-networking-blog/azure-virtual-network-now-supports-updates-without-subnet/ba-p/4067952 + list_by_rg = get_azure_sdk_function( + client=resource_client.resources, function_name="list_by_resource_group" + ) + existing_vnets = list( + list_by_rg( + resource_group, + f"substringof('{unique_id}', name) and " + "resourceType eq 'Microsoft.Network/virtualNetworks'", + ) + ) + if len(existing_vnets) > 0: + vnid = existing_vnets[0].id + get_by_id = get_azure_sdk_function( + client=resource_client.resources, function_name="get_by_id" + ) + subnet = get_by_id(vnid, resource_client.DEFAULT_API_VERSION).properties[ + "subnets" + ][0] + template_vnet = next( + ( + rs + for rs in template["resources"] + if rs["type"] == "Microsoft.Network/virtualNetworks" + ), + None, + ) + if template_vnet is not None: + template_subnets = template_vnet["properties"].get("subnets") + if template_subnets is not None: + template_subnets[0]["properties"].update(subnet["properties"]) + + # Get or create an MSI name and resource group. + # Defaults to current resource group if not provided. 
+ use_existing_msi = ( + "msi_name" in config["provider"] and "msi_resource_group" in config["provider"] + ) + msi_resource_group = config["provider"].get("msi_resource_group", resource_group) + msi_name = config["provider"].get("msi_name", f"ray-{cluster_id}-msi") + logger.info( + "Using msi_name: %s from msi_resource_group: %s", msi_name, msi_resource_group + ) + + parameters = { + "properties": { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": { + "subnet": {"value": subnet_mask}, + "clusterId": {"value": cluster_id}, + "msiName": {"value": msi_name}, + "msiResourceGroup": {"value": msi_resource_group}, + "createMsi": {"value": not use_existing_msi}, + }, + } + } + + create_or_update = get_azure_sdk_function( + client=resource_client.deployments, function_name="create_or_update" + ) + outputs = ( + create_or_update( + resource_group_name=resource_group, + deployment_name="ray-config", + parameters=parameters, + ) + .result() + .properties.outputs + ) + + # append output resource ids to be used with vm creation + config["provider"]["msi"] = outputs["msi"]["value"] + config["provider"]["nsg"] = outputs["nsg"]["value"] + config["provider"]["subnet"] = outputs["subnet"]["value"] + + return config + + +def _configure_key_pair(config): + ssh_user = config["auth"]["ssh_user"] + public_key = None + # search if the keys exist + for key_type in ["ssh_private_key", "ssh_public_key"]: + try: + key_path = Path(config["auth"][key_type]).expanduser() + except KeyError: + raise Exception("Config must define {}".format(key_type)) + except TypeError: + raise Exception("Invalid config value for {}".format(key_type)) + + assert key_path.is_file(), "Could not find ssh key: {}".format(key_path) + + if key_type == "ssh_public_key": + with open(key_path, "r") as f: + public_key = f.read() + + for node_type in config["available_node_types"].values(): + azure_arm_parameters = node_type["node_config"].setdefault( + "azure_arm_parameters", {} + ) + 
azure_arm_parameters["adminUsername"] = ssh_user + azure_arm_parameters["publicKey"] = public_key + + return config diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..363c776bb3792afe13d69c06d906db3cc0baea45 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/_azure/node_provider.py @@ -0,0 +1,488 @@ +import json +import logging +import time +from concurrent.futures import Future, ThreadPoolExecutor +from pathlib import Path +from threading import RLock +from uuid import uuid4 + +from azure.core.exceptions import ResourceNotFoundError +from azure.identity import DefaultAzureCredential +from azure.mgmt.compute import ComputeManagementClient +from azure.mgmt.network import NetworkManagementClient +from azure.mgmt.resource import ResourceManagementClient +from azure.mgmt.resource.resources.models import DeploymentMode + +from ray.autoscaler._private._azure.config import ( + bootstrap_azure, + get_azure_sdk_function, +) +from ray.autoscaler._private.constants import ( + AUTOSCALER_NODE_START_WAIT_S, + AUTOSCALER_NODE_TERMINATE_WAIT_S, + MAX_PARALLEL_SHUTDOWN_WORKERS, +) +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + TAG_RAY_CLUSTER_NAME, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_USER_NODE_TYPE, +) + +VM_NAME_MAX_LEN = 64 +UNIQUE_ID_LEN = 4 + +logger = logging.getLogger(__name__) +azure_logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy") +azure_logger.setLevel(logging.WARNING) + + +def synchronized(f): + def wrapper(self, *args, **kwargs): + self.lock.acquire() + try: + return f(self, *args, **kwargs) + finally: + self.lock.release() + + return wrapper + + +class AzureNodeProvider(NodeProvider): + """Node Provider for Azure + 
+ This provider assumes Azure credentials are set by running ``az login`` + and the default subscription is configured through ``az account`` + or set in the ``provider`` field of the autoscaler configuration. + + Nodes may be in one of three states: {pending, running, terminated}. Nodes + appear immediately once started by ``create_node``, and transition + immediately to terminated when ``terminate_node`` is called. + """ + + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + subscription_id = provider_config["subscription_id"] + self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) + credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True) + self.compute_client = ComputeManagementClient(credential, subscription_id) + self.network_client = NetworkManagementClient(credential, subscription_id) + self.resource_client = ResourceManagementClient(credential, subscription_id) + + self.lock = RLock() + + # cache node objects + self.cached_nodes = {} + + # Cache terminating node operations + self.terminating_nodes: dict[str, Future] = {} + self.termination_executor = ThreadPoolExecutor( + max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS + ) + + @synchronized + def _get_filtered_nodes(self, tag_filters): + # add cluster name filter to only get nodes from this cluster + cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name} + + def match_tags(tags): + for k, v in cluster_tag_filters.items(): + if tags.get(k) != v: + return False + return True + + vms = self.compute_client.virtual_machines.list( + resource_group_name=self.provider_config["resource_group"] + ) + + nodes = [self._extract_metadata(vm) for vm in vms] + self.cached_nodes = {node["name"]: node for node in nodes} + + # Update terminating nodes list by removing nodes that + # have finished termination. 
+ self.terminating_nodes = { + k: v for k, v in self.terminating_nodes.items() if not v.done() + } + + return {k: v for k, v in self.cached_nodes.items() if match_tags(v["tags"])} + + def _extract_metadata(self, vm): + # get tags + metadata = {"name": vm.name, "tags": vm.tags, "status": ""} + + # get status + resource_group = self.provider_config["resource_group"] + try: + instance = self.compute_client.virtual_machines.instance_view( + resource_group_name=resource_group, vm_name=vm.name + ).as_dict() + except ResourceNotFoundError: + return metadata + + for status in instance["statuses"]: + # If ProvisioningState is "failed" (e.g., + # ProvisioningState/failed/RetryableError), we can get a third + # string here, so we need to limit to the first two outputs. + code, state = status["code"].split("/")[:2] + # skip provisioning status + if code == "PowerState": + metadata["status"] = state + break + + # get ip data + nic_id = vm.network_profile.network_interfaces[0].id + metadata["nic_name"] = nic_id.split("/")[-1] + nic = self.network_client.network_interfaces.get( + resource_group_name=resource_group, + network_interface_name=metadata["nic_name"], + ) + ip_config = nic.ip_configurations[0] + + # Get public IP if not using internal IPs or if this is the + # head node and use_external_head_ip is True + if not self.provider_config.get("use_internal_ips", False) or ( + self.provider_config.get("use_external_head_ip", False) + and metadata["tags"][TAG_RAY_NODE_KIND] == NODE_KIND_HEAD + ): + public_ip_id = ip_config.public_ip_address.id + metadata["public_ip_name"] = public_ip_id.split("/")[-1] + public_ip = self.network_client.public_ip_addresses.get( + resource_group_name=resource_group, + public_ip_address_name=metadata["public_ip_name"], + ) + metadata["external_ip"] = public_ip.ip_address + + metadata["internal_ip"] = ip_config.private_ip_address + + return metadata + + def stopped_nodes(self, tag_filters): + """Return a list of stopped node ids filtered by the 
specified tags dict.""" + nodes = self._get_filtered_nodes(tag_filters=tag_filters) + return [k for k, v in nodes.items() if v["status"].startswith("deallocat")] + + def non_terminated_nodes(self, tag_filters): + """Return a list of node ids filtered by the specified tags dict. + + This list must not include terminated nodes. For performance reasons, + providers are allowed to cache the result of a call to nodes() to + serve single-node queries (e.g. is_running(node_id)). This means that + nodes() must be called again to refresh results. + + Examples: + >>> from ray.autoscaler.tags import TAG_RAY_NODE_KIND + >>> provider = ... # doctest: +SKIP + >>> provider.non_terminated_nodes( # doctest: +SKIP + ... {TAG_RAY_NODE_KIND: "worker"}) + ["node-1", "node-2"] + """ + nodes = self._get_filtered_nodes(tag_filters=tag_filters) + return [ + k + for k, v in nodes.items() + if not v["status"].startswith("deallocat") or k in self.terminating_nodes + ] + + def is_running(self, node_id): + """Return whether the specified node is running.""" + # always get current status + node = self._get_node(node_id=node_id) + return node["status"] == "running" + + def is_terminated(self, node_id): + """Return whether the specified node is terminated.""" + # always get current status + node = self._get_node(node_id=node_id) + return node["status"].startswith("deallocat") + + def node_tags(self, node_id): + """Returns the tags of the given node (string dict).""" + return self._get_cached_node(node_id=node_id)["tags"] + + def external_ip(self, node_id): + """Returns the external ip of the given node.""" + ip = ( + self._get_cached_node(node_id=node_id)["external_ip"] + or self._get_node(node_id=node_id)["external_ip"] + ) + return ip + + def internal_ip(self, node_id): + """Returns the internal ip (Ray ip) of the given node.""" + ip = ( + self._get_cached_node(node_id=node_id)["internal_ip"] + or self._get_node(node_id=node_id)["internal_ip"] + ) + return ip + + def create_node(self, 
# NOTE(review): reconstructed from a diff-mangled chunk; methods of the
# Azure NodeProvider class. The truncated head of `terminate_node` at the
# chunk edge is omitted (reconstructed in the following region).


def create_node(self, node_config, tags, count):
    """Create `count` nodes, reusing cached stopped nodes when enabled.

    When `cache_stopped_nodes` is set, deallocated VMs whose identity tags
    match are restarted and re-tagged before any new VMs are provisioned.
    """
    resource_group = self.provider_config["resource_group"]

    if self.cache_stopped_nodes:
        VALIDITY_TAGS = [
            TAG_RAY_CLUSTER_NAME,
            TAG_RAY_NODE_KIND,
            TAG_RAY_LAUNCH_CONFIG,
            TAG_RAY_USER_NODE_TYPE,
        ]
        filters = {tag: tags[tag] for tag in VALIDITY_TAGS if tag in tags}
        reuse_nodes = self.stopped_nodes(filters)[:count]
        if reuse_nodes:
            # Fix: only log when something is actually reused; the original
            # logged "Reusing nodes []." on every scale-up.
            logger.info(
                f"Reusing nodes {list(reuse_nodes)}. "
                "To disable reuse, set `cache_stopped_nodes: False` "
                "under `provider` in the cluster configuration.",
            )
        start = get_azure_sdk_function(
            client=self.compute_client.virtual_machines, function_name="start"
        )
        for node_id in reuse_nodes:
            # Block until each reused VM is started, then refresh its tags.
            start(resource_group_name=resource_group, vm_name=node_id).wait()
            self.set_node_tags(node_id, tags)
        count -= len(reuse_nodes)

    if count:
        self._create_node(node_config, tags, count)


def _create_node(self, node_config, tags, count):
    """Create `count` fresh VMs via an ARM template deployment."""
    resource_group = self.provider_config["resource_group"]

    # Load the ARM template shipped alongside this module.
    current_path = Path(__file__).parent
    template_path = current_path.joinpath("azure-vm-template.json")
    with open(template_path, "r") as template_fp:
        template = json.load(template_fp)

    # Merge per-node-config tags with the autoscaler-supplied tags.
    config_tags = node_config.get("tags", {}).copy()
    config_tags.update(tags)
    config_tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name

    vm_name = "{node}-{unique_id}-{vm_id}".format(
        node=config_tags.get(TAG_RAY_NODE_NAME, "node"),
        unique_id=self.provider_config["unique_id"],
        vm_id=uuid4().hex[:UNIQUE_ID_LEN],
    )[:VM_NAME_MAX_LEN]

    template_params = node_config["azure_arm_parameters"].copy()
    template_params["vmName"] = vm_name
    # Provision public IP if not using internal IPs or if this is the
    # head node and use_external_head_ip is True.
    template_params["provisionPublicIp"] = not self.provider_config.get(
        "use_internal_ips", False
    ) or (
        self.provider_config.get("use_external_head_ip", False)
        and config_tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD
    )
    template_params["vmTags"] = config_tags
    template_params["vmCount"] = count
    template_params["msi"] = self.provider_config["msi"]
    template_params["nsg"] = self.provider_config["nsg"]
    template_params["subnet"] = self.provider_config["subnet"]

    parameters = {
        "properties": {
            "mode": DeploymentMode.incremental,
            "template": template,
            "parameters": {
                key: {"value": value} for key, value in template_params.items()
            },
        }
    }

    # TODO: we could get the private/public ips back directly.
    create_or_update = get_azure_sdk_function(
        client=self.resource_client.deployments, function_name="create_or_update"
    )
    create_or_update(
        resource_group_name=resource_group,
        deployment_name=vm_name,
        parameters=parameters,
    ).wait(timeout=AUTOSCALER_NODE_START_WAIT_S)


@synchronized
def set_node_tags(self, node_id, tags):
    """Set the tag values (string dict) for the specified node.

    Merges `tags` into the node's cached tags, pushes the merged set to
    Azure, and updates the local cache. Synchronized because the cache is
    shared across autoscaler threads.
    """
    node_tags = self._get_cached_node(node_id)["tags"]
    node_tags.update(tags)
    update = get_azure_sdk_function(
        client=self.compute_client.virtual_machines, function_name="update"
    )
    update(
        resource_group_name=self.provider_config["resource_group"],
        vm_name=node_id,
        parameters={"tags": node_tags},
    )
    self.cached_nodes[node_id]["tags"] = node_tags
This will delete the VM and + associated resources (NIC, IP, Storage) for the specified node.""" + + resource_group = self.provider_config["resource_group"] + + if self.cache_stopped_nodes: + try: + # stop machine and leave all resources + logger.info( + f"Stopping instance {node_id}" + "(to fully terminate instead, " + "set `cache_stopped_nodes: False` " + "under `provider` in the cluster configuration)" + ) + stop = get_azure_sdk_function( + client=self.compute_client.virtual_machines, + function_name="deallocate", + ) + stop(resource_group_name=resource_group, vm_name=node_id) + except Exception as e: + logger.warning("Failed to stop VM: {}".format(e)) + + # If node_id is in terminating nodes dict, it's already terminating + # Otherwise, kick off termination and add it to the dict + elif node_id not in self.terminating_nodes: + self.terminating_nodes[node_id] = self.termination_executor.submit( + self._delete_node_and_resources, resource_group, node_id + ) + + def _delete_node_and_resources(self, resource_group, node_id): + try: + vm = self.compute_client.virtual_machines.get( + resource_group_name=resource_group, vm_name=node_id + ) + except ResourceNotFoundError as e: + # Node no longer exists + logger.warning("Failed to delete VM: {}".format(e)) + return + + # Gather dependent disks + disks = set() + if vm.storage_profile is not None and vm.storage_profile.data_disks is not None: + for d in vm.storage_profile.data_disks: + if d.name is not None: + disks.add(d.name) + if ( + vm.storage_profile is not None + and vm.storage_profile.os_disk is not None + and vm.storage_profile.os_disk.name is not None + ): + disks.add(vm.storage_profile.os_disk.name) + + # Gather dependent NICs and public IPs + nics = set() + ips = set() + if ( + vm.network_profile is not None + and vm.network_profile.network_interfaces is not None + ): + for nint in vm.network_profile.network_interfaces: + if nint.id is not None: + nic_name = nint.id.split("/")[-1] + nics.add(nic_name) + # Get 
public IP if not using internal IPs or if this is the + # head node and use_external_head_ip is True + if not self.provider_config.get("use_internal_ips", False) or ( + self.provider_config.get("use_external_head_ip", False) + and vm.tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD + ): + nic = self.network_client.network_interfaces.get( + resource_group_name=resource_group, + network_interface_name=nic_name, + ) + if nic.ip_configurations is not None: + for ipc in nic.ip_configurations: + if ipc.public_ip_address.id is not None: + ips.add(ipc.public_ip_address.id.split("/")[-1]) + + # Delete VM + st = time.monotonic() + delete = get_azure_sdk_function( + client=self.compute_client.virtual_machines, + function_name="delete", + ) + try: + delete(resource_group_name=resource_group, vm_name=node_id).wait( + timeout=AUTOSCALER_NODE_TERMINATE_WAIT_S + ) + except Exception as e: + logger.warning("Failed to delete VM: {}".format(e)) + + # Delete disks (no need to wait for these, but gather the LROs for end) + disk_lros = [] + delete = get_azure_sdk_function( + client=self.compute_client.disks, function_name="delete" + ) + for d in disks: + try: + disk_lros.append( + delete( + resource_group_name=resource_group, + disk_name=d, + ) + ) + except Exception as e: + logger.warning("Failed to delete disk: {}".format(e)) + + # Delete NICs + nic_lros = [] + delete = get_azure_sdk_function( + client=self.network_client.network_interfaces, function_name="delete" + ) + for n in nics: + try: + nic_lros.append( + delete( + resource_group_name=resource_group, + network_interface_name=n, + ) + ) + except Exception as e: + logger.warning("Failed to delete NIC: {}".format(e)) + + while ( + not all(nlro.done() for nlro in nic_lros) + and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S + ): + time.sleep(0.1) + + # Delete Public IPs + delete = get_azure_sdk_function( + client=self.network_client.public_ip_addresses, + function_name="delete", + ) + ip_lros = [] + for ip in ips: + try: + 
ip_lros.append( + delete( + resource_group_name=resource_group, + public_ip_address_name=ip, + ) + ) + except Exception as e: + logger.warning("Failed to delete public IP: {}".format(e)) + + while ( + not all(dlro.done() for dlro in disk_lros) + and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S + ): + time.sleep(0.1) + while ( + not all(iplro.done() for iplro in ip_lros) + and (time.monotonic() - st) < AUTOSCALER_NODE_TERMINATE_WAIT_S + ): + time.sleep(0.1) + + def _get_node(self, node_id): + self._get_filtered_nodes({}) # Side effect: updates cache + return self.cached_nodes[node_id] + + def _get_cached_node(self, node_id): + return self.cached_nodes.get(node_id) or self._get_node(node_id=node_id) + + @staticmethod + def bootstrap_config(cluster_config): + return bootstrap_azure(cluster_config) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..6874d90376b6c7110b58e041a16dc03ab384c0d2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/autoscaler.py @@ -0,0 +1,1508 @@ +import copy +import logging +import math +import operator +import os +import queue +import subprocess +import threading +import time +from collections import Counter, defaultdict, namedtuple +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Set, Tuple, Union + +import yaml + +import ray +import ray._private.ray_constants as ray_constants +from ray.autoscaler._private.constants import ( + AUTOSCALER_HEARTBEAT_TIMEOUT_S, + AUTOSCALER_MAX_CONCURRENT_LAUNCHES, + AUTOSCALER_MAX_LAUNCH_BATCH, + AUTOSCALER_MAX_NUM_FAILURES, + AUTOSCALER_STATUS_LOG, + AUTOSCALER_UPDATE_INTERVAL_S, + DISABLE_LAUNCH_CONFIG_CHECK_KEY, + DISABLE_NODE_UPDATERS_KEY, + FOREGROUND_NODE_LAUNCH_KEY, + WORKER_LIVENESS_CHECK_KEY, +) +from 
# NOTE(review): reconstructed from a diff-mangled chunk of autoscaler.py.
# The import statement truncated at the chunk's leading edge is omitted;
# the StandardAutoscaler class statement begins in the following region.

from ray.autoscaler._private.legacy_info_string import legacy_log_info_string
from ray.autoscaler._private.load_metrics import LoadMetrics
from ray.autoscaler._private.local.node_provider import (
    LocalNodeProvider,
    record_local_head_state_if_needed,
)
from ray.autoscaler._private.node_launcher import BaseNodeLauncher, NodeLauncher
from ray.autoscaler._private.node_provider_availability_tracker import (
    NodeAvailabilitySummary,
    NodeProviderAvailabilityTracker,
)
from ray.autoscaler._private.node_tracker import NodeTracker
from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics
from ray.autoscaler._private.providers import _get_node_provider
from ray.autoscaler._private.resource_demand_scheduler import (
    ResourceDemandScheduler,
    ResourceDict,
    get_bin_pack_residual,
)
from ray.autoscaler._private.updater import NodeUpdaterThread
from ray.autoscaler._private.util import (
    ConcurrentCounter,
    NodeCount,
    NodeID,
    NodeIP,
    NodeType,
    NodeTypeConfigDict,
    format_info_string,
    hash_launch_conf,
    hash_runtime_conf,
    validate_config,
    with_head_node_ip,
)
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import (
    NODE_KIND_HEAD,
    NODE_KIND_UNMANAGED,
    NODE_KIND_WORKER,
    STATUS_UP_TO_DATE,
    STATUS_UPDATE_FAILED,
    TAG_RAY_FILE_MOUNTS_CONTENTS,
    TAG_RAY_LAUNCH_CONFIG,
    TAG_RAY_NODE_KIND,
    TAG_RAY_NODE_STATUS,
    TAG_RAY_RUNTIME_CONFIG,
    TAG_RAY_USER_NODE_TYPE,
)
from ray.exceptions import RpcError

logger = logging.getLogger(__name__)

# Status of a node e.g. "up-to-date", see ray/autoscaler/tags.py.
NodeStatus = str

# Tuple of modified fields for the given node_id returned by should_update
# that will be passed into a NodeUpdaterThread.
UpdateInstructions = namedtuple(
    "UpdateInstructions",
    ["node_id", "setup_commands", "ray_start_commands", "docker_config"],
)

NodeLaunchData = Tuple[NodeTypeConfigDict, NodeCount, Optional[NodeType]]


@dataclass
class AutoscalerSummary:
    """Snapshot of autoscaler state used for status reporting."""

    active_nodes: Dict[NodeType, int]
    idle_nodes: Optional[Dict[NodeType, int]]
    pending_nodes: List[Tuple[NodeIP, NodeType, NodeStatus]]
    pending_launches: Dict[NodeType, int]
    failed_nodes: List[Tuple[NodeIP, NodeType]]
    node_availability_summary: NodeAvailabilitySummary = field(
        default_factory=lambda: NodeAvailabilitySummary({})
    )
    # A dictionary of node IP to a list of reasons the node is not idle.
    node_activities: Optional[Dict[str, Tuple[NodeIP, List[str]]]] = None
    pending_resources: Dict[str, int] = field(default_factory=lambda: {})
    # A mapping from node name (the same key as `usage_by_node`) to node type.
    # Optional for deployment modes which have the concept of node types and
    # backwards compatibility.
    node_type_mapping: Optional[Dict[str, str]] = None
    # Whether the autoscaler summary is v1 or v2.
    legacy: bool = False


class NonTerminatedNodes:
    """Class to extract and organize information on non-terminated nodes."""

    def __init__(self, provider: NodeProvider):
        start_time = time.time()
        # All non-terminated nodes.
        self.all_node_ids = provider.non_terminated_nodes({})

        # Managed worker nodes (node kind "worker"):
        self.worker_ids: List[NodeID] = []
        # The head node (node kind "head").
        self.head_id: Optional[NodeID] = None

        for node in self.all_node_ids:
            node_kind = provider.node_tags(node)[TAG_RAY_NODE_KIND]
            if node_kind == NODE_KIND_WORKER:
                self.worker_ids.append(node)
            elif node_kind == NODE_KIND_HEAD:
                self.head_id = node

        # Note: For typical use-cases, self.all_node_ids == self.worker_ids
        # + [self.head_id]. The difference being in the case of unmanaged
        # nodes.

        # Record the time of the non_terminated_nodes call. This typically
        # translates to a "describe" or "list" call on most cluster managers
        # which can be quite expensive. Note that we include the processing
        # time because on some clients, there may be pagination and the
        # underlying api calls may be done lazily.
        self.non_terminated_nodes_time = time.time() - start_time
        logger.info(
            f"The autoscaler took {round(self.non_terminated_nodes_time, 3)}"
            " seconds to fetch the list of non-terminated nodes."
        )

    def remove_terminating_nodes(self, terminating_nodes: List[NodeID]) -> None:
        """Remove nodes we're in the process of terminating from internal
        state."""

        def not_terminating(node):
            return node not in terminating_nodes

        self.worker_ids = list(filter(not_terminating, self.worker_ids))
        self.all_node_ids = list(filter(not_terminating, self.all_node_ids))


# Whether a worker should be kept based on the min_workers and
# max_workers constraints.
#
# keep: should keep the worker
# terminate: should terminate the worker
# decide_later: the worker can be terminated if needed
KeepOrTerminate = Enum("KeepOrTerminate", "keep terminate decide_later")
+ """ + + def __init__( + self, + # TODO(ekl): require config reader to be a callable always. + config_reader: Union[str, Callable[[], dict]], + load_metrics: LoadMetrics, + gcs_client: "ray._raylet.GcsClient", + session_name: Optional[str] = None, + max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH, + max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES, + max_failures: int = AUTOSCALER_MAX_NUM_FAILURES, + process_runner: Any = subprocess, + update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S, + prefix_cluster_info: bool = False, + event_summarizer: Optional[EventSummarizer] = None, + prom_metrics: Optional[AutoscalerPrometheusMetrics] = None, + ): + """Create a StandardAutoscaler. + + Args: + config_reader: Path to a Ray Autoscaler YAML, or a function to read + and return the latest config. + load_metrics: Provides metrics for the Ray cluster. + session_name: The session name of the cluster this autoscaler + is deployed. + max_launch_batch: Max number of nodes to launch in one request. + max_concurrent_launches: Max number of nodes that can be + concurrently launched. This value and `max_launch_batch` + determine the number of batches that are used to launch nodes. + max_failures: Number of failures that the autoscaler will tolerate + before exiting. + process_runner: Subproc-like interface used by the CommandRunner. + update_interval_s: Seconds between running the autoscaling loop. + prefix_cluster_info: Whether to add the cluster name to info strs. + event_summarizer: Utility to consolidate duplicated messages. + prom_metrics: Prometheus metrics for autoscaler-related operations. + gcs_client: client for interactions with the GCS. Used to drain nodes + before termination. + """ + + if isinstance(config_reader, str): + # Auto wrap with file reader. 
+ def read_fn(): + with open(config_reader) as f: + new_config = yaml.safe_load(f.read()) + return new_config + + self.config_reader = read_fn + else: + self.config_reader = config_reader + + self.node_provider_availability_tracker = NodeProviderAvailabilityTracker() + # Prefix each line of info string with cluster name if True + self.prefix_cluster_info = prefix_cluster_info + # Keep this before self.reset (self.provider needs to be created + # exactly once). + self.provider = None + # Keep this before self.reset (if an exception occurs in reset + # then prom_metrics must be instantitiated to increment the + # exception counter) + self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics( + session_name=session_name + ) # noqa + self.resource_demand_scheduler = None + self.reset(errors_fatal=True) + self.load_metrics = load_metrics + + self.max_failures = max_failures + self.max_launch_batch = max_launch_batch + self.max_concurrent_launches = max_concurrent_launches + self.process_runner = process_runner + self.event_summarizer = event_summarizer or EventSummarizer() + + # Map from node_id to NodeUpdater threads + self.updaters: Dict[NodeID, NodeUpdaterThread] = {} + self.num_failed_updates: Dict[NodeID, int] = defaultdict(int) + self.num_successful_updates: Dict[NodeID, int] = defaultdict(int) + self.num_failures = 0 + self.last_update_time = 0.0 + self.update_interval_s = update_interval_s + + # Keeps track of pending and running nodes + self.non_terminated_nodes: Optional[NonTerminatedNodes] = None + + # Tracks nodes scheduled for termination + self.nodes_to_terminate: List[NodeID] = [] + + # Disable NodeUpdater threads if true. + # Should be set to true in situations where another component, such as + # a Kubernetes operator, is responsible for Ray setup on nodes. 
+ self.disable_node_updaters = self.config["provider"].get( + DISABLE_NODE_UPDATERS_KEY, False + ) + logger.info(f"{DISABLE_NODE_UPDATERS_KEY}:{self.disable_node_updaters}") + + # Disable launch config checking if true. + # This is set in the fake_multinode situations where there isn't any + # meaningful node "type" to enforce. + self.disable_launch_config_check = self.config["provider"].get( + DISABLE_LAUNCH_CONFIG_CHECK_KEY, False + ) + logger.info( + f"{DISABLE_LAUNCH_CONFIG_CHECK_KEY}:{self.disable_launch_config_check}" + ) + + # By default, the autoscaler launches nodes in batches asynchronously in + # background threads. + # When the following flag is set, that behavior is disabled, so that nodes + # are launched in the main thread, all in one batch, blocking until all + # NodeProvider.create_node calls have returned. + self.foreground_node_launch = self.config["provider"].get( + FOREGROUND_NODE_LAUNCH_KEY, False + ) + logger.info(f"{FOREGROUND_NODE_LAUNCH_KEY}:{self.foreground_node_launch}") + + # By default, the autoscaler kills and/or tries to recover + # a worker node if it hasn't produced a resource heartbeat in the last 30 + # seconds. The worker_liveness_check flag allows disabling this behavior in + # settings where another component, such as a Kubernetes operator, is + # responsible for healthchecks. 
+ self.worker_liveness_check = self.config["provider"].get( + WORKER_LIVENESS_CHECK_KEY, True + ) + logger.info(f"{WORKER_LIVENESS_CHECK_KEY}:{self.worker_liveness_check}") + + # Node launchers + self.foreground_node_launcher: Optional[BaseNodeLauncher] = None + self.launch_queue: Optional[queue.Queue[NodeLaunchData]] = None + self.pending_launches = ConcurrentCounter() + if self.foreground_node_launch: + self.foreground_node_launcher = BaseNodeLauncher( + provider=self.provider, + pending=self.pending_launches, + event_summarizer=self.event_summarizer, + node_provider_availability_tracker=self.node_provider_availability_tracker, # noqa: E501 Flake and black disagree how to format this. + session_name=session_name, + node_types=self.available_node_types, + prom_metrics=self.prom_metrics, + ) + else: + self.launch_queue = queue.Queue() + max_batches = math.ceil(max_concurrent_launches / float(max_launch_batch)) + for i in range(int(max_batches)): + node_launcher = NodeLauncher( + provider=self.provider, + queue=self.launch_queue, + index=i, + pending=self.pending_launches, + event_summarizer=self.event_summarizer, + node_provider_availability_tracker=self.node_provider_availability_tracker, # noqa: E501 Flake and black disagreee how to format this. + session_name=session_name, + node_types=self.available_node_types, + prom_metrics=self.prom_metrics, + ) + node_launcher.daemon = True + node_launcher.start() + + # NodeTracker maintains soft state to track the number of recently + # failed nodes. It is best effort only. + self.node_tracker = NodeTracker() + + # Expand local file_mounts to allow ~ in the paths. This can't be done + # earlier when the config is written since we might be on different + # platform and the expansion would result in wrong path. 
+ self.config["file_mounts"] = { + remote: os.path.expanduser(local) + for remote, local in self.config["file_mounts"].items() + } + + self.gcs_client = gcs_client + + for local_path in self.config["file_mounts"].values(): + assert os.path.exists(local_path) + logger.info("StandardAutoscaler: {}".format(self.config)) + + @property + def all_node_types(self) -> Set[str]: + return self.config["available_node_types"].keys() + + def update(self): + try: + self.reset(errors_fatal=False) + self._update() + except Exception as e: + self.prom_metrics.update_loop_exceptions.inc() + logger.exception("StandardAutoscaler: Error during autoscaling.") + self.num_failures += 1 + if self.num_failures > self.max_failures: + logger.critical("StandardAutoscaler: Too many errors, abort.") + raise e + + def _update(self): + # For type checking, assert that these objects have been instantitiated. + assert self.provider + assert self.resource_demand_scheduler + + now = time.time() + # Throttle autoscaling updates to this interval to avoid exceeding + # rate limits on API calls. + if now - self.last_update_time < self.update_interval_s: + return + + self.last_update_time = now + + # Query the provider to update the list of non-terminated nodes + self.non_terminated_nodes = NonTerminatedNodes(self.provider) + + # Back off the update if the provider says it's not safe to proceed. + if not self.provider.safe_to_scale(): + logger.info( + "Backing off of autoscaler update." + f" Will try again in {self.update_interval_s} seconds." + ) + return + + # This will accumulate the nodes we need to terminate. + self.nodes_to_terminate = [] + + # Update running nodes gauge + num_workers = len(self.non_terminated_nodes.worker_ids) + self.prom_metrics.running_workers.set(num_workers) + + # Remove from LoadMetrics the ips unknown to the NodeProvider. 
+ self.load_metrics.prune_active_ips( + active_ips=[ + self.provider.internal_ip(node_id) + for node_id in self.non_terminated_nodes.all_node_ids + ] + ) + + # Update status strings + if AUTOSCALER_STATUS_LOG: + logger.info(self.info_string()) + legacy_log_info_string(self, self.non_terminated_nodes.worker_ids) + + if not self.provider.is_readonly(): + self.terminate_nodes_to_enforce_config_constraints(now) + + if self.disable_node_updaters: + # Don't handle unhealthy nodes if the liveness check is disabled. + # self.worker_liveness_check is True by default. + if self.worker_liveness_check: + self.terminate_unhealthy_nodes(now) + else: + self.process_completed_updates() + self.update_nodes() + # Don't handle unhealthy nodes if the liveness check is disabled. + # self.worker_liveness_check is True by default. + if self.worker_liveness_check: + self.attempt_to_recover_unhealthy_nodes(now) + self.set_prometheus_updater_data() + + # Dict[NodeType, int], List[ResourceDict] + to_launch, unfulfilled = self.resource_demand_scheduler.get_nodes_to_launch( + self.non_terminated_nodes.all_node_ids, + self.pending_launches.breakdown(), + self.load_metrics.get_resource_demand_vector(), + self.load_metrics.get_resource_utilization(), + self.load_metrics.get_pending_placement_groups(), + self.load_metrics.get_static_node_resources_by_ip(), + ensure_min_cluster_size=self.load_metrics.get_resource_requests(), + node_availability_summary=self.node_provider_availability_tracker.summary(), + ) + self._report_pending_infeasible(unfulfilled) + + if not self.provider.is_readonly(): + self.launch_required_nodes(to_launch) + + # Execute optional end-of-update logic. + # Keep this method call at the end of autoscaler._update(). + self.provider.post_process() + + # Record the amount of time the autoscaler took for + # this _update() iteration. 
+ update_time = time.time() - self.last_update_time + logger.info( + f"The autoscaler took {round(update_time, 3)}" + " seconds to complete the update iteration." + ) + self.prom_metrics.update_time.observe(update_time) + + def terminate_nodes_to_enforce_config_constraints(self, now: float): + """Terminates nodes to enforce constraints defined by the autoscaling + config. + + (1) Terminates nodes in excess of `max_workers`. + (2) Terminates nodes idle for longer than `idle_timeout_minutes`. + (3) Terminates outdated nodes, + namely nodes whose configs don't match `node_config` for the + relevant node type. + + Avoids terminating non-outdated nodes required by + autoscaler.sdk.request_resources(). + """ + # For type checking, assert that these objects have been instantitiated. + assert self.non_terminated_nodes + assert self.provider + + last_used = self.load_metrics.ray_nodes_last_used_time_by_ip + + idle_timeout_s = 60 * self.config["idle_timeout_minutes"] + + last_used_cutoff = now - idle_timeout_s + + # Sort based on last used to make sure to keep min_workers that + # were most recently used. Otherwise, _keep_min_workers_of_node_type + # might keep a node that should be terminated. + sorted_node_ids = self._sort_based_on_last_used( + self.non_terminated_nodes.worker_ids, last_used + ) + + # Don't terminate nodes needed by request_resources() + nodes_not_allowed_to_terminate: FrozenSet[NodeID] = {} + if self.load_metrics.get_resource_requests(): + nodes_not_allowed_to_terminate = ( + self._get_nodes_needed_for_request_resources(sorted_node_ids) + ) + + # Tracks counts of nodes we intend to keep for each node type. + node_type_counts = defaultdict(int) + + def keep_node(node_id: NodeID) -> None: + assert self.provider + # Update per-type counts. + tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in tags: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + node_type_counts[node_type] += 1 + + # Nodes that we could terminate, if needed. 
# NOTE(review): reconstructed from a diff-mangled chunk; methods of
# StandardAutoscaler (the truncated head of `terminate_scheduled_nodes` at
# the chunk edge is omitted here and reconstructed in the following region).


def terminate_nodes_to_enforce_config_constraints(self, now: float):
    """Terminates nodes to enforce constraints defined by the autoscaling
    config.

    (1) Terminates nodes in excess of `max_workers`.
    (2) Terminates nodes idle for longer than `idle_timeout_minutes`.
    (3) Terminates outdated nodes,
        namely nodes whose configs don't match `node_config` for the
        relevant node type.

    Avoids terminating non-outdated nodes required by
    autoscaler.sdk.request_resources().
    """
    # For type checking, assert that these objects have been instantiated.
    assert self.non_terminated_nodes
    assert self.provider

    last_used = self.load_metrics.ray_nodes_last_used_time_by_ip

    idle_timeout_s = 60 * self.config["idle_timeout_minutes"]

    last_used_cutoff = now - idle_timeout_s

    # Sort based on last used to make sure to keep min_workers that
    # were most recently used. Otherwise, _keep_min_workers_of_node_type
    # might keep a node that should be terminated.
    sorted_node_ids = self._sort_based_on_last_used(
        self.non_terminated_nodes.worker_ids, last_used
    )

    # Don't terminate nodes needed by request_resources().
    # Fix: the original initialized this to `{}` (an empty *dict*), despite
    # the FrozenSet annotation; use an actual frozenset.
    nodes_not_allowed_to_terminate: FrozenSet[NodeID] = frozenset()
    if self.load_metrics.get_resource_requests():
        nodes_not_allowed_to_terminate = (
            self._get_nodes_needed_for_request_resources(sorted_node_ids)
        )

    # Tracks counts of nodes we intend to keep for each node type.
    node_type_counts = defaultdict(int)

    def keep_node(node_id: NodeID) -> None:
        assert self.provider
        # Update per-type counts.
        tags = self.provider.node_tags(node_id)
        if TAG_RAY_USER_NODE_TYPE in tags:
            node_type = tags[TAG_RAY_USER_NODE_TYPE]
            node_type_counts[node_type] += 1

    # Nodes that we could terminate, if needed.
    nodes_we_could_terminate: List[NodeID] = []

    for node_id in sorted_node_ids:
        # Make sure to not kill idle node types if the number of workers
        # of that type is lower/equal to the min_workers of that type
        # or it is needed for request_resources().
        should_keep_or_terminate, reason = self._keep_worker_of_node_type(
            node_id, node_type_counts
        )
        if should_keep_or_terminate == KeepOrTerminate.terminate:
            self.schedule_node_termination(node_id, reason, logger.info)
            continue
        if (
            should_keep_or_terminate == KeepOrTerminate.keep
            or node_id in nodes_not_allowed_to_terminate
        ) and self.launch_config_ok(node_id):
            keep_node(node_id)
            continue

        node_ip = self.provider.internal_ip(node_id)

        if node_ip in last_used and last_used[node_ip] < last_used_cutoff:
            self.schedule_node_termination(node_id, "idle", logger.info)
            # Get the local time of the node's last use as a string.
            formatted_last_used_time = time.asctime(
                time.localtime(last_used[node_ip])
            )
            logger.info(f"Node last used: {formatted_last_used_time}.")
            # Note that the current time will appear in the log prefix.
        elif not self.launch_config_ok(node_id):
            self.schedule_node_termination(node_id, "outdated", logger.info)
        else:
            keep_node(node_id)
            nodes_we_could_terminate.append(node_id)

    # Terminate nodes if there are too many.
    num_workers = len(self.non_terminated_nodes.worker_ids)
    num_extra_nodes_to_terminate = (
        num_workers - len(self.nodes_to_terminate) - self.config["max_workers"]
    )

    if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
        logger.warning(
            "StandardAutoscaler: trying to terminate "
            f"{num_extra_nodes_to_terminate} nodes, while only "
            f"{len(nodes_we_could_terminate)} are safe to terminate."
            " Inconsistent config is likely."
        )
        num_extra_nodes_to_terminate = len(nodes_we_could_terminate)

    # If num_extra_nodes_to_terminate is negative or zero,
    # we would have less than max_workers nodes after terminating
    # nodes_to_terminate and we do not need to terminate anything else.
    if num_extra_nodes_to_terminate > 0:
        extra_nodes_to_terminate = nodes_we_could_terminate[
            -num_extra_nodes_to_terminate:
        ]
        for node_id in extra_nodes_to_terminate:
            self.schedule_node_termination(node_id, "max workers", logger.info)

    self.terminate_scheduled_nodes()


def schedule_node_termination(
    self, node_id: NodeID, reason_opt: Optional[str], logger_method: Callable
) -> None:
    """Record node_id for termination, logging and summarizing the reason.

    Raises:
        Exception: if reason_opt is None (a reason is mandatory).
    """
    # For type checking, assert that this object has been instantiated.
    assert self.provider

    if reason_opt is None:
        raise Exception("reason should be not None.")
    reason: str = reason_opt
    node_ip = self.provider.internal_ip(node_id)
    # Log, record an event, and add node_id to nodes_to_terminate.
    logger_method(
        "StandardAutoscaler: "
        f"Terminating the node with id {node_id}"
        f" and ip {node_ip}."
        f" ({reason})"
    )
    self.event_summarizer.add(
        "Removing {} nodes of type "
        + self._get_node_type(node_id)
        + " ({}).".format(reason),
        quantity=1,
        aggregate=operator.add,
    )
    self.nodes_to_terminate.append(node_id)
# NOTE(review): reconstructed from a diff-mangled chunk; methods of
# StandardAutoscaler. The truncated tail of `process_completed_updates` at
# the chunk's trailing edge is omitted.


def terminate_scheduled_nodes(self):
    """Terminate scheduled nodes and clean associated autoscaler state."""
    # For type checking, assert that these objects have been instantiated.
    assert self.provider
    assert self.non_terminated_nodes

    if not self.nodes_to_terminate:
        return

    # Drain the nodes before the provider tears them down.
    self.drain_nodes_via_gcs(self.nodes_to_terminate)
    # Terminate the nodes.
    self.provider.terminate_nodes(self.nodes_to_terminate)
    for node in self.nodes_to_terminate:
        self.node_tracker.untrack(node)
        self.prom_metrics.stopped_nodes.inc()

    # Update internal node lists.
    self.non_terminated_nodes.remove_terminating_nodes(self.nodes_to_terminate)

    self.nodes_to_terminate = []


def drain_nodes_via_gcs(self, provider_node_ids_to_drain: List[NodeID]):
    """Send an RPC request to the GCS to drain (prepare for termination)
    the nodes with the given node provider ids.

    note: The current implementation of DrainNode on the GCS side is to
    de-register and gracefully shut down the Raylets. In the future,
    the behavior may change to better reflect the name "Drain."
    See https://github.com/ray-project/ray/pull/19350.
    """
    # For type checking, assert that this object has been instantiated.
    assert self.provider

    # The GCS expects Raylet ids in the request, rather than NodeProvider
    # ids. To get the Raylet ids of the nodes to we're draining, we make
    # the following translations of identifiers:
    # node provider node id -> ip -> raylet id

    # Convert node provider node ids to ips.
    node_ips = set()
    failed_ip_fetch = False
    for provider_node_id in provider_node_ids_to_drain:
        # If the provider's call to fetch ip fails, the exception is not
        # fatal. Log the exception and proceed.
        try:
            ip = self.provider.internal_ip(provider_node_id)
            node_ips.add(ip)
        except Exception:
            logger.exception(
                "Failed to get ip of node with id"
                f" {provider_node_id} during scale-down."
            )
            failed_ip_fetch = True
    if failed_ip_fetch:
        self.prom_metrics.drain_node_exceptions.inc()

    # Only attempt to drain connected nodes, i.e. nodes with ips in
    # LoadMetrics.
    connected_node_ips = node_ips & self.load_metrics.raylet_id_by_ip.keys()

    # Convert ips to Raylet ids.
    # (The assignment ip->raylet_id is well-defined under current
    # assumptions. See "use_node_id_as_ip" in monitor.py)
    raylet_ids_to_drain = {
        self.load_metrics.raylet_id_by_ip[ip] for ip in connected_node_ips
    }

    if not raylet_ids_to_drain:
        return

    logger.info(f"Draining {len(raylet_ids_to_drain)} raylet(s).")
    try:
        # A successful response indicates that the GCS has marked the
        # desired nodes as "drained." The cloud provider can then terminate
        # the nodes without the GCS printing an error.
        # Check if we succeeded in draining all of the intended nodes by
        # looking at the RPC response.
        drained_raylet_ids = set(
            self.gcs_client.drain_nodes(raylet_ids_to_drain, timeout=5)
        )
        failed_to_drain = raylet_ids_to_drain - drained_raylet_ids
        if failed_to_drain:
            self.prom_metrics.drain_node_exceptions.inc()
            logger.error(f"Failed to drain {len(failed_to_drain)} raylet(s).")
    # If we get a gRPC error with an UNIMPLEMENTED code, fail silently.
    # This error indicates that the GCS is using Ray version < 1.8.0,
    # for which DrainNode is not implemented.
    except RpcError as e:
        # If the code is UNIMPLEMENTED, pass.
        if e.rpc_code == ray._raylet.GRPC_STATUS_CODE_UNIMPLEMENTED:
            pass
        # Otherwise, it's a plain old gRPC error and we should log it.
        else:
            self.prom_metrics.drain_node_exceptions.inc()
            logger.exception("Failed to drain Ray nodes. Traceback follows.")
    except Exception:
        # We don't need to interrupt the autoscaler update with an
        # exception, but we should log what went wrong and record the
        # failure in Prometheus.
        self.prom_metrics.drain_node_exceptions.inc()
        logger.exception("Failed to drain Ray nodes. Traceback follows.")


def launch_required_nodes(self, to_launch: Dict[NodeType, int]) -> None:
    """Kick off a launch for each required (node_type, count) pair."""
    if to_launch:
        for node_type, count in to_launch.items():
            self.launch_new_node(count, node_type=node_type)


def update_nodes(self):
    """Run NodeUpdaterThreads to run setup commands, sync files,
    and/or start Ray.
    """
    # Update nodes with out-of-date files.
    # TODO(edoakes): Spawning these threads directly seems to cause
    # problems. They should at a minimum be spawned as daemon threads.
    # See https://github.com/ray-project/ray/pull/5903 for more info.
    T = []
    for node_id, setup_commands, ray_start_commands, docker_config in (
        self.should_update(node_id)
        for node_id in self.non_terminated_nodes.worker_ids
    ):
        if node_id is not None:
            resources = self._node_resources(node_id)
            labels = self._node_labels(node_id)
            logger.debug(f"{node_id}: Starting new thread runner.")
            T.append(
                threading.Thread(
                    target=self.spawn_updater,
                    args=(
                        node_id,
                        setup_commands,
                        ray_start_commands,
                        resources,
                        labels,
                        docker_config,
                    ),
                )
            )
    for t in T:
        t.start()
    for t in T:
        t.join()
+ self.load_metrics.mark_active(self.provider.internal_ip(node_id)) + else: + failed_nodes.append(node_id) + self.num_failed_updates[node_id] += 1 + self.prom_metrics.failed_updates.inc() + if updater.for_recovery: + self.prom_metrics.failed_recoveries.inc() + self.node_tracker.untrack(node_id) + del self.updaters[node_id] + + if failed_nodes: + # Some nodes in failed_nodes may already have been terminated + # during an update (for being idle after missing a heartbeat). + + # Update the list of non-terminated workers. + for node_id in failed_nodes: + # Check if the node has already been terminated. + if node_id in self.non_terminated_nodes.worker_ids: + self.schedule_node_termination( + node_id, "launch failed", logger.error + ) + else: + logger.warning( + f"StandardAutoscaler: {node_id}:" + " Failed to update node." + " Node has already been terminated." + ) + self.terminate_scheduled_nodes() + + def set_prometheus_updater_data(self): + """Record total number of active NodeUpdaterThreads and how many of + these are being run to recover nodes. + """ + self.prom_metrics.updating_nodes.set(len(self.updaters)) + num_recovering = 0 + for updater in self.updaters.values(): + if updater.for_recovery: + num_recovering += 1 + self.prom_metrics.recovering_nodes.set(num_recovering) + + def _report_pending_infeasible(self, unfulfilled: List[ResourceDict]): + """Emit event messages for infeasible or unschedulable tasks. + + This adds messages to the event summarizer for warning on infeasible + or "cluster full" resource requests. + + Args: + unfulfilled: List of resource demands that would be unfulfilled + even after full scale-up. + """ + # For type checking, assert that this object has been instantitiated. 
+ assert self.resource_demand_scheduler + pending = [] + infeasible = [] + for bundle in unfulfilled: + placement_group = any( + "_group_" in k + or k == ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME + for k in bundle + ) + if placement_group: + continue + if self.resource_demand_scheduler.is_feasible(bundle): + pending.append(bundle) + else: + infeasible.append(bundle) + if pending: + if self.load_metrics.cluster_full_of_actors_detected: + for request in pending: + self.event_summarizer.add_once_per_interval( + "Warning: The following resource request cannot be " + "scheduled right now: {}. This is likely due to all " + "cluster resources being claimed by actors. Consider " + "creating fewer actors or adding more nodes " + "to this Ray cluster.".format(request), + key="pending_{}".format(sorted(request.items())), + interval_s=30, + ) + if infeasible: + for request in infeasible: + self.event_summarizer.add_once_per_interval( + "Error: No available node types can fulfill resource " + "request {}. Add suitable node types to this cluster to " + "resolve this issue.".format(request), + key="infeasible_{}".format(sorted(request.items())), + interval_s=30, + ) + + def _sort_based_on_last_used( + self, nodes: List[NodeID], last_used: Dict[str, float] + ) -> List[NodeID]: + """Sort the nodes based on the last time they were used. + + The first item in the return list is the most recently used. + """ + last_used_copy = copy.deepcopy(last_used) + # Add the unconnected nodes as the least recently used (the end of + # list). This prioritizes connected nodes. 
+ least_recently_used = -1 + + def last_time_used(node_id: NodeID): + assert self.provider + node_ip = self.provider.internal_ip(node_id) + if node_ip not in last_used_copy: + return least_recently_used + else: + return last_used_copy[node_ip] + + return sorted(nodes, key=last_time_used, reverse=True) + + def _get_nodes_needed_for_request_resources( + self, sorted_node_ids: List[NodeID] + ) -> FrozenSet[NodeID]: + # TODO(ameer): try merging this with resource_demand_scheduler + # code responsible for adding nodes for request_resources(). + """Returns the nodes NOT allowed to terminate due to request_resources(). + + Args: + sorted_node_ids: the node ids sorted based on last used (LRU last). + + Returns: + FrozenSet[NodeID]: a set of nodes (node ids) that + we should NOT terminate. + """ + # For type checking, assert that this object has been instantitiated. + assert self.provider + + nodes_not_allowed_to_terminate: Set[NodeID] = set() + static_node_resources: Dict[ + NodeIP, ResourceDict + ] = self.load_metrics.get_static_node_resources_by_ip() + + head_node_resources: ResourceDict = copy.deepcopy( + self.available_node_types[self.config["head_node_type"]]["resources"] + ) + # TODO(ameer): this is somewhat duplicated in + # resource_demand_scheduler.py. + if not head_node_resources: + # Legacy yaml might include {} in the resources field. + head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id) + head_node_resources = static_node_resources.get(head_node_ip, {}) + + max_node_resources: List[ResourceDict] = [head_node_resources] + resource_demand_vector_worker_node_ids = [] + # Get max resources on all the non terminated nodes. 
+ for node_id in sorted_node_ids: + tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in tags: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + node_resources: ResourceDict = copy.deepcopy( + self.available_node_types[node_type]["resources"] + ) + if not node_resources: + # Legacy yaml might include {} in the resources field. + node_ip = self.provider.internal_ip(node_id) + node_resources = static_node_resources.get(node_ip, {}) + max_node_resources.append(node_resources) + resource_demand_vector_worker_node_ids.append(node_id) + # Since it is sorted based on last used, we "keep" nodes that are + # most recently used when we binpack. We assume get_bin_pack_residual + # is following the given order here. + used_resource_requests: List[ResourceDict] + _, used_resource_requests = get_bin_pack_residual( + max_node_resources, self.load_metrics.get_resource_requests() + ) + # Remove the first entry (the head node). + max_node_resources.pop(0) + # Remove the first entry (the head node). + used_resource_requests.pop(0) + for i, node_id in enumerate(resource_demand_vector_worker_node_ids): + if ( + used_resource_requests[i] == max_node_resources[i] + and max_node_resources[i] + ): + # No resources of the node were needed for request_resources(). + # max_node_resources[i] is an empty dict for legacy yamls + # before the node is connected. + pass + else: + nodes_not_allowed_to_terminate.add(node_id) + return frozenset(nodes_not_allowed_to_terminate) + + def _keep_worker_of_node_type( + self, node_id: NodeID, node_type_counts: Dict[NodeType, int] + ) -> Tuple[KeepOrTerminate, Optional[str]]: + """Determines if a worker should be kept based on the min_workers + and max_workers constraint of the worker's node_type. + + Returns KeepOrTerminate.keep when both of the following hold: + (a) The worker's node_type is present among the keys of the current + config's available_node_types dict. 
+ (b) Deleting the node would violate the min_workers constraint for that + worker's node_type. + + Returns KeepOrTerminate.terminate when both the following hold: + (a) The worker's node_type is not present among the keys of the current + config's available_node_types dict. + (b) Keeping the node would violate the max_workers constraint for that + worker's node_type. + + Return KeepOrTerminate.decide_later otherwise. + + Args: + node_type_counts(Dict[NodeType, int]): The non_terminated node + types counted so far. + Returns: + KeepOrTerminate: keep if the node should be kept, terminate if the + node should be terminated, decide_later if we are allowed + to terminate it, but do not have to. + Optional[str]: reason for termination. Not None on + KeepOrTerminate.terminate, None otherwise. + """ + # For type checking, assert that this object has been instantitiated. + assert self.provider + + tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in tags: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + + min_workers = self.available_node_types.get(node_type, {}).get( + "min_workers", 0 + ) + max_workers = self.available_node_types.get(node_type, {}).get( + "max_workers", 0 + ) + if node_type not in self.available_node_types: + # The node type has been deleted from the cluster config. + # Allow terminating it if needed. 
+ available_node_types = list(self.available_node_types.keys()) + return ( + KeepOrTerminate.terminate, + f"not in available_node_types: {available_node_types}", + ) + new_count = node_type_counts[node_type] + 1 + if new_count <= min(min_workers, max_workers): + return KeepOrTerminate.keep, None + if new_count > max_workers: + return KeepOrTerminate.terminate, "max_workers_per_type" + + return KeepOrTerminate.decide_later, None + + def _node_resources(self, node_id): + node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE) + if self.available_node_types: + return self.available_node_types.get(node_type, {}).get("resources", {}) + else: + return {} + + def _node_labels(self, node_id): + node_type = self.provider.node_tags(node_id).get(TAG_RAY_USER_NODE_TYPE) + if self.available_node_types: + return self.available_node_types.get(node_type, {}).get("labels", {}) + else: + return {} + + def reset(self, errors_fatal=False): + sync_continuously = False + if hasattr(self, "config"): + sync_continuously = self.config.get("file_mounts_sync_continuously", False) + try: + new_config = self.config_reader() + if new_config != getattr(self, "config", None): + try: + validate_config(new_config) + except Exception as e: + self.prom_metrics.config_validation_exceptions.inc() + logger.debug( + "Cluster config validation failed. The version of " + "the ray CLI you launched this cluster with may " + "be higher than the version of ray being run on " + "the cluster. 
Some new features may not be " + "available until you upgrade ray on your cluster.", + exc_info=e, + ) + logger.debug( + f"New config after validation: {new_config}," + f" of type: {type(new_config)}" + ) + (new_runtime_hash, new_file_mounts_contents_hash) = hash_runtime_conf( + new_config["file_mounts"], + new_config["cluster_synced_files"], + [ + new_config["worker_setup_commands"], + new_config["worker_start_ray_commands"], + ], + generate_file_mounts_contents_hash=sync_continuously, + ) + self.config = new_config + self.runtime_hash = new_runtime_hash + self.file_mounts_contents_hash = new_file_mounts_contents_hash + if not self.provider: + self.provider = _get_node_provider( + self.config["provider"], self.config["cluster_name"] + ) + + # If using the LocalNodeProvider, make sure the head node is marked + # non-terminated. + if isinstance(self.provider, LocalNodeProvider): + record_local_head_state_if_needed(self.provider) + + self.available_node_types = self.config["available_node_types"] + upscaling_speed = self.config.get("upscaling_speed") + aggressive = self.config.get("autoscaling_mode") == "aggressive" + target_utilization_fraction = self.config.get("target_utilization_fraction") + if upscaling_speed: + upscaling_speed = float(upscaling_speed) + # TODO(ameer): consider adding (if users ask) an option of + # initial_upscaling_num_workers. + elif aggressive: + upscaling_speed = 99999 + logger.warning( + "Legacy aggressive autoscaling mode " + "detected. Replacing it by setting upscaling_speed to " + "99999." + ) + elif target_utilization_fraction: + upscaling_speed = 1 / max(target_utilization_fraction, 0.001) - 1 + logger.warning( + "Legacy target_utilization_fraction config " + "detected. Replacing it by setting upscaling_speed to " + + "1 / target_utilization_fraction - 1." 
+ ) + else: + upscaling_speed = 1.0 + if self.resource_demand_scheduler: + # The node types are autofilled internally for legacy yamls, + # overwriting the class will remove the inferred node resources + # for legacy yamls. + self.resource_demand_scheduler.reset_config( + self.provider, + self.available_node_types, + self.config["max_workers"], + self.config["head_node_type"], + upscaling_speed, + ) + else: + self.resource_demand_scheduler = ResourceDemandScheduler( + self.provider, + self.available_node_types, + self.config["max_workers"], + self.config["head_node_type"], + upscaling_speed, + ) + + except Exception as e: + self.prom_metrics.reset_exceptions.inc() + if errors_fatal: + raise e + else: + logger.exception("StandardAutoscaler: Error parsing config.") + + def launch_config_ok(self, node_id): + if self.disable_launch_config_check: + return True + node_tags = self.provider.node_tags(node_id) + tag_launch_conf = node_tags.get(TAG_RAY_LAUNCH_CONFIG) + node_type = node_tags.get(TAG_RAY_USER_NODE_TYPE) + if node_type not in self.available_node_types: + # The node type has been deleted from the cluster config. + # Don't keep the node. + return False + + # The `worker_nodes` field is deprecated in favor of per-node-type + # node_configs. We allow it for backwards-compatibility. 
+ launch_config = copy.deepcopy(self.config.get("worker_nodes", {})) + if node_type: + launch_config.update( + self.config["available_node_types"][node_type]["node_config"] + ) + calculated_launch_hash = hash_launch_conf(launch_config, self.config["auth"]) + + if calculated_launch_hash != tag_launch_conf: + return False + return True + + def files_up_to_date(self, node_id): + node_tags = self.provider.node_tags(node_id) + applied_config_hash = node_tags.get(TAG_RAY_RUNTIME_CONFIG) + applied_file_mounts_contents_hash = node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) + if applied_config_hash != self.runtime_hash or ( + self.file_mounts_contents_hash is not None + and self.file_mounts_contents_hash != applied_file_mounts_contents_hash + ): + logger.info( + "StandardAutoscaler: " + "{}: Runtime state is ({},{}), want ({},{})".format( + node_id, + applied_config_hash, + applied_file_mounts_contents_hash, + self.runtime_hash, + self.file_mounts_contents_hash, + ) + ) + return False + return True + + def heartbeat_on_time(self, node_id: NodeID, now: float) -> bool: + """Determine whether we've received a heartbeat from a node within the + last AUTOSCALER_HEARTBEAT_TIMEOUT_S seconds. + """ + # For type checking, assert that this object has been instantitiated. + assert self.provider + + key = self.provider.internal_ip(node_id) + + if key in self.load_metrics.last_heartbeat_time_by_ip: + last_heartbeat_time = self.load_metrics.last_heartbeat_time_by_ip[key] + delta = now - last_heartbeat_time + if delta < AUTOSCALER_HEARTBEAT_TIMEOUT_S: + return True + return False + + def terminate_unhealthy_nodes(self, now: float): + """Terminated nodes for which we haven't received a heartbeat on time. + These nodes are subsequently terminated. + """ + # For type checking, assert that these objects have been instantitiated. 
+ assert self.provider + assert self.non_terminated_nodes + + for node_id in self.non_terminated_nodes.worker_ids: + node_status = self.provider.node_tags(node_id)[TAG_RAY_NODE_STATUS] + # We're not responsible for taking down + # nodes with pending or failed status: + if not node_status == STATUS_UP_TO_DATE: + continue + # This node is up-to-date. If it hasn't had the chance to produce + # a heartbeat, fake the heartbeat now (see logic for completed node + # updaters). + ip = self.provider.internal_ip(node_id) + if ip not in self.load_metrics.last_heartbeat_time_by_ip: + self.load_metrics.mark_active(ip) + # Heartbeat indicates node is healthy: + if self.heartbeat_on_time(node_id, now): + continue + self.schedule_node_termination( + node_id, "lost contact with raylet", logger.warning + ) + self.terminate_scheduled_nodes() + + def attempt_to_recover_unhealthy_nodes(self, now): + for node_id in self.non_terminated_nodes.worker_ids: + self.recover_if_needed(node_id, now) + + def recover_if_needed(self, node_id, now): + if not self.can_update(node_id): + return + if self.heartbeat_on_time(node_id, now): + return + + logger.warning( + "StandardAutoscaler: " + "{}: No recent heartbeat, " + "restarting Ray to recover...".format(node_id) + ) + self.event_summarizer.add( + "Restarting {} nodes of type " + + self._get_node_type(node_id) + + " (lost contact with raylet).", + quantity=1, + aggregate=operator.add, + ) + head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id) + updater = NodeUpdaterThread( + node_id=node_id, + provider_config=self.config["provider"], + provider=self.provider, + auth_config=self.config["auth"], + cluster_name=self.config["cluster_name"], + file_mounts={}, + initialization_commands=[], + setup_commands=[], + ray_start_commands=with_head_node_ip( + self.config["worker_start_ray_commands"], head_node_ip + ), + runtime_hash=self.runtime_hash, + file_mounts_contents_hash=self.file_mounts_contents_hash, + 
process_runner=self.process_runner, + use_internal_ip=True, + is_head_node=False, + docker_config=self.config.get("docker"), + node_resources=self._node_resources(node_id), + node_labels=self._node_labels(node_id), + for_recovery=True, + ) + updater.start() + self.updaters[node_id] = updater + + def _get_node_type(self, node_id: str) -> str: + # For type checking, assert that this object has been instantitiated. + assert self.provider + + node_tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in node_tags: + return node_tags[TAG_RAY_USER_NODE_TYPE] + else: + return "unknown_node_type" + + def _get_node_type_specific_fields(self, node_id: str, fields_key: str) -> Any: + # For type checking, assert that this object has been instantitiated. + assert self.provider + + fields = self.config[fields_key] + node_tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in node_tags: + node_type = node_tags[TAG_RAY_USER_NODE_TYPE] + if node_type not in self.available_node_types: + raise ValueError(f"Unknown node type tag: {node_type}.") + node_specific_config = self.available_node_types[node_type] + if fields_key in node_specific_config: + fields = node_specific_config[fields_key] + return fields + + def _get_node_specific_docker_config(self, node_id): + if "docker" not in self.config: + return {} + docker_config = copy.deepcopy(self.config.get("docker", {})) + node_specific_docker = self._get_node_type_specific_fields(node_id, "docker") + docker_config.update(node_specific_docker) + return docker_config + + def should_update(self, node_id): + if not self.can_update(node_id): + return UpdateInstructions(None, None, None, None) # no update + + status = self.provider.node_tags(node_id).get(TAG_RAY_NODE_STATUS) + if status == STATUS_UP_TO_DATE and self.files_up_to_date(node_id): + return UpdateInstructions(None, None, None, None) # no update + + successful_updated = self.num_successful_updates.get(node_id, 0) > 0 + if successful_updated and 
self.config.get("restart_only", False): + setup_commands = [] + ray_start_commands = self.config["worker_start_ray_commands"] + elif successful_updated and self.config.get("no_restart", False): + setup_commands = self._get_node_type_specific_fields( + node_id, "worker_setup_commands" + ) + ray_start_commands = [] + else: + setup_commands = self._get_node_type_specific_fields( + node_id, "worker_setup_commands" + ) + ray_start_commands = self.config["worker_start_ray_commands"] + + docker_config = self._get_node_specific_docker_config(node_id) + return UpdateInstructions( + node_id=node_id, + setup_commands=setup_commands, + ray_start_commands=ray_start_commands, + docker_config=docker_config, + ) + + def spawn_updater( + self, + node_id, + setup_commands, + ray_start_commands, + node_resources, + node_labels, + docker_config, + ): + logger.info( + f"Creating new (spawn_updater) updater thread for node" f" {node_id}." + ) + ip = self.provider.internal_ip(node_id) + node_type = self._get_node_type(node_id) + self.node_tracker.track(node_id, ip, node_type) + head_node_ip = self.provider.internal_ip(self.non_terminated_nodes.head_id) + updater = NodeUpdaterThread( + node_id=node_id, + provider_config=self.config["provider"], + provider=self.provider, + auth_config=self.config["auth"], + cluster_name=self.config["cluster_name"], + file_mounts=self.config["file_mounts"], + initialization_commands=with_head_node_ip( + self._get_node_type_specific_fields(node_id, "initialization_commands"), + head_node_ip, + ), + setup_commands=with_head_node_ip(setup_commands, head_node_ip), + ray_start_commands=with_head_node_ip(ray_start_commands, head_node_ip), + runtime_hash=self.runtime_hash, + file_mounts_contents_hash=self.file_mounts_contents_hash, + is_head_node=False, + cluster_synced_files=self.config["cluster_synced_files"], + rsync_options={ + "rsync_exclude": self.config.get("rsync_exclude"), + "rsync_filter": self.config.get("rsync_filter"), + }, + 
process_runner=self.process_runner, + use_internal_ip=True, + docker_config=docker_config, + node_resources=node_resources, + node_labels=node_labels, + ) + updater.start() + self.updaters[node_id] = updater + + def can_update(self, node_id): + if self.disable_node_updaters: + return False + if node_id in self.updaters: + return False + if not self.launch_config_ok(node_id): + return False + if self.num_failed_updates.get(node_id, 0) > 0: # TODO(ekl) retry? + return False + logger.debug( + f"{node_id} is not being updated and " + "passes config check (can_update=True)." + ) + return True + + def launch_new_node(self, count: int, node_type: str) -> None: + logger.info("StandardAutoscaler: Queue {} new nodes for launch".format(count)) + self.pending_launches.inc(node_type, count) + config = copy.deepcopy(self.config) + if self.foreground_node_launch: + assert self.foreground_node_launcher is not None + # Launch in the main thread and block. + self.foreground_node_launcher.launch_node(config, count, node_type) + else: + assert self.launch_queue is not None + # Split into individual launch requests of the max batch size. + while count > 0: + # Enqueue launch data for the background NodeUpdater threads. + self.launch_queue.put( + (config, min(count, self.max_launch_batch), node_type) + ) + count -= self.max_launch_batch + + def kill_workers(self): + logger.error("StandardAutoscaler: kill_workers triggered") + nodes = self.workers() + if nodes: + self.provider.terminate_nodes(nodes) + for node in nodes: + self.node_tracker.untrack(node) + self.prom_metrics.stopped_nodes.inc() + logger.error("StandardAutoscaler: terminated {} node(s)".format(len(nodes))) + + def summary(self) -> Optional[AutoscalerSummary]: + """Summarizes the active, pending, and failed node launches. + + An active node is a node whose raylet is actively reporting heartbeats. + A pending node is non-active node whose node tag is uninitialized, + waiting for ssh, syncing files, or setting up. 
+ If a node is not pending or active, it is failed. + + Returns: + AutoscalerSummary: The summary. + """ + # For type checking, assert that this object has been instantitiated. + assert self.provider + + if not self.non_terminated_nodes: + return None + active_nodes: Dict[NodeType, int] = Counter() + pending_nodes = [] + failed_nodes = [] + non_failed = set() + + node_type_mapping = {} + + for node_id in self.non_terminated_nodes.all_node_ids: + ip = self.provider.internal_ip(node_id) + node_tags = self.provider.node_tags(node_id) + + if not all( + tag in node_tags + for tag in ( + TAG_RAY_NODE_KIND, + TAG_RAY_USER_NODE_TYPE, + TAG_RAY_NODE_STATUS, + ) + ): + # In some node providers, creation of a node and tags is not + # atomic, so just skip it. + continue + + if node_tags[TAG_RAY_NODE_KIND] == NODE_KIND_UNMANAGED: + continue + node_type = node_tags[TAG_RAY_USER_NODE_TYPE] + + node_type_mapping[ip] = node_type + + # TODO (Alex): If a node's raylet has died, it shouldn't be marked + # as active. + is_active = self.load_metrics.is_active(ip) + if is_active: + active_nodes[node_type] += 1 + non_failed.add(node_id) + else: + status = node_tags[TAG_RAY_NODE_STATUS] + completed_states = [STATUS_UP_TO_DATE, STATUS_UPDATE_FAILED] + is_pending = status not in completed_states + if is_pending: + pending_nodes.append((node_id, ip, node_type, status)) + non_failed.add(node_id) + + failed_nodes = self.node_tracker.get_all_failed_node_info(non_failed) + + # The concurrent counter leaves some 0 counts in, so we need to + # manually filter those out. 
+ pending_launches = {} + for node_type, count in self.pending_launches.breakdown().items(): + if count: + pending_launches[node_type] = count + + pending_resources = {} + for node_resources in self.resource_demand_scheduler.calculate_node_resources( + nodes=[node_id for node_id, _, _, _ in pending_nodes], + pending_nodes=pending_launches, + # We don't fill this field out because we're intentionally only + # passing pending nodes (which aren't tracked by load metrics + # anyways). + unused_resources_by_ip={}, + )[0]: + for key, value in node_resources.items(): + pending_resources[key] = value + pending_resources.get(key, 0) + + return AutoscalerSummary( + # Convert active_nodes from counter to dict for later serialization + active_nodes=dict(active_nodes), + idle_nodes=None, + pending_nodes=[ + (ip, node_type, status) for _, ip, node_type, status in pending_nodes + ], + pending_launches=pending_launches, + failed_nodes=failed_nodes, + node_availability_summary=self.node_provider_availability_tracker.summary(), + pending_resources=pending_resources, + node_type_mapping=node_type_mapping, + legacy=True, + ) + + def info_string(self): + lm_summary = self.load_metrics.summary() + autoscaler_summary = self.summary() + assert autoscaler_summary + return "\n" + format_info_string(lm_summary, autoscaler_summary) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..5172891d311964c281c4e748cb72713dc361a3ed --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger.py @@ -0,0 +1,825 @@ +"""Logger implementing the Command Line Interface. + +A replacement for the standard Python `logging` API +designed for implementing a better CLI UX for the cluster launcher. + +Supports color, bold text, italics, underlines, etc. 
+(depending on TTY features) +as well as indentation and other structured output. +""" +import inspect +import logging +import os +import sys +import time +from contextlib import contextmanager +from functools import wraps +from typing import Any, Callable, Dict, List, Optional, Tuple + +import click +import colorama + +# Import ray first to use the bundled colorama +import ray # noqa: F401 + +if sys.platform == "win32": + import msvcrt +else: + import select + + +class _ColorfulMock: + def __init__(self): + # do not do any color work + self.identity = lambda x: x + + self.colorful = self + self.colormode = None + + self.NO_COLORS = None + self.ANSI_8_COLORS = None + + def disable(self): + pass + + @contextmanager + def with_style(self, x): + class IdentityClass: + def __getattr__(self, name): + return lambda y: y + + yield IdentityClass() + + def __getattr__(self, name): + if name == "with_style": + return self.with_style + + return self.identity + + +try: + import colorful as _cf + from colorful.core import ColorfulString + + _cf.use_8_ansi_colors() +except ModuleNotFoundError: + # We mock Colorful to restrict the colors used for consistency + # anyway, so we also allow for not having colorful at all. + # If the Ray Core dependency on colorful is ever removed, + # the CliLogger code will still work. + class ColorfulString: + pass + + _cf = _ColorfulMock() + + +# We want to only allow specific formatting +# to prevent people from accidentally making bad looking color schemes. +# +# This is especially important since most will look bad on either light +# or dark themes. 
+class _ColorfulProxy: + _proxy_allowlist = [ + "disable", + "reset", + "bold", + "italic", + "underlined", + # used instead of `gray` as `dimmed` adapts to + # both light and dark themes + "dimmed", + "dodgerBlue", # group + "limeGreen", # success + "red", # error + "orange", # warning + "skyBlue", # label + "magenta", # syntax highlighting key words and symbols + "yellow", # syntax highlighting strings + ] + + def __getattr__(self, name): + res = getattr(_cf, name) + if callable(res) and name not in _ColorfulProxy._proxy_allowlist: + raise ValueError( + "Usage of the colorful method '" + name + "' is forbidden " + "by the proxy to keep a consistent color scheme. " + "Check `cli_logger.py` for allowed methods" + ) + return res + + +cf = _ColorfulProxy() + +colorama.init(strip=False) + + +def _external_caller_info(): + """Get the info from the caller frame. + + Used to override the logging function and line number with the correct + ones. See the comment on _patched_makeRecord for more info. + """ + + frame = inspect.currentframe() + caller = frame + levels = 0 + while caller.f_code.co_filename == __file__: + caller = caller.f_back + levels += 1 + return { + "lineno": caller.f_lineno, + "filename": os.path.basename(caller.f_code.co_filename), + } + + +def _format_msg( + msg: str, + *args: Any, + no_format: bool = None, + _tags: Dict[str, Any] = None, + _numbered: Tuple[str, int, int] = None, + **kwargs: Any, +): + """Formats a message for printing. + + Renders `msg` using the built-in `str.format` and the passed-in + `*args` and `**kwargs`. + + Args: + *args (Any): `.format` arguments for `msg`. + no_format (bool): + If `no_format` is `True`, + `.format` will not be called on the message. + + Useful if the output is user-provided or may otherwise + contain an unexpected formatting string (e.g. "{}"). + _tags (Dict[str, Any]): + key-value pairs to display at the end of + the message in square brackets. 
+ + If a tag is set to `True`, it is printed without the value, + the presence of the tag treated as a "flag". + + E.g. `_format_msg("hello", _tags=dict(from=mom, signed=True))` + `hello [from=Mom, signed]` + _numbered (Tuple[str, int, int]): + `(brackets, i, n)` + + The `brackets` string is composed of two "bracket" characters, + `i` is the index, `n` is the total. + + The string `{i}/{n}` surrounded by the "brackets" is + prepended to the message. + + This is used to number steps in a procedure, with different + brackets specifying different major tasks. + + E.g. `_format_msg("hello", _numbered=("[]", 0, 5))` + `[0/5] hello` + + Returns: + The formatted message. + """ + + if isinstance(msg, str) or isinstance(msg, ColorfulString): + tags_str = "" + if _tags is not None: + tags_list = [] + for k, v in _tags.items(): + if v is True: + tags_list += [k] + continue + if v is False: + continue + + tags_list += [k + "=" + v] + if tags_list: + tags_str = cf.reset(cf.dimmed(" [{}]".format(", ".join(tags_list)))) + + numbering_str = "" + if _numbered is not None: + chars, i, n = _numbered + numbering_str = cf.dimmed(chars[0] + str(i) + "/" + str(n) + chars[1]) + " " + + if no_format: + # todo: throw if given args/kwargs? + return numbering_str + msg + tags_str + return numbering_str + msg.format(*args, **kwargs) + tags_str + + if kwargs: + raise ValueError("We do not support printing kwargs yet.") + + res = [msg, *args] + res = [str(x) for x in res] + return ", ".join(res) + + +# TODO: come up with a plan to unify logging. 
+# formatter = logging.Formatter( +# # TODO(maximsmol): figure out the required log level padding +# # width automatically +# fmt="[{asctime}] {levelname:6} {message}", +# datefmt="%x %X", +# # We want alignment on our level names +# style="{") + + +def _isatty(): + """More robust check for interactive terminal/tty.""" + try: + # https://stackoverflow.com/questions/6108330/ + # checking-for-interactive-shell-in-a-python-script + return sys.__stdin__.isatty() + except Exception: + # sometimes this can fail due to closed output + # either way, no-tty is generally safe fallback. + return False + + +class _CliLogger: + """Singleton class for CLI logging. + + Without calling 'cli_logger.configure', the CLILogger will default + to 'record' style logging. + + Attributes: + color_mode (str): + Can be "true", "false", or "auto". + + Enables or disables `colorful`. + + If `color_mode` is "auto", is set to `not stdout.isatty()` + indent_level (int): + The current indentation level. + + All messages will be indented by prepending `" " * indent_level` + vebosity (int): + Output verbosity. + + Low verbosity will disable `verbose` and `very_verbose` messages. 
+ """ + + color_mode: str + # color_mode: Union[Literal["auto"], Literal["false"], Literal["true"]] + indent_level: int + interactive: bool + VALID_LOG_STYLES = ("auto", "record", "pretty") + + _autodetected_cf_colormode: int + + def __init__(self): + self.indent_level = 0 + + self._verbosity = 0 + self._verbosity_overriden = False + self._color_mode = "auto" + self._log_style = "record" + self.pretty = False + self.interactive = False + + # store whatever colorful has detected for future use if + # the color ouput is toggled (colorful detects # of supported colors, + # so it has some non-trivial logic to determine this) + self._autodetected_cf_colormode = cf.colorful.colormode + self.set_format() + + def set_format(self, format_tmpl=None): + if not format_tmpl: + from ray.autoscaler._private.constants import LOGGER_FORMAT + + format_tmpl = LOGGER_FORMAT + self._formatter = logging.Formatter(format_tmpl) + + def configure(self, log_style=None, color_mode=None, verbosity=None): + """Configures the logger according to values.""" + if log_style is not None: + self._set_log_style(log_style) + + if color_mode is not None: + self._set_color_mode(color_mode) + + if verbosity is not None: + self._set_verbosity(verbosity) + + self.detect_colors() + + @property + def log_style(self): + return self._log_style + + def _set_log_style(self, x): + """Configures interactivity and formatting.""" + self._log_style = x.lower() + self.interactive = _isatty() + + if self._log_style == "auto": + self.pretty = _isatty() + elif self._log_style == "record": + self.pretty = False + self._set_color_mode("false") + elif self._log_style == "pretty": + self.pretty = True + + @property + def color_mode(self): + return self._color_mode + + def _set_color_mode(self, x): + self._color_mode = x.lower() + self.detect_colors() + + @property + def verbosity(self): + if self._verbosity_overriden: + return self._verbosity + elif not self.pretty: + return 999 + return self._verbosity + + def 
_set_verbosity(self, x): + self._verbosity = x + self._verbosity_overriden = True + + def detect_colors(self): + """Update color output settings. + + Parse the `color_mode` string and optionally disable or force-enable + color output + (8-color ANSI if no terminal detected to be safe) in colorful. + """ + if self.color_mode == "true": + if self._autodetected_cf_colormode != cf.NO_COLORS: + cf.colormode = self._autodetected_cf_colormode + else: + cf.colormode = cf.ANSI_8_COLORS + return + if self.color_mode == "false": + cf.disable() + return + if self.color_mode == "auto": + # colorful autodetects tty settings + return + + raise ValueError("Invalid log color setting: " + self.color_mode) + + def newline(self): + """Print a line feed.""" + self.print("") + + def _print( + self, + msg: str, + _level_str: str = "INFO", + _linefeed: bool = True, + end: str = None, + ): + """Proxy for printing messages. + + Args: + msg: Message to print. + linefeed (bool): + If `linefeed` is `False` no linefeed is printed at the + end of the message. + """ + if self.pretty: + rendered_message = " " * self.indent_level + msg + else: + if msg.strip() == "": + return + caller_info = _external_caller_info() + record = logging.LogRecord( + name="cli", + # We override the level name later + # TODO(maximsmol): give approximate level #s to our log levels + level=0, + # The user-facing logs do not need this information anyway + # and it would be very tedious to extract since _print + # can be at varying depths in the call stack + # TODO(maximsmol): do it anyway to be extra + pathname=caller_info["filename"], + lineno=caller_info["lineno"], + msg=msg, + args={}, + # No exception + exc_info=None, + ) + record.levelname = _level_str + rendered_message = self._formatter.format(record) + + # We aren't using standard python logging convention, so we hardcode + # the log levels for now. 
+ if _level_str in ["WARNING", "ERROR", "PANIC"]: + stream = sys.stderr + else: + stream = sys.stdout + + if not _linefeed: + stream.write(rendered_message) + stream.flush() + return + + kwargs = {"end": end} + print(rendered_message, file=stream, **kwargs) + + def indented(self): + """Context manager that starts an indented block of output.""" + cli_logger = self + + class IndentedContextManager: + def __enter__(self): + cli_logger.indent_level += 1 + + def __exit__(self, type, value, tb): + cli_logger.indent_level -= 1 + + return IndentedContextManager() + + def group(self, msg: str, *args: Any, **kwargs: Any): + """Print a group title in a special color and start an indented block. + + For arguments, see `_format_msg`. + """ + self.print(cf.dodgerBlue(msg), *args, **kwargs) + + return self.indented() + + def verbatim_error_ctx(self, msg: str, *args: Any, **kwargs: Any): + """Context manager for printing multi-line error messages. + + Displays a start sequence "!!! {optional message}" + and a matching end sequence "!!!". + + The string "!!!" can be used as a "tombstone" for searching. + + For arguments, see `_format_msg`. + """ + cli_logger = self + + class VerbatimErorContextManager: + def __enter__(self): + cli_logger.error(cf.bold("!!! ") + "{}", msg, *args, **kwargs) + + def __exit__(self, type, value, tb): + cli_logger.error(cf.bold("!!!")) + + return VerbatimErorContextManager() + + def labeled_value(self, key: str, msg: str, *args: Any, **kwargs: Any): + """Displays a key-value pair with special formatting. + + Args: + key: Label that is prepended to the message. + + For other arguments, see `_format_msg`. + """ + self._print(cf.skyBlue(key) + ": " + _format_msg(cf.bold(msg), *args, **kwargs)) + + def verbose(self, msg: str, *args: Any, **kwargs: Any): + """Prints a message if verbosity is not 0. + + For arguments, see `_format_msg`. 
+ """ + if self.verbosity > 0: + self.print(msg, *args, _level_str="VINFO", **kwargs) + + def verbose_warning(self, msg, *args, **kwargs): + """Prints a formatted warning if verbosity is not 0. + + For arguments, see `_format_msg`. + """ + if self.verbosity > 0: + self._warning(msg, *args, _level_str="VWARN", **kwargs) + + def verbose_error(self, msg: str, *args: Any, **kwargs: Any): + """Logs an error if verbosity is not 0. + + For arguments, see `_format_msg`. + """ + if self.verbosity > 0: + self._error(msg, *args, _level_str="VERR", **kwargs) + + def very_verbose(self, msg: str, *args: Any, **kwargs: Any): + """Prints if verbosity is > 1. + + For arguments, see `_format_msg`. + """ + if self.verbosity > 1: + self.print(msg, *args, _level_str="VVINFO", **kwargs) + + def success(self, msg: str, *args: Any, **kwargs: Any): + """Prints a formatted success message. + + For arguments, see `_format_msg`. + """ + self.print(cf.limeGreen(msg), *args, _level_str="SUCC", **kwargs) + + def _warning(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any): + """Prints a formatted warning message. + + For arguments, see `_format_msg`. + """ + if _level_str is None: + raise ValueError("Log level not set.") + self.print(cf.orange(msg), *args, _level_str=_level_str, **kwargs) + + def warning(self, *args, **kwargs): + self._warning(*args, _level_str="WARN", **kwargs) + + def _error(self, msg: str, *args: Any, _level_str: str = None, **kwargs: Any): + """Prints a formatted error message. + + For arguments, see `_format_msg`. + """ + if _level_str is None: + raise ValueError("Log level not set.") + self.print(cf.red(msg), *args, _level_str=_level_str, **kwargs) + + def error(self, *args, **kwargs): + self._error(*args, _level_str="ERR", **kwargs) + + def panic(self, *args, **kwargs): + self._error(*args, _level_str="PANIC", **kwargs) + + # Fine to expose _level_str here, since this is a general log function. 
+ def print( + self, + msg: str, + *args: Any, + _level_str: str = "INFO", + end: str = None, + **kwargs: Any, + ): + """Prints a message. + + For arguments, see `_format_msg`. + """ + self._print(_format_msg(msg, *args, **kwargs), _level_str=_level_str, end=end) + + def info(self, msg: str, no_format=True, *args, **kwargs): + self.print(msg, no_format=no_format, *args, **kwargs) + + def abort( + self, msg: Optional[str] = None, *args: Any, exc: Any = None, **kwargs: Any + ): + """Prints an error and aborts execution. + + Print an error and throw an exception to terminate the program + (the exception will not print a message). + """ + if msg is not None: + self._error(msg, *args, _level_str="PANIC", **kwargs) + + if exc is not None: + raise exc + + exc_cls = click.ClickException + if self.pretty: + exc_cls = SilentClickException + + if msg is None: + msg = "Exiting due to cli_logger.abort()" + raise exc_cls(msg) + + def doassert(self, val: bool, msg: str, *args: Any, **kwargs: Any): + """Handle assertion without throwing a scary exception. + + Args: + val: Value to check. + + For other arguments, see `_format_msg`. + """ + if not val: + exc = None + if not self.pretty: + exc = AssertionError() + + # TODO(maximsmol): rework asserts so that we get the expression + # that triggered the assert + # to do this, install a global try-catch + # for AssertionError and raise them normally + self.abort(msg, *args, exc=exc, **kwargs) + + def render_list(self, xs: List[str], separator: str = cf.reset(", ")): + """Render a list of bolded values using a non-bolded separator.""" + return separator.join([str(cf.bold(x)) for x in xs]) + + def confirm( + self, + yes: bool, + msg: str, + *args: Any, + _abort: bool = False, + _default: bool = False, + _timeout_s: Optional[float] = None, + **kwargs: Any, + ): + """Display a confirmation dialog. + + Valid answers are "y/yes/true/1" and "n/no/false/0". 
+ + Args: + yes: If `yes` is `True` the dialog will default to "yes" + and continue without waiting for user input. + _abort (bool): + If `_abort` is `True`, + "no" means aborting the program. + _default (bool): + The default action to take if the user just presses enter + with no input. + _timeout_s (float): + If user has no input within _timeout_s seconds, the default + action is taken. None means no timeout. + """ + should_abort = _abort + default = _default + + if not self.interactive and not yes: + # no formatting around --yes here since this is non-interactive + self.error( + "This command requires user confirmation. " + "When running non-interactively, supply --yes to skip." + ) + raise ValueError("Non-interactive confirm without --yes.") + + if default: + yn_str = "Y/n" + else: + yn_str = "y/N" + + confirm_str = cf.underlined("Confirm [" + yn_str + "]:") + " " + + rendered_message = _format_msg(msg, *args, **kwargs) + # the rendered message ends with ascii coding + if rendered_message and not msg.endswith("\n"): + rendered_message += " " + + msg_len = len(rendered_message.split("\n")[-1]) + complete_str = rendered_message + confirm_str + + if yes: + self._print(complete_str + "y " + cf.dimmed("[automatic, due to --yes]")) + return True + + self._print(complete_str, _linefeed=False) + + res = None + yes_answers = ["y", "yes", "true", "1"] + no_answers = ["n", "no", "false", "0"] + try: + while True: + if _timeout_s is None: + ans = sys.stdin.readline() + elif sys.platform == "win32": + # Windows doesn't support select + start_time = time.time() + ans = "" + while True: + if (time.time() - start_time) >= _timeout_s: + self.newline() + ans = "\n" + break + elif msvcrt.kbhit(): + ch = msvcrt.getwch() + if ch in ("\n", "\r"): + self.newline() + ans = ans + "\n" + break + elif ch == "\b": + if ans: + ans = ans[:-1] + # Emulate backspace erasing + print("\b \b", end="", flush=True) + else: + ans = ans + ch + print(ch, end="", flush=True) + else: + time.sleep(0.1) 
+ else: + ready, _, _ = select.select([sys.stdin], [], [], _timeout_s) + if not ready: + self.newline() + ans = "\n" + else: + ans = sys.stdin.readline() + + ans = ans.lower() + + if ans == "\n": + res = default + break + + ans = ans.strip() + if ans in yes_answers: + res = True + break + if ans in no_answers: + res = False + break + + indent = " " * msg_len + self.error( + "{}Invalid answer: {}. Expected {} or {}", + indent, + cf.bold(ans.strip()), + self.render_list(yes_answers, "/"), + self.render_list(no_answers, "/"), + ) + self._print(indent + confirm_str, _linefeed=False) + except KeyboardInterrupt: + self.newline() + res = default + + if not res and should_abort: + # todo: make sure we tell the user if they + # need to do cleanup + self._print("Exiting...") + raise SilentClickException( + "Exiting due to the response to confirm(should_abort=True)." + ) + + return res + + def prompt(self, msg: str, *args, **kwargs): + """Prompt the user for some text input. + + Args: + msg: The mesage to display to the user before the prompt. + + Returns: + The string entered by the user. + """ + complete_str = cf.underlined(msg) + rendered_message = _format_msg(complete_str, *args, **kwargs) + # the rendered message ends with ascii coding + if rendered_message and not msg.endswith("\n"): + rendered_message += " " + self._print(rendered_message, linefeed=False) + + res = "" + try: + ans = sys.stdin.readline() + ans = ans.lower() + res = ans.strip() + except KeyboardInterrupt: + self.newline() + + return res + + def flush(self): + sys.stdout.flush() + sys.stderr.flush() + + +class SilentClickException(click.ClickException): + """`ClickException` that does not print a message. + + Some of our tooling relies on catching ClickException in particular. + + However the default prints a message, which is undesirable since we expect + our code to log errors manually using `cli_logger.error()` to allow for + colors and other formatting. 
+ """ + + def __init__(self, message: str): + super(SilentClickException, self).__init__(message) + + def show(self, file=None): + pass + + +cli_logger = _CliLogger() + +CLICK_LOGGING_OPTIONS = [ + click.option( + "--log-style", + required=False, + type=click.Choice(cli_logger.VALID_LOG_STYLES, case_sensitive=False), + default="auto", + help=( + "If 'pretty', outputs with formatting and color. If 'record', " + "outputs record-style without formatting. " + "'auto' defaults to 'pretty', and disables pretty logging " + "if stdin is *not* a TTY." + ), + ), + click.option( + "--log-color", + required=False, + type=click.Choice(["auto", "false", "true"], case_sensitive=False), + default="auto", + help=("Use color logging. Auto enables color logging if stdout is a TTY."), + ), + click.option("-v", "--verbose", default=None, count=True), +] + + +def add_click_logging_options(f: Callable) -> Callable: + for option in reversed(CLICK_LOGGING_OPTIONS): + f = option(f) + + @wraps(f) + def wrapper(*args, log_style=None, log_color=None, verbose=None, **kwargs): + cli_logger.configure(log_style, log_color, verbose) + return f(*args, **kwargs) + + return wrapper diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8dcd2404b1e3cfdfb29adb69c387c7e99e7a7b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cli_logger_demoall.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +# This is an executable script that runs an example of every single CliLogger +# function for demonstration purposes. Primarily useful for tuning color and +# other formatting. 
+ +from ray.autoscaler._private.cli_logger import cf, cli_logger + +cli_logger.configure(log_style="auto", verbosity=999) + +cli_logger.print(cf.bold("Bold ") + cf.italic("Italic ") + cf.underlined("Underlined")) +cli_logger.labeled_value("Label", "value") +cli_logger.print("List: {}", cli_logger.render_list([1, 2, 3])) +cli_logger.newline() +cli_logger.very_verbose("Very verbose") +cli_logger.verbose("Verbose") +cli_logger.verbose_warning("Verbose warning") +cli_logger.verbose_error("Verbose error") +cli_logger.print("Info") +cli_logger.success("Success") +cli_logger.warning("Warning") +cli_logger.error("Error") +cli_logger.newline() +try: + cli_logger.abort("Abort") +except Exception: + pass +try: + cli_logger.doassert(False, "Assert") +except Exception: + pass +cli_logger.newline() +cli_logger.confirm(True, "example") +cli_logger.newline() +with cli_logger.indented(): + cli_logger.print("Indented") +with cli_logger.group("Group"): + cli_logger.print("Group contents") +with cli_logger.verbatim_error_ctx("Verbtaim error"): + cli_logger.print("Error contents") diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..6984c16064361e17013da52f2cf4e6d3e51b3a5b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/cluster_dump.py @@ -0,0 +1,652 @@ +import os +import re +import subprocess +import sys +import tarfile +import tempfile +import threading +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +from typing import List, Optional, Sequence, Tuple + +import yaml + +import ray # noqa: F401 +from ray.autoscaler._private.cli_logger import cli_logger +from ray.autoscaler._private.providers import _get_node_provider +from ray.autoscaler.tags import NODE_KIND_HEAD, NODE_KIND_WORKER, TAG_RAY_NODE_KIND + +# Import psutil after ray so the 
packaged version is used. +import psutil + +MAX_PARALLEL_SSH_WORKERS = 8 +DEFAULT_SSH_USER = "ubuntu" +DEFAULT_SSH_KEYS = ["~/ray_bootstrap_key.pem", "~/.ssh/ray-autoscaler_2_us-west-2.pem"] + + +class CommandFailed(RuntimeError): + pass + + +class LocalCommandFailed(CommandFailed): + pass + + +class RemoteCommandFailed(CommandFailed): + pass + + +class GetParameters: + def __init__( + self, + logs: bool = True, + debug_state: bool = True, + pip: bool = True, + processes: bool = True, + processes_verbose: bool = True, + processes_list: Optional[List[Tuple[str, bool]]] = None, + ): + self.logs = logs + self.debug_state = debug_state + self.pip = pip + self.processes = processes + self.processes_verbose = processes_verbose + self.processes_list = processes_list + + +class Node: + """Node (as in "machine")""" + + def __init__( + self, + host: str, + ssh_user: str = "ubuntu", + ssh_key: str = "~/ray_bootstrap_key.pem", + docker_container: Optional[str] = None, + is_head: bool = False, + ): + self.host = host + self.ssh_user = ssh_user + self.ssh_key = ssh_key + self.docker_container = docker_container + self.is_head = is_head + + +class Archive: + """Archive object to collect and compress files into a single file. + + Objects of this class can be passed around to different data collection + functions. These functions can use the :meth:`subdir` method to add + files to a sub directory of the archive. 
+ + """ + + def __init__(self, file: Optional[str] = None): + self.file = file or tempfile.mkstemp(prefix="ray_logs_", suffix=".tar.gz")[1] + self.tar = None + self._lock = threading.Lock() + + @property + def is_open(self): + return bool(self.tar) + + def open(self): + self.tar = tarfile.open(self.file, "w:gz") + + def close(self): + self.tar.close() + self.tar = None + + def __enter__(self): + self.open() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + @contextmanager + def subdir(self, subdir: str, root: Optional[str] = "/"): + """Open a context to add files to the archive. + + Example: + + .. code-block:: python + + with Archive("file.tar.gz") as archive: + with archive.subdir("logfiles", root="/tmp/logs") as sd: + # Will be added as `logfiles/nested/file.txt` + sd.add("/tmp/logs/nested/file.txt") + + Args: + subdir: Subdir to which to add files to. Calling the + ``add(path)`` command will place files into the ``subdir`` + directory of the archive. + root: Root path. Files without an explicit ``arcname`` + will be named relatively to this path. + + Yields: + A context object that can be used to add files to the archive. + """ + root = os.path.abspath(root) + + class _Context: + @staticmethod + def add(path: str, arcname: Optional[str] = None): + path = os.path.abspath(path) + arcname = arcname or os.path.join(subdir, os.path.relpath(path, root)) + + self._lock.acquire() + self.tar.add(path, arcname=arcname) + self._lock.release() + + yield _Context() + + +### +# Functions to gather logs and information on the local node +### + + +def get_local_ray_logs( + archive: Archive, + exclude: Optional[Sequence[str]] = None, + session_log_dir: str = "/tmp/ray/session_latest", +) -> Archive: + """Copy local log files into an archive. + + Args: + archive: Archive object to add log files to. + exclude (Sequence[str]): Sequence of regex patterns. Files that match + any of these patterns will not be included in the archive. 
+ session_dir: Path to the Ray session files. Defaults to + ``/tmp/ray/session_latest`` + + Returns: + Open archive object. + + """ + if not archive.is_open: + archive.open() + + exclude = exclude or [] + + session_log_dir = os.path.join(os.path.expanduser(session_log_dir), "logs") + + with archive.subdir("logs", root=session_log_dir) as sd: + for root, dirs, files in os.walk(session_log_dir): + for file in files: + file_path = os.path.join(root, file) + rel_path = os.path.relpath(file_path, start=session_log_dir) + # Skip file if it matches any pattern in `exclude` + if any(re.match(pattern, rel_path) for pattern in exclude): + continue + sd.add(file_path) + + return archive + + +def get_local_debug_state( + archive: Archive, session_dir: str = "/tmp/ray/session_latest" +) -> Archive: + """Copy local log files into an archive. + + Args: + archive: Archive object to add log files to. + session_dir: Path to the Ray session files. Defaults to + ``/tmp/ray/session_latest`` + + Returns: + Open archive object. + + """ + if not archive.is_open: + archive.open() + + session_dir = os.path.expanduser(session_dir) + debug_state_file = os.path.join(session_dir, "logs/debug_state.txt") + + if not os.path.exists(debug_state_file): + raise LocalCommandFailed("No `debug_state.txt` file found.") + + with archive.subdir("", root=session_dir) as sd: + sd.add(debug_state_file) + + return archive + + +def get_local_pip_packages(archive: Archive): + """Get currently installed pip packages and write into an archive. + + Args: + archive: Archive object to add meta files to. + + Returns: + Open archive object. 
+ """ + if not archive.is_open: + archive.open() + + try: + from pip._internal.operations import freeze + except ImportError: # pip < 10.0 + from pip.operations import freeze + + with tempfile.NamedTemporaryFile("wt") as fp: + for line in freeze.freeze(): + fp.writelines([line, "\n"]) + + fp.flush() + with archive.subdir("") as sd: + sd.add(fp.name, "pip_packages.txt") + + return archive + + +def get_local_ray_processes( + archive: Archive, + processes: Optional[List[Tuple[str, bool]]] = None, + verbose: bool = False, +): + """Get the status of all the relevant ray processes. + Args: + archive: Archive object to add process info files to. + processes: List of processes to get information on. The first + element of the tuple is a string to filter by, and the second + element is a boolean indicating if we should filter by command + name (True) or command line including parameters (False) + verbose: If True, show entire executable command line. + If False, show just the first term. + Returns: + Open archive object. 
+ """ + if not processes: + # local import to avoid circular dependencies + from ray.autoscaler._private.constants import RAY_PROCESSES + + processes = RAY_PROCESSES + + process_infos = [] + for process in psutil.process_iter(["pid", "name", "cmdline", "status"]): + try: + with process.oneshot(): + cmdline = " ".join(process.cmdline()) + process_infos.append( + ( + { + "executable": cmdline + if verbose + else cmdline.split("--", 1)[0][:-1], + "name": process.name(), + "pid": process.pid, + "status": process.status(), + }, + process.cmdline(), + ) + ) + except Exception as exc: + raise LocalCommandFailed(exc) from exc + + relevant_processes = {} + for process_dict, cmdline in process_infos: + for keyword, filter_by_cmd in processes: + if filter_by_cmd: + corpus = process_dict["name"] + else: + corpus = subprocess.list2cmdline(cmdline) + if keyword in corpus and process_dict["pid"] not in relevant_processes: + relevant_processes[process_dict["pid"]] = process_dict + + with tempfile.NamedTemporaryFile("wt") as fp: + for line in relevant_processes.values(): + fp.writelines([yaml.dump(line), "\n"]) + + fp.flush() + with archive.subdir("meta") as sd: + sd.add(fp.name, "process_info.txt") + + return archive + + +def get_all_local_data(archive: Archive, parameters: GetParameters): + """Get all local data. + + Gets: + - The Ray logs of the latest session + - The currently installed pip packages + + Args: + archive: Archive object to add meta files to. + parameters: Parameters (settings) for getting data. + + Returns: + Open archive object. 
+ """ + if not archive.is_open: + archive.open() + + if parameters.logs: + try: + get_local_ray_logs(archive=archive) + except LocalCommandFailed as exc: + cli_logger.error(exc) + if parameters.debug_state: + try: + get_local_debug_state(archive=archive) + except LocalCommandFailed as exc: + cli_logger.error(exc) + if parameters.pip: + try: + get_local_pip_packages(archive=archive) + except LocalCommandFailed as exc: + cli_logger.error(exc) + if parameters.processes: + try: + get_local_ray_processes( + archive=archive, + processes=parameters.processes_list, + verbose=parameters.processes_verbose, + ) + except LocalCommandFailed as exc: + cli_logger.error(exc) + + return archive + + +### +# Functions to invoke remote scripts and gather data from remote nodes +### + + +def _wrap(items: List[str], quotes="'"): + return f"{quotes}{' '.join(items)}{quotes}" + + +def create_and_get_archive_from_remote_node( + remote_node: Node, parameters: GetParameters, script_path: str = "ray" +) -> Optional[str]: + """Create an archive containing logs on a remote node and transfer. + + This will call ``ray local-dump --stream`` on the remote + node. The resulting file will be saved locally in a temporary file and + returned. + + Args: + remote_node: Remote node to gather archive from. + script_path: Path to this script on the remote node. + parameters: Parameters (settings) for getting data. + + Returns: + Path to a temporary file containing the node's collected data. 
+ + """ + cmd = [ + "ssh", + "-o StrictHostKeyChecking=no", + "-o UserKnownHostsFile=/dev/null", + "-o LogLevel=ERROR", + "-i", + remote_node.ssh_key, + f"{remote_node.ssh_user}@{remote_node.host}", + ] + + if remote_node.docker_container: + cmd += [ + "docker", + "exec", + remote_node.docker_container, + ] + + collect_cmd = [script_path, "local-dump", "--stream"] + collect_cmd += ["--logs"] if parameters.logs else ["--no-logs"] + collect_cmd += ["--debug-state"] if parameters.debug_state else ["--no-debug-state"] + collect_cmd += ["--pip"] if parameters.pip else ["--no-pip"] + collect_cmd += ["--processes"] if parameters.processes else ["--no-processes"] + if parameters.processes: + collect_cmd += ( + ["--processes-verbose"] + if parameters.processes_verbose + else ["--no-proccesses-verbose"] + ) + + cmd += ["/bin/bash", "-c", _wrap(collect_cmd, quotes='"')] + + cat = "node" if not remote_node.is_head else "head" + + cli_logger.print(f"Collecting data from remote node: {remote_node.host}") + tmp = tempfile.mkstemp(prefix=f"ray_{cat}_{remote_node.host}_", suffix=".tar.gz")[1] + with open(tmp, "wb") as fp: + try: + subprocess.check_call(cmd, stdout=fp, stderr=sys.stderr) + except subprocess.CalledProcessError as exc: + raise RemoteCommandFailed( + f"Gathering logs from remote node failed: {' '.join(cmd)}" + ) from exc + + return tmp + + +def create_and_add_remote_data_to_local_archive( + archive: Archive, remote_node: Node, parameters: GetParameters +): + """Create and get data from remote node and add to local archive. + + Args: + archive: Archive object to add remote data to. + remote_node: Remote node to gather archive from. + parameters: Parameters (settings) for getting data. + + Returns: + Open archive object. 
+ """ + tmp = create_and_get_archive_from_remote_node(remote_node, parameters) + + if not archive.is_open: + archive.open() + + cat = "node" if not remote_node.is_head else "head" + + with archive.subdir("", root=os.path.dirname(tmp)) as sd: + sd.add(tmp, arcname=f"ray_{cat}_{remote_node.host}.tar.gz") + + return archive + + +def create_and_add_local_data_to_local_archive( + archive: Archive, parameters: GetParameters +): + """Create and get data from this node and add to archive. + + Args: + archive: Archive object to add remote data to. + parameters: Parameters (settings) for getting data. + + Returns: + Open archive object. + """ + with Archive() as local_data_archive: + get_all_local_data(local_data_archive, parameters) + + if not archive.is_open: + archive.open() + + with archive.subdir("", root=os.path.dirname(local_data_archive.file)) as sd: + sd.add(local_data_archive.file, arcname="local_node.tar.gz") + + os.remove(local_data_archive.file) + + return archive + + +def create_archive_for_remote_nodes( + archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters +): + """Create an archive combining data from the remote nodes. + + This will parallelize calls to get data from remote nodes. + + Args: + archive: Archive object to add remote data to. + remote_nodes (Sequence[Node]): Sequence of remote nodes. + parameters: Parameters (settings) for getting data. + + Returns: + Open archive object. + + """ + if not archive.is_open: + archive.open() + + with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SSH_WORKERS) as executor: + for remote_node in remote_nodes: + executor.submit( + create_and_add_remote_data_to_local_archive, + archive=archive, + remote_node=remote_node, + parameters=parameters, + ) + + return archive + + +def create_archive_for_local_and_remote_nodes( + archive: Archive, remote_nodes: Sequence[Node], parameters: GetParameters +): + """Create an archive combining data from the local and remote nodes. 
+ + This will parallelize calls to get data from remote nodes. + + Args: + archive: Archive object to add data to. + remote_nodes (Sequence[Node]): Sequence of remote nodes. + parameters: Parameters (settings) for getting data. + + Returns: + Open archive object. + + """ + if not archive.is_open: + archive.open() + + try: + create_and_add_local_data_to_local_archive(archive, parameters) + except CommandFailed as exc: + cli_logger.error(exc) + + create_archive_for_remote_nodes(archive, remote_nodes, parameters) + + cli_logger.print( + f"Collected data from local node and {len(remote_nodes)} " f"remote nodes." + ) + return archive + + +### +# Ray cluster info +### +def get_info_from_ray_cluster_config( + cluster_config: str, +) -> Tuple[List[str], str, str, Optional[str], Optional[str]]: + """Get information from Ray cluster config. + + Return list of host IPs, ssh user, ssh key file, and optional docker + container. + + Args: + cluster_config: Path to ray cluster config. + + Returns: + Tuple of list of host IPs, ssh user name, ssh key file path, + optional docker container name, optional cluster name. 
+ """ + from ray.autoscaler._private.commands import _bootstrap_config + + cli_logger.print( + f"Retrieving cluster information from ray cluster file: " f"{cluster_config}" + ) + + cluster_config = os.path.expanduser(cluster_config) + + config = yaml.safe_load(open(cluster_config).read()) + config = _bootstrap_config(config, no_config_cache=True) + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + head_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD}) + worker_nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) + + hosts = [provider.external_ip(node) for node in head_nodes + worker_nodes] + ssh_user = config["auth"]["ssh_user"] + ssh_key = config["auth"]["ssh_private_key"] + + docker = None + docker_config = config.get("docker", None) + if docker_config: + docker = docker_config.get("container_name", None) + + cluster_name = config.get("cluster_name", None) + + return hosts, ssh_user, ssh_key, docker, cluster_name + + +def _info_from_params( + cluster: Optional[str] = None, + host: Optional[str] = None, + ssh_user: Optional[str] = None, + ssh_key: Optional[str] = None, + docker: Optional[str] = None, +): + """Parse command line arguments. + + Note: This returns a list of hosts, not a comma separated string! + """ + if not host and not cluster: + bootstrap_config = os.path.expanduser("~/ray_bootstrap_config.yaml") + if os.path.exists(bootstrap_config): + cluster = bootstrap_config + cli_logger.warning( + f"Detected cluster config file at {cluster}. 
" + f"If this is incorrect, specify with " + f"`ray cluster-dump `" + ) + elif cluster: + cluster = os.path.expanduser(cluster) + + cluster_name = None + + if cluster: + h, u, k, d, cluster_name = get_info_from_ray_cluster_config(cluster) + + ssh_user = ssh_user or u + ssh_key = ssh_key or k + docker = docker or d + hosts = host.split(",") if host else h + + if not hosts: + raise LocalCommandFailed( + f"Invalid cluster file or cluster has no running nodes: " f"{cluster}" + ) + elif host: + hosts = host.split(",") + else: + raise LocalCommandFailed( + "You need to either specify a `` or `--host`." + ) + + if not ssh_user: + ssh_user = DEFAULT_SSH_USER + cli_logger.warning( + f"Using default SSH user `{ssh_user}`. " + f"If this is incorrect, specify with `--ssh-user `" + ) + + if not ssh_key: + for cand_key in DEFAULT_SSH_KEYS: + cand_key_file = os.path.expanduser(cand_key) + if os.path.exists(cand_key_file): + ssh_key = cand_key_file + cli_logger.warning( + f"Auto detected SSH key file: {ssh_key}. 
" + f"If this is incorrect, specify with `--ssh-key `" + ) + break + + return cluster, hosts, ssh_user, ssh_key, docker, cluster_name diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..7d3d475f19e9890be347e6321d6bd2f328fd2528 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/command_runner.py @@ -0,0 +1,921 @@ +import hashlib +import json +import logging +import os +import subprocess +import sys +import time +from getpass import getuser +from shlex import quote +from typing import Dict, List + +import click + +from ray._private.ray_constants import DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES +from ray.autoscaler._private.cli_logger import cf, cli_logger +from ray.autoscaler._private.constants import ( + AUTOSCALER_NODE_SSH_INTERVAL_S, + AUTOSCALER_NODE_START_WAIT_S, + DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, +) +from ray.autoscaler._private.docker import ( + check_bind_mounts_cmd, + check_docker_image, + check_docker_running_cmd, + docker_start_cmds, + with_docker_exec, +) +from ray.autoscaler._private.log_timer import LogTimer +from ray.autoscaler._private.subprocess_output_util import ( + ProcessRunnerError, + is_output_redirected, + run_cmd_redirected, +) +from ray.autoscaler.command_runner import CommandRunnerInterface + +logger = logging.getLogger(__name__) + +# How long to wait for a node to start, in seconds +HASH_MAX_LENGTH = 10 +KUBECTL_RSYNC = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "_kubernetes/kubectl-rsync.sh" +) +MAX_HOME_RETRIES = 3 +HOME_RETRY_DELAY_S = 5 + +_config = {"use_login_shells": True, "silent_rsync": True} + + +def is_rsync_silent(): + return _config["silent_rsync"] + + +def set_rsync_silent(val): + """Choose whether to silence rsync output. 
+ + Most commands will want to list rsync'd files themselves rather than + print the default rsync spew. + """ + _config["silent_rsync"] = val + + +def is_using_login_shells(): + return _config["use_login_shells"] + + +def set_using_login_shells(val: bool): + """Choose between login and non-interactive shells. + + Non-interactive shells have the benefit of receiving less output from + subcommands (since progress bars and TTY control codes are not printed). + Sometimes this can be significant since e.g. `pip install` prints + hundreds of progress bar lines when downloading. + + Login shells have the benefit of working very close to how a proper bash + session does, regarding how scripts execute and how the environment is + setup. This is also how all commands were ran in the past. The only reason + to use login shells over non-interactive shells is if you need some weird + and non-robust tool to work. + + Args: + val: If true, login shells will be used to run all commands. + """ + _config["use_login_shells"] = val + + +def _with_environment_variables(cmd: str, environment_variables: Dict[str, object]): + """Prepend environment variables to a shell command. + + Args: + cmd: The base command. + environment_variables (Dict[str, object]): The set of environment + variables. If an environment variable value is a dict, it will + automatically be converted to a one line yaml string. 
+ """ + + as_strings = [] + for key, val in environment_variables.items(): + val = json.dumps(val, separators=(",", ":")) + s = "export {}={};".format(key, quote(val)) + as_strings.append(s) + all_vars = "".join(as_strings) + return all_vars + cmd + + +def _with_interactive(cmd): + force_interactive = ( + f"source ~/.bashrc; " + f"export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && ({cmd})" + ) + return ["bash", "--login", "-c", "-i", quote(force_interactive)] + + +class SSHOptions: + def __init__(self, ssh_key, control_path=None, **kwargs): + self.ssh_key = ssh_key + self.arg_dict = { + # Supresses initial fingerprint verification. + "StrictHostKeyChecking": "no", + # SSH IP and fingerprint pairs no longer added to known_hosts. + # This is to remove a "REMOTE HOST IDENTIFICATION HAS CHANGED" + # warning if a new node has the same IP as a previously + # deleted node, because the fingerprints will not match in + # that case. + "UserKnownHostsFile": os.devnull, + # Try fewer extraneous key pairs. + "IdentitiesOnly": "yes", + # Abort if port forwarding fails (instead of just printing to + # stderr). + "ExitOnForwardFailure": "yes", + # Quickly kill the connection if network connection breaks (as + # opposed to hanging/blocking). 
+ "ServerAliveInterval": 5, + "ServerAliveCountMax": 3, + } + if control_path: + self.arg_dict.update( + { + "ControlMaster": "auto", + "ControlPath": "{}/%C".format(control_path), + "ControlPersist": "10s", + } + ) + self.arg_dict.update(kwargs) + + def to_ssh_options_list(self, *, timeout=60): + self.arg_dict["ConnectTimeout"] = "{}s".format(timeout) + ssh_key_option = ["-i", self.ssh_key] if self.ssh_key else [] + return ssh_key_option + [ + x + for y in ( + ["-o", "{}={}".format(k, v)] + for k, v in self.arg_dict.items() + if v is not None + ) + for x in y + ] + + +class SSHCommandRunner(CommandRunnerInterface): + def __init__( + self, + log_prefix, + node_id, + provider, + auth_config, + cluster_name, + process_runner, + use_internal_ip, + ): + + ssh_control_hash = hashlib.sha1(cluster_name.encode()).hexdigest() + ssh_user_hash = hashlib.sha1(getuser().encode()).hexdigest() + ssh_control_path = "/tmp/ray_ssh_{}/{}".format( + ssh_user_hash[:HASH_MAX_LENGTH], ssh_control_hash[:HASH_MAX_LENGTH] + ) + + self.cluster_name = cluster_name + self.log_prefix = log_prefix + self.process_runner = process_runner + self.node_id = node_id + self.use_internal_ip = use_internal_ip + self.provider = provider + self.ssh_private_key = auth_config.get("ssh_private_key") + self.ssh_user = auth_config["ssh_user"] + self.ssh_control_path = ssh_control_path + self.ssh_ip = None + self.ssh_proxy_command = auth_config.get("ssh_proxy_command", None) + self.ssh_options = SSHOptions( + self.ssh_private_key, + self.ssh_control_path, + ProxyCommand=self.ssh_proxy_command, + ) + + def _get_node_ip(self): + if self.use_internal_ip: + return self.provider.internal_ip(self.node_id) + else: + return self.provider.external_ip(self.node_id) + + def _wait_for_ip(self, deadline): + # if we have IP do not print waiting info + ip = self._get_node_ip() + if ip is not None: + cli_logger.labeled_value("Fetched IP", ip) + return ip + + interval = AUTOSCALER_NODE_SSH_INTERVAL_S + with 
cli_logger.group("Waiting for IP"): + while time.time() < deadline and not self.provider.is_terminated( + self.node_id + ): + ip = self._get_node_ip() + if ip is not None: + cli_logger.labeled_value("Received", ip) + return ip + cli_logger.print( + "Not yet available, retrying in {} seconds", cf.bold(str(interval)) + ) + time.sleep(interval) + + return None + + def _set_ssh_ip_if_required(self): + if self.ssh_ip is not None: + return + + # We assume that this never changes. + # I think that's reasonable. + deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S + with LogTimer(self.log_prefix + "Got IP"): + ip = self._wait_for_ip(deadline) + + cli_logger.doassert(ip is not None, "Could not get node IP.") # todo: msg + assert ip is not None, "Unable to find IP of node" + + self.ssh_ip = ip + + # This should run before any SSH commands and therefore ensure that + # the ControlPath directory exists, allowing SSH to maintain + # persistent sessions later on. + try: + os.makedirs(self.ssh_control_path, mode=0o700, exist_ok=True) + except OSError as e: + cli_logger.warning("{}", str(e)) # todo: msg + + def _run_helper( + self, final_cmd, with_output=False, exit_on_fail=False, silent=False + ): + """Run a command that was already setup with SSH and `bash` settings. + + Args: + cmd (List[str]): + Full command to run. Should include SSH options and other + processing that we do. + with_output (bool): + If `with_output` is `True`, command stdout will be captured and + returned. + exit_on_fail (bool): + If `exit_on_fail` is `True`, the process will exit + if the command fails (exits with a code other than 0). + + Raises: + ProcessRunnerError if using new log style and disabled + login shells. + click.ClickException if using login shells. + """ + try: + # For now, if the output is needed we just skip the new logic. + # In the future we could update the new logic to support + # capturing output, but it is probably not needed. 
+ if not with_output: + return run_cmd_redirected( + final_cmd, + process_runner=self.process_runner, + silent=silent, + use_login_shells=is_using_login_shells(), + ) + else: + return self.process_runner.check_output(final_cmd) + except subprocess.CalledProcessError as e: + joined_cmd = " ".join(final_cmd) + if not is_using_login_shells(): + raise ProcessRunnerError( + "Command failed", + "ssh_command_failed", + code=e.returncode, + command=joined_cmd, + ) + + if exit_on_fail: + raise click.ClickException( + "Command failed:\n\n {}\n".format(joined_cmd) + ) from None + else: + fail_msg = "SSH command failed." + if is_output_redirected(): + fail_msg += " See above for the output from the failure." + raise click.ClickException(fail_msg) from None + finally: + # Do our best to flush output to terminal. + # See https://github.com/ray-project/ray/pull/19473. + sys.stdout.flush() + sys.stderr.flush() + + def run( + self, + cmd, + timeout=120, + exit_on_fail=False, + port_forward=None, + with_output=False, + environment_variables: Dict[str, object] = None, + run_env="auto", # Unused argument. 
+ ssh_options_override_ssh_key="", + shutdown_after_run=False, + silent=False, + ): + if shutdown_after_run: + cmd += "; sudo shutdown -h now" + + if ssh_options_override_ssh_key: + if self.ssh_proxy_command: + ssh_options = SSHOptions( + ssh_options_override_ssh_key, ProxyCommand=self.ssh_proxy_command + ) + else: + ssh_options = SSHOptions(ssh_options_override_ssh_key) + else: + ssh_options = self.ssh_options + + assert isinstance( + ssh_options, SSHOptions + ), "ssh_options must be of type SSHOptions, got {}".format(type(ssh_options)) + + self._set_ssh_ip_if_required() + + if is_using_login_shells(): + ssh = ["ssh", "-tt"] + else: + ssh = ["ssh"] + + if port_forward: + with cli_logger.group("Forwarding ports"): + if not isinstance(port_forward, list): + port_forward = [port_forward] + for local, remote in port_forward: + cli_logger.verbose( + "Forwarding port {} to port {} on localhost.", + cf.bold(local), + cf.bold(remote), + ) # todo: msg + ssh += ["-L", "{}:localhost:{}".format(local, remote)] + + final_cmd = ( + ssh + + ssh_options.to_ssh_options_list(timeout=timeout) + + ["{}@{}".format(self.ssh_user, self.ssh_ip)] + ) + if cmd: + if environment_variables: + cmd = _with_environment_variables(cmd, environment_variables) + if is_using_login_shells(): + final_cmd += _with_interactive(cmd) + else: + final_cmd += [cmd] + else: + # We do this because `-o ControlMaster` causes the `-N` flag to + # still create an interactive shell in some ssh versions. 
+ final_cmd.append("while true; do sleep 86400; done") + + cli_logger.verbose("Running `{}`", cf.bold(cmd)) + with cli_logger.indented(): + cli_logger.very_verbose( + "Full command is `{}`", cf.bold(" ".join(final_cmd)) + ) + + if cli_logger.verbosity > 0: + with cli_logger.indented(): + return self._run_helper( + final_cmd, with_output, exit_on_fail, silent=silent + ) + else: + return self._run_helper(final_cmd, with_output, exit_on_fail, silent=silent) + + def _create_rsync_filter_args(self, options): + rsync_excludes = options.get("rsync_exclude") or [] + rsync_filters = options.get("rsync_filter") or [] + + exclude_args = [ + ["--exclude", rsync_exclude] for rsync_exclude in rsync_excludes + ] + filter_args = [ + ["--filter", "dir-merge,- {}".format(rsync_filter)] + for rsync_filter in rsync_filters + ] + + # Combine and flatten the two lists + return [arg for args_list in exclude_args + filter_args for arg in args_list] + + def run_rsync_up(self, source, target, options=None): + self._set_ssh_ip_if_required() + options = options or {} + + command = ["rsync"] + command += [ + "--rsh", + subprocess.list2cmdline( + ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) + ), + ] + command += ["-avz"] + command += self._create_rsync_filter_args(options=options) + command += [source, "{}@{}:{}".format(self.ssh_user, self.ssh_ip, target)] + cli_logger.verbose("Running `{}`", cf.bold(" ".join(command))) + self._run_helper(command, silent=is_rsync_silent()) + + def run_rsync_down(self, source, target, options=None): + self._set_ssh_ip_if_required() + + command = ["rsync"] + command += [ + "--rsh", + subprocess.list2cmdline( + ["ssh"] + self.ssh_options.to_ssh_options_list(timeout=120) + ), + ] + command += ["-avz"] + command += self._create_rsync_filter_args(options=options) + command += ["{}@{}:{}".format(self.ssh_user, self.ssh_ip, source), target] + cli_logger.verbose("Running `{}`", cf.bold(" ".join(command))) + self._run_helper(command, 
silent=is_rsync_silent()) + + def remote_shell_command_str(self): + if self.ssh_private_key: + return "ssh -o IdentitiesOnly=yes -i {} {}@{}\n".format( + self.ssh_private_key, self.ssh_user, self.ssh_ip + ) + else: + return "ssh -o IdentitiesOnly=yes {}@{}\n".format( + self.ssh_user, self.ssh_ip + ) + + +class DockerCommandRunner(CommandRunnerInterface): + def __init__(self, docker_config, **common_args): + self.ssh_command_runner = SSHCommandRunner(**common_args) + self.container_name = docker_config["container_name"] + self.docker_config = docker_config + self.home_dir = None + self.initialized = False + # Optionally use 'podman' instead of 'docker' + use_podman = docker_config.get("use_podman", False) + self.docker_cmd = "podman" if use_podman else "docker" + + def run( + self, + cmd, + timeout=120, + exit_on_fail=False, + port_forward=None, + with_output=False, + environment_variables: Dict[str, object] = None, + run_env="auto", + ssh_options_override_ssh_key="", + shutdown_after_run=False, + ): + if run_env == "auto": + run_env = ( + "host" + if (not bool(cmd) or cmd.find(self.docker_cmd) == 0) + else self.docker_cmd + ) + + if environment_variables: + cmd = _with_environment_variables(cmd, environment_variables) + + if run_env == "docker": + cmd = self._docker_expand_user(cmd, any_char=True) + if is_using_login_shells(): + cmd = " ".join(_with_interactive(cmd)) + cmd = with_docker_exec( + [cmd], + container_name=self.container_name, + with_interactive=is_using_login_shells(), + docker_cmd=self.docker_cmd, + )[0] + + if shutdown_after_run: + # sudo shutdown should run after `with_docker_exec` command above + cmd += "; sudo shutdown -h now" + # Do not pass shutdown_after_run argument to ssh_command_runner.run() + # since it is handled above. 
+ return self.ssh_command_runner.run( + cmd, + timeout=timeout, + exit_on_fail=exit_on_fail, + port_forward=port_forward, + with_output=with_output, + ssh_options_override_ssh_key=ssh_options_override_ssh_key, + ) + + def run_rsync_up(self, source, target, options=None): + options = options or {} + host_destination = os.path.join( + self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name), + target.lstrip("/"), + ) + + host_mount_location = os.path.dirname(host_destination.rstrip("/")) + self.ssh_command_runner.run( + f"mkdir -p {host_mount_location} && chown -R " + f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + silent=is_rsync_silent(), + ) + + self.ssh_command_runner.run_rsync_up(source, host_destination, options=options) + if self._check_container_status() and not options.get( + "docker_mount_if_possible", False + ): + if os.path.isdir(source): + # Adding a "." means that docker copies the *contents* + # Without it, docker copies the source *into* the target + host_destination += "/." + + # This path may not exist inside the container. This ensures + # that the path is created! 
+ prefix = with_docker_exec( + [ + "mkdir -p {}".format( + os.path.dirname(self._docker_expand_user(target)) + ) + ], + container_name=self.container_name, + with_interactive=is_using_login_shells(), + docker_cmd=self.docker_cmd, + )[0] + + self.ssh_command_runner.run( + "{} && rsync -e '{} exec -i' -avz {} {}:{}".format( + prefix, + self.docker_cmd, + host_destination, + self.container_name, + self._docker_expand_user(target), + ), + silent=is_rsync_silent(), + ) + + def run_rsync_down(self, source, target, options=None): + options = options or {} + host_source = os.path.join( + self._get_docker_host_mount_location(self.ssh_command_runner.cluster_name), + source.lstrip("/"), + ) + host_mount_location = os.path.dirname(host_source.rstrip("/")) + self.ssh_command_runner.run( + f"mkdir -p {host_mount_location} && chown -R " + f"{self.ssh_command_runner.ssh_user} {host_mount_location}", + silent=is_rsync_silent(), + ) + if source[-1] == "/": + source += "." + # Adding a "." means that docker copies the *contents* + # Without it, docker copies the source *into* the target + if not options.get("docker_mount_if_possible", False): + # NOTE: `--delete` is okay here because the container is the source + # of truth. 
+ self.ssh_command_runner.run( + "rsync -e '{} exec -i' -avz --delete {}:{} {}".format( + self.docker_cmd, + self.container_name, + self._docker_expand_user(source), + host_source, + ), + silent=is_rsync_silent(), + ) + self.ssh_command_runner.run_rsync_down(host_source, target, options=options) + + def remote_shell_command_str(self): + inner_str = ( + self.ssh_command_runner.remote_shell_command_str() + .replace("ssh", "ssh -tt", 1) + .strip("\n") + ) + return inner_str + " {} exec -it {} /bin/bash\n".format( + self.docker_cmd, self.container_name + ) + + def _check_docker_installed(self): + no_exist = "NoExist" + output = self.ssh_command_runner.run( + f"command -v {self.docker_cmd} || echo '{no_exist}'", with_output=True + ) + cleaned_output = output.decode().strip() + if no_exist in cleaned_output or "docker" not in cleaned_output: + if self.docker_cmd == "docker": + install_commands = [ + "curl -fsSL https://get.docker.com -o get-docker.sh", + "sudo sh get-docker.sh", + "sudo usermod -aG docker $USER", + "sudo systemctl restart docker -f", + ] + else: + install_commands = [ + "sudo apt-get update", + "sudo apt-get -y install podman", + ] + + logger.error( + f"{self.docker_cmd.capitalize()} not installed. 
You can " + f"install {self.docker_cmd.capitalize()} by adding the " + "following commands to 'initialization_commands':\n" + + "\n".join(install_commands) + ) + + def _check_container_status(self): + if self.initialized: + return True + output = ( + self.ssh_command_runner.run( + check_docker_running_cmd(self.container_name, self.docker_cmd), + with_output=True, + ) + .decode("utf-8") + .strip() + ) + # Checks for the false positive where "true" is in the container name + return "true" in output.lower() and "no such object" not in output.lower() + + def _docker_expand_user(self, string, any_char=False): + user_pos = string.find("~") + if user_pos > -1: + if self.home_dir is None: + self.home_dir = ( + self.ssh_command_runner.run( + f"{self.docker_cmd} exec {self.container_name} " + "printenv HOME", + with_output=True, + ) + .decode("utf-8") + .strip() + ) + + if any_char: + return string.replace("~/", self.home_dir + "/") + + elif not any_char and user_pos == 0: + return string.replace("~", self.home_dir, 1) + + return string + + def _check_if_container_restart_is_needed( + self, image: str, cleaned_bind_mounts: Dict[str, str] + ) -> bool: + re_init_required = False + running_image = ( + self.run( + check_docker_image(self.container_name, self.docker_cmd), + with_output=True, + run_env="host", + ) + .decode("utf-8") + .strip() + ) + if running_image != image: + cli_logger.error( + "A container with name {} is running image {} instead " + + "of {} (which was provided in the YAML)", + self.container_name, + running_image, + image, + ) + mounts = ( + self.run( + check_bind_mounts_cmd(self.container_name, self.docker_cmd), + with_output=True, + run_env="host", + ) + .decode("utf-8") + .strip() + ) + try: + active_mounts = json.loads(mounts) + active_remote_mounts = { + mnt["Destination"].strip("/") for mnt in active_mounts + } + # Ignore ray bootstrap files. 
+ requested_remote_mounts = { + self._docker_expand_user(remote).strip("/") + for remote in cleaned_bind_mounts.keys() + } + unfulfilled_mounts = requested_remote_mounts - active_remote_mounts + if unfulfilled_mounts: + re_init_required = True + cli_logger.warning( + "This Docker Container is already running. " + "Restarting the Docker container on " + "this node to pick up the following file_mounts {}", + unfulfilled_mounts, + ) + except json.JSONDecodeError: + cli_logger.verbose( + "Unable to check if file_mounts specified in the YAML " + "differ from those on the running container." + ) + return re_init_required + + def run_init( + self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool + ): + BOOTSTRAP_MOUNTS = ["~/ray_bootstrap_config.yaml", "~/ray_bootstrap_key.pem"] + + specific_image = self.docker_config.get( + f"{'head' if as_head else 'worker'}_image", self.docker_config.get("image") + ) + + self._check_docker_installed() + if self.docker_config.get("pull_before_run", True): + assert specific_image, ( + "Image must be included in config if " + "pull_before_run is specified" + ) + self.run( + "{} pull {}".format(self.docker_cmd, specific_image), run_env="host" + ) + else: + + self.run( + f"{self.docker_cmd} image inspect {specific_image} " + "1> /dev/null 2>&1 || " + f"{self.docker_cmd} pull {specific_image}" + ) + + # Bootstrap files cannot be bind mounted because docker opens the + # underlying inode. When the file is switched, docker becomes outdated. 
+ cleaned_bind_mounts = file_mounts.copy() + for mnt in BOOTSTRAP_MOUNTS: + cleaned_bind_mounts.pop(mnt, None) + + docker_run_executed = False + + container_running = self._check_container_status() + requires_re_init = False + if container_running: + requires_re_init = self._check_if_container_restart_is_needed( + specific_image, cleaned_bind_mounts + ) + if requires_re_init: + self.run( + f"{self.docker_cmd} stop {self.container_name}", run_env="host" + ) + + if (not container_running) or requires_re_init: + if not sync_run_yet: + # Do not start the actual image as we need to run file_sync + # first to ensure that all folders are created with the + # correct ownership. Docker will create the folders with + # `root` as the owner. + return True + # Get home directory + image_env = ( + self.ssh_command_runner.run( + f"{self.docker_cmd} " + + "inspect -f '{{json .Config.Env}}' " + + specific_image, + with_output=True, + ) + .decode() + .strip() + ) + home_directory = "/root" + try: + for env_var in json.loads(image_env): + if env_var.startswith("HOME="): + home_directory = env_var.split("HOME=")[1] + break + except json.JSONDecodeError as e: + cli_logger.error( + "Unable to deserialize `image_env` to Python object. " + f"The `image_env` is:\n{image_env}" + ) + raise e + + user_docker_run_options = self.docker_config.get( + "run_options", [] + ) + self.docker_config.get( + f"{'head' if as_head else 'worker'}_run_options", [] + ) + start_command = docker_start_cmds( + self.ssh_command_runner.ssh_user, + specific_image, + cleaned_bind_mounts, + self.container_name, + self._configure_runtime( + self._auto_configure_shm(user_docker_run_options) + ), + self.ssh_command_runner.cluster_name, + home_directory, + self.docker_cmd, + ) + self.run(start_command, run_env="host") + docker_run_executed = True + + # Explicitly copy in ray bootstrap files. 
+ for mount in BOOTSTRAP_MOUNTS: + if mount in file_mounts: + if not sync_run_yet: + # NOTE(ilr) This rsync is needed because when starting from + # a stopped instance, /tmp may be deleted and `run_init` + # is called before the first `file_sync` happens + self.run_rsync_up(file_mounts[mount], mount) + self.ssh_command_runner.run( + "rsync -e '{cmd} exec -i' -avz {src} {container}:{dst}".format( + cmd=self.docker_cmd, + src=os.path.join( + self._get_docker_host_mount_location( + self.ssh_command_runner.cluster_name + ), + mount, + ), + container=self.container_name, + dst=self._docker_expand_user(mount), + ) + ) + try: + # Check if the current user has read permission. + # If they do not, try to change ownership! + self.run( + f"cat {mount} >/dev/null 2>&1 || " + f"sudo chown $(id -u):$(id -g) {mount}" + ) + except Exception: + lsl_string = ( + self.run(f"ls -l {mount}", with_output=True) + .decode("utf-8") + .strip() + ) + # The string is of format + # + permissions = lsl_string.split(" ")[0] + owner = lsl_string.split(" ")[2] + group = lsl_string.split(" ")[3] + current_user = ( + self.run("whoami", with_output=True).decode("utf-8").strip() + ) + cli_logger.warning( + f"File ({mount}) is owned by user:{owner} and group:" + f"{group} with permissions ({permissions}). The " + f"current user ({current_user}) does not have " + "permission to read these files, and Ray may not be " + "able to autoscale. This can be resolved by " + "installing `sudo` in your container, or adding a " + f"command like 'chown {current_user} {mount}' to " + "your `setup_commands`." 
+ ) + self.initialized = True + return docker_run_executed + + def _configure_runtime(self, run_options: List[str]) -> List[str]: + if self.docker_config.get("disable_automatic_runtime_detection"): + return run_options + + runtime_output = ( + self.ssh_command_runner.run( + f"{self.docker_cmd} " + "info -f '{{.Runtimes}}' ", with_output=True + ) + .decode() + .strip() + ) + if "nvidia-container-runtime" in runtime_output: + try: + self.ssh_command_runner.run("nvidia-smi", with_output=False) + return run_options + ["--runtime=nvidia"] + except Exception as e: + logger.warning( + "Nvidia Container Runtime is present, but no GPUs found." + ) + logger.debug(f"nvidia-smi error: {e}") + return run_options + + return run_options + + def _auto_configure_shm(self, run_options: List[str]) -> List[str]: + if self.docker_config.get("disable_shm_size_detection"): + return run_options + for run_opt in run_options: + if "--shm-size" in run_opt: + logger.info( + "Bypassing automatic SHM-Detection because of " + f"`run_option`: {run_opt}" + ) + return run_options + try: + shm_output = ( + self.ssh_command_runner.run( + "cat /proc/meminfo || true", with_output=True + ) + .decode() + .strip() + ) + available_memory = int( + [ln for ln in shm_output.split("\n") if "MemAvailable" in ln][ + 0 + ].split()[1] + ) + available_memory_bytes = available_memory * 1024 + # Overestimate SHM size by 10% + shm_size = min( + (available_memory_bytes * DEFAULT_OBJECT_STORE_MEMORY_PROPORTION * 1.1), + DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES, + ) + return run_options + [f"--shm-size='{shm_size}b'"] + except Exception as e: + logger.warning(f"Received error while trying to auto-compute SHM size {e}") + return run_options + + def _get_docker_host_mount_location(self, cluster_name: str) -> str: + """Return the docker host mount directory location.""" + # Imported here due to circular dependency in imports. 
+ from ray.autoscaler.sdk import get_docker_host_mount_location + + return get_docker_host_mount_location(cluster_name) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9b9d91cc2fb30282e041bb03cf4170ad96dd42 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/commands.py @@ -0,0 +1,1631 @@ +import copy +import datetime +import hashlib +import json +import logging +import os +import random +import shutil +import subprocess +import sys +import tempfile +import time +from concurrent.futures import ThreadPoolExecutor +from types import ModuleType +from typing import Any, Dict, List, Optional, Tuple, Union + +import click +import yaml + +import ray +from ray._private.usage import usage_lib +from ray.autoscaler._private import subprocess_output_util as cmd_output_util +from ray.autoscaler._private.autoscaler import AutoscalerSummary +from ray.autoscaler._private.cli_logger import cf, cli_logger +from ray.autoscaler._private.cluster_dump import ( + Archive, + GetParameters, + Node, + _info_from_params, + create_archive_for_local_and_remote_nodes, + create_archive_for_remote_nodes, + get_all_local_data, +) +from ray.autoscaler._private.command_runner import ( + set_rsync_silent, + set_using_login_shells, +) +from ray.autoscaler._private.constants import ( + AUTOSCALER_RESOURCE_REQUEST_CHANNEL, + MAX_PARALLEL_SHUTDOWN_WORKERS, +) +from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system +from ray.autoscaler._private.log_timer import LogTimer +from ray.autoscaler._private.node_provider_availability_tracker import ( + NodeAvailabilitySummary, +) +from ray.autoscaler._private.providers import ( + _NODE_PROVIDERS, + _PROVIDER_PRETTY_NAMES, + _get_node_provider, +) +from ray.autoscaler._private.updater import NodeUpdaterThread +from 
ray.autoscaler._private.util import ( + LoadMetricsSummary, + format_info_string, + hash_launch_conf, + hash_runtime_conf, + prepare_config, + validate_config, +) +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_WORKER, + STATUS_UNINITIALIZED, + STATUS_UP_TO_DATE, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) +from ray.experimental.internal_kv import _internal_kv_put, internal_kv_get_gcs_client +from ray.util.debug import log_once + +try: # py3 + from shlex import quote +except ImportError: # py2 + from pipes import quote + + +logger = logging.getLogger(__name__) + +RUN_ENV_TYPES = ["auto", "host", "docker"] + +POLL_INTERVAL = 5 + +Port_forward = Union[Tuple[int, int], List[Tuple[int, int]]] + + +def try_logging_config(config: Dict[str, Any]) -> None: + if config["provider"]["type"] == "aws": + from ray.autoscaler._private.aws.config import log_to_cli + + log_to_cli(config) + + +def try_get_log_state(provider_config: Dict[str, Any]) -> Optional[dict]: + if provider_config["type"] == "aws": + from ray.autoscaler._private.aws.config import get_log_state + + return get_log_state() + return None + + +def try_reload_log_state(provider_config: Dict[str, Any], log_state: dict) -> None: + if not log_state: + return + if provider_config["type"] == "aws": + from ray.autoscaler._private.aws.config import reload_log_state + + return reload_log_state(log_state) + + +def debug_status( + status, error, verbose: bool = False, address: Optional[str] = None +) -> str: + """ + Return a debug string for the autoscaler. + + Args: + status: The autoscaler status string for v1 + error: The autoscaler error string for v1 + verbose: Whether to print verbose information. + address: The address of the cluster (gcs address). + + Returns: + str: A debug string for the cluster's status. 
+ """ + from ray.autoscaler.v2.utils import is_autoscaler_v2 + + if is_autoscaler_v2(): + from ray.autoscaler.v2.sdk import get_cluster_status + from ray.autoscaler.v2.utils import ClusterStatusFormatter + + cluster_status = get_cluster_status(address) + status = ClusterStatusFormatter.format(cluster_status, verbose=verbose) + elif status: + status = status.decode("utf-8") + status_dict = json.loads(status) + lm_summary_dict = status_dict.get("load_metrics_report") + autoscaler_summary_dict = status_dict.get("autoscaler_report") + timestamp = status_dict.get("time") + gcs_request_time = status_dict.get("gcs_request_time") + non_terminated_nodes_time = status_dict.get("non_terminated_nodes_time") + if lm_summary_dict and autoscaler_summary_dict and timestamp: + lm_summary = LoadMetricsSummary(**lm_summary_dict) + node_availability_summary_dict = autoscaler_summary_dict.pop( + "node_availability_summary", {} + ) + node_availability_summary = NodeAvailabilitySummary.from_fields( + **node_availability_summary_dict + ) + autoscaler_summary = AutoscalerSummary( + node_availability_summary=node_availability_summary, + **autoscaler_summary_dict, + ) + report_time = datetime.datetime.fromtimestamp(timestamp) + status = format_info_string( + lm_summary, + autoscaler_summary, + time=report_time, + gcs_request_time=gcs_request_time, + non_terminated_nodes_time=non_terminated_nodes_time, + verbose=verbose, + ) + else: + status = ( + "No cluster status. It may take a few seconds " + "for the Ray internal services to start up." + ) + else: + status = ( + "No cluster status. It may take a few seconds " + "for the Ray internal services to start up." + ) + + if error: + status += "\n" + status += error.decode("utf-8") + + return status + + +def request_resources( + num_cpus: Optional[int] = None, bundles: Optional[List[dict]] = None +) -> None: + """Remotely request some CPU or GPU resources from the autoscaler. + + This function is to be called e.g. 
on a node before submitting a bunch of + ray.remote calls to ensure that resources rapidly become available. + + Args: + num_cpus: Scale the cluster to ensure this number of CPUs are + available. This request is persistent until another call to + request_resources() is made. + bundles (List[ResourceDict]): Scale the cluster to ensure this set of + resource shapes can fit. This request is persistent until another + call to request_resources() is made. + """ + if not ray.is_initialized(): + raise RuntimeError("Ray is not initialized yet") + to_request = [] + if num_cpus: + to_request += [{"CPU": 1}] * num_cpus + if bundles: + to_request += bundles + _internal_kv_put( + AUTOSCALER_RESOURCE_REQUEST_CHANNEL, json.dumps(to_request), overwrite=True + ) + + from ray.autoscaler.v2.utils import is_autoscaler_v2 + + if is_autoscaler_v2(): + from ray.autoscaler.v2.sdk import request_cluster_resources + + gcs_address = internal_kv_get_gcs_client().address + request_cluster_resources(gcs_address, to_request) + + +def create_or_update_cluster( + config_file: str, + override_min_workers: Optional[int], + override_max_workers: Optional[int], + no_restart: bool, + restart_only: bool, + yes: bool, + override_cluster_name: Optional[str] = None, + no_config_cache: bool = False, + redirect_command_output: Optional[bool] = False, + use_login_shells: bool = True, + no_monitor_on_head: bool = False, +) -> Dict[str, Any]: + """Creates or updates an autoscaling Ray cluster from a config json.""" + # no_monitor_on_head is an internal flag used by the Ray K8s operator. + # If True, prevents autoscaling config sync to the Ray head during cluster + # creation. See https://github.com/ray-project/ray/pull/13720. + set_using_login_shells(use_login_shells) + if not use_login_shells: + cmd_output_util.set_allow_interactive(False) + if redirect_command_output is None: + # Do not redirect by default. 
+ cmd_output_util.set_output_redirected(False) + else: + cmd_output_util.set_output_redirected(redirect_command_output) + + def handle_yaml_error(e): + cli_logger.error("Cluster config invalid") + cli_logger.newline() + cli_logger.error("Failed to load YAML file " + cf.bold("{}"), config_file) + cli_logger.newline() + with cli_logger.verbatim_error_ctx("PyYAML error:"): + cli_logger.error(e) + cli_logger.abort() + + try: + config = yaml.safe_load(open(config_file).read()) + except FileNotFoundError: + cli_logger.abort( + "Provided cluster configuration file ({}) does not exist", + cf.bold(config_file), + ) + except yaml.parser.ParserError as e: + handle_yaml_error(e) + raise + except yaml.scanner.ScannerError as e: + handle_yaml_error(e) + raise + global_event_system.execute_callback( + CreateClusterEvent.up_started, {"cluster_config": config} + ) + + # todo: validate file_mounts, ssh keys, etc. + + importer = _NODE_PROVIDERS.get(config["provider"]["type"]) + if not importer: + cli_logger.abort( + "Unknown provider type " + cf.bold("{}") + "\n" + "Available providers are: {}", + config["provider"]["type"], + cli_logger.render_list( + [k for k in _NODE_PROVIDERS.keys() if _NODE_PROVIDERS[k] is not None] + ), + ) + + printed_overrides = False + + def handle_cli_override(key, override): + if override is not None: + if key in config: + nonlocal printed_overrides + printed_overrides = True + cli_logger.warning( + "`{}` override provided on the command line.\n" + " Using " + + cf.bold("{}") + + cf.dimmed(" [configuration file has " + cf.bold("{}") + "]"), + key, + override, + config[key], + ) + config[key] = override + + handle_cli_override("min_workers", override_min_workers) + handle_cli_override("max_workers", override_max_workers) + handle_cli_override("cluster_name", override_cluster_name) + + if printed_overrides: + cli_logger.newline() + + cli_logger.labeled_value("Cluster", config["cluster_name"]) + + cli_logger.newline() + config = _bootstrap_config(config, 
no_config_cache=no_config_cache) + + try_logging_config(config) + get_or_create_head_node( + config, + config_file, + no_restart, + restart_only, + yes, + override_cluster_name, + no_monitor_on_head, + ) + return config + + +CONFIG_CACHE_VERSION = 1 + + +def _bootstrap_config( + config: Dict[str, Any], no_config_cache: bool = False +) -> Dict[str, Any]: + config = prepare_config(config) + # NOTE: multi-node-type autoscaler is guaranteed to be in use after this. + + hasher = hashlib.sha1() + hasher.update(json.dumps([config], sort_keys=True).encode("utf-8")) + cache_key = os.path.join( + tempfile.gettempdir(), "ray-config-{}".format(hasher.hexdigest()) + ) + + if os.path.exists(cache_key) and not no_config_cache: + config_cache = json.loads(open(cache_key).read()) + if config_cache.get("_version", -1) == CONFIG_CACHE_VERSION: + # todo: is it fine to re-resolve? afaik it should be. + # we can have migrations otherwise or something + # but this seems overcomplicated given that resolving is + # relatively cheap + try_reload_log_state( + config_cache["config"]["provider"], + config_cache.get("provider_log_info"), + ) + + if log_once("_printed_cached_config_warning"): + cli_logger.verbose_warning( + "Loaded cached provider configuration from " + cf.bold("{}"), + cache_key, + ) + if cli_logger.verbosity == 0: + cli_logger.warning("Loaded cached provider configuration") + cli_logger.warning( + "If you experience issues with " + "the cloud provider, try re-running " + "the command with {}.", + cf.bold("--no-config-cache"), + ) + + return config_cache["config"] + else: + cli_logger.warning( + "Found cached cluster config " + "but the version " + cf.bold("{}") + " " + "(expected " + cf.bold("{}") + ") does not match.\n" + "This is normal if cluster launcher was updated.\n" + "Config will be re-resolved.", + config_cache.get("_version", "none"), + CONFIG_CACHE_VERSION, + ) + + importer = _NODE_PROVIDERS.get(config["provider"]["type"]) + if not importer: + raise 
NotImplementedError("Unsupported provider {}".format(config["provider"])) + + provider_cls = importer(config["provider"]) + + cli_logger.print( + "Checking {} environment settings", + _PROVIDER_PRETTY_NAMES.get(config["provider"]["type"]), + ) + try: + config = provider_cls.fillout_available_node_types_resources(config) + except Exception as exc: + if cli_logger.verbosity > 2: + logger.exception("Failed to autodetect node resources.") + else: + cli_logger.warning( + f"Failed to autodetect node resources: {str(exc)}. " + "You can see full stack trace with higher verbosity." + ) + + try: + # NOTE: if `resources` field is missing, validate_config for providers + # other than AWS and Kubernetes will fail (the schema error will ask + # the user to manually fill the resources) as we currently support + # autofilling resources for AWS and Kubernetes only. + validate_config(config) + except (ModuleNotFoundError, ImportError): + cli_logger.abort( + "Not all Ray autoscaler dependencies were found. " + "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will " + 'only be usable via `pip install "ray[default]"`. Please ' + "update your install command." 
+ ) + resolved_config = provider_cls.bootstrap_config(config) + + if not no_config_cache: + with open(cache_key, "w") as f: + config_cache = { + "_version": CONFIG_CACHE_VERSION, + "provider_log_info": try_get_log_state(resolved_config["provider"]), + "config": resolved_config, + } + f.write(json.dumps(config_cache)) + return resolved_config + + +def teardown_cluster( + config_file: str, + yes: bool, + workers_only: bool, + override_cluster_name: Optional[str], + keep_min_workers: bool, +) -> None: + """Destroys all nodes of a Ray cluster described by a config json.""" + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + + config = _bootstrap_config(config) + + cli_logger.confirm(yes, "Destroying cluster.", _abort=True) + + if not workers_only: + try: + exec_cluster( + config_file, + cmd="ray stop", + run_env="auto", + screen=False, + tmux=False, + stop=False, + start=False, + override_cluster_name=override_cluster_name, + port_forward=None, + with_output=False, + ) + except Exception as e: + # todo: add better exception info + cli_logger.verbose_error("{}", str(e)) + cli_logger.warning( + "Exception occurred when stopping the cluster Ray runtime " + "(use -v to dump teardown exceptions)." + ) + cli_logger.warning( + "Ignoring the exception and " + "attempting to shut down the cluster nodes anyway." + ) + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + + def remaining_nodes(): + workers = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) + + if keep_min_workers: + min_workers = config.get("min_workers", 0) + cli_logger.print( + "{} random worker nodes will not be shut down. 
" + + cf.dimmed("(due to {})"), + cf.bold(min_workers), + cf.bold("--keep-min-workers"), + ) + + workers = random.sample(workers, len(workers) - min_workers) + + # todo: it's weird to kill the head node but not all workers + if workers_only: + cli_logger.print( + "The head node will not be shut down. " + cf.dimmed("(due to {})"), + cf.bold("--workers-only"), + ) + + return workers + + head = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_HEAD}) + + return head + workers + + def run_docker_stop(node, container_name): + try: + updater = NodeUpdaterThread( + node_id=node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + ray_start_commands=[], + runtime_hash="", + file_mounts_contents_hash="", + is_head_node=False, + docker_config=config.get("docker"), + ) + + _exec( + updater, + f"docker stop {container_name}", + with_output=False, + run_env="host", + ) + except Exception: + cli_logger.warning(f"Docker stop failed on {node}") + + # Loop here to check that both the head and worker nodes are actually + # really gone + A = remaining_nodes() + + container_name = config.get("docker", {}).get("container_name") + if container_name: + # This is to ensure that the parallel SSH calls below do not mess with + # the users terminal. 
+ output_redir = cmd_output_util.is_output_redirected() + cmd_output_util.set_output_redirected(True) + allow_interactive = cmd_output_util.does_allow_interactive() + cmd_output_util.set_allow_interactive(False) + + with ThreadPoolExecutor(max_workers=MAX_PARALLEL_SHUTDOWN_WORKERS) as executor: + for node in A: + executor.submit( + run_docker_stop, node=node, container_name=container_name + ) + cmd_output_util.set_output_redirected(output_redir) + cmd_output_util.set_allow_interactive(allow_interactive) + with LogTimer("teardown_cluster: done."): + while A: + provider.terminate_nodes(A) + + cli_logger.print( + "Requested {} nodes to shut down.", + cf.bold(len(A)), + _tags=dict(interval="1s"), + ) + + time.sleep(POLL_INTERVAL) # todo: interval should be a variable + A = remaining_nodes() + cli_logger.print( + "{} nodes remaining after {} second(s).", cf.bold(len(A)), POLL_INTERVAL + ) + cli_logger.success("No nodes remaining.") + + +def kill_node( + config_file: str, yes: bool, hard: bool, override_cluster_name: Optional[str] +) -> Optional[str]: + """Kills a random Raylet worker.""" + + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + config = _bootstrap_config(config) + + cli_logger.confirm(yes, "A random node will be killed.") + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) + if not nodes: + cli_logger.print("No worker nodes detected.") + return None + node = random.choice(nodes) + cli_logger.print("Shutdown " + cf.bold("{}"), node) + if hard: + provider.terminate_node(node) + else: + updater = NodeUpdaterThread( + node_id=node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + ray_start_commands=[], 
# NOTE(review): reconstructed from a whitespace-mangled diff. This chunk opens
# mid-way through kill_node()'s NodeUpdaterThread(...) call, whose beginning is
# in the previous chunk.
            runtime_hash="",
            file_mounts_contents_hash="",
            is_head_node=False,
            docker_config=config.get("docker"),
        )

        # Positional args after the command are screen=False, tmux=False.
        _exec(updater, "ray stop", False, False)

    time.sleep(POLL_INTERVAL)

    if config.get("provider", {}).get("use_internal_ips", False):
        node_ip = provider.internal_ip(node)
    else:
        node_ip = provider.external_ip(node)

    return node_ip


def monitor_cluster(
    cluster_config_file: str, num_lines: int, override_cluster_name: Optional[str]
) -> None:
    """Tails the autoscaler logs of a Ray cluster."""
    # Follow the last `num_lines` of every monitor log in the latest session.
    cmd = f"tail -n {num_lines} -f /tmp/ray/session_latest/logs/monitor*"
    exec_cluster(
        cluster_config_file,
        cmd=cmd,
        run_env="auto",
        screen=False,
        tmux=False,
        stop=False,
        start=False,
        override_cluster_name=override_cluster_name,
        port_forward=None,
    )


def warn_about_bad_start_command(
    start_commands: List[str], no_monitor_on_head: bool = False
) -> None:
    """Warn when head start commands are missing `ray start` or its
    `--autoscaling-config` flag.

    Args:
        start_commands: The configured head start commands.
        no_monitor_on_head: If True, suppress the autoscaling-config warning
            (the monitor is not expected to run on the head).
    """
    ray_start_cmd = list(filter(lambda x: "ray start" in x, start_commands))
    if len(ray_start_cmd) == 0:
        # Without `ray start` the Ray runtime never comes up on the head.
        cli_logger.warning(
            "Ray runtime will not be started because `{}` is not in `{}`.",
            cf.bold("ray start"),
            cf.bold("head_start_ray_commands"),
        )

    autoscaling_config_in_ray_start_cmd = any(
        "autoscaling-config" in x for x in ray_start_cmd
    )
    if not (autoscaling_config_in_ray_start_cmd or no_monitor_on_head):
        # Without --autoscaling-config the head's monitor cannot launch workers.
        cli_logger.warning(
            "The head node will not launch any workers because "
            "`{}` does not have `{}` set.\n"
            "Potential fix: add `{}` to the `{}` command under `{}`.",
            cf.bold("ray start"),
            cf.bold("--autoscaling-config"),
            cf.bold("--autoscaling-config=~/ray_bootstrap_config.yaml"),
            cf.bold("ray start"),
            cf.bold("head_start_ray_commands"),
        )


# NOTE(review): get_or_create_head_node()'s signature is cut here by the diff
# mangling; it resumes ("subprocess, ...") in the next chunk.
def get_or_create_head_node(
    config: Dict[str, Any],
    printable_config_file: str,
    no_restart: bool,
    restart_only: bool,
    yes: bool,
    override_cluster_name: Optional[str],
    no_monitor_on_head: bool = False,
    _provider: Optional[NodeProvider] = None,
    _runner: ModuleType =
subprocess, +) -> None: + """Create the cluster head node, which in turn creates the workers.""" + global_event_system.execute_callback(CreateClusterEvent.cluster_booting_started) + provider = _provider or _get_node_provider( + config["provider"], config["cluster_name"] + ) + + config = copy.deepcopy(config) + head_node_tags = { + TAG_RAY_NODE_KIND: NODE_KIND_HEAD, + } + nodes = provider.non_terminated_nodes(head_node_tags) + if len(nodes) > 0: + head_node = nodes[0] + else: + head_node = None + + if not head_node: + cli_logger.confirm( + yes, "No head node found. Launching a new cluster.", _abort=True + ) + cli_logger.newline() + usage_lib.show_usage_stats_prompt(cli=True) + + if head_node: + if restart_only: + cli_logger.confirm( + yes, + "Updating cluster configuration and " + "restarting the cluster Ray runtime. " + "Setup commands will not be run due to `{}`.\n", + cf.bold("--restart-only"), + _abort=True, + ) + cli_logger.newline() + usage_lib.show_usage_stats_prompt(cli=True) + elif no_restart: + cli_logger.print( + "Cluster Ray runtime will not be restarted due to `{}`.", + cf.bold("--no-restart"), + ) + cli_logger.confirm( + yes, + "Updating cluster configuration and running setup commands.", + _abort=True, + ) + else: + cli_logger.print("Updating cluster configuration and running full setup.") + cli_logger.confirm( + yes, cf.bold("Cluster Ray runtime will be restarted."), _abort=True + ) + cli_logger.newline() + usage_lib.show_usage_stats_prompt(cli=True) + + cli_logger.newline() + # TODO(ekl) this logic is duplicated in node_launcher.py (keep in sync) + head_node_config = copy.deepcopy(config.get("head_node", {})) + # The above `head_node` field is deprecated in favor of per-node-type + # node_configs. We allow it for backwards-compatibility. 
+ head_node_resources = None + head_node_labels = None + head_node_type = config.get("head_node_type") + if head_node_type: + head_node_tags[TAG_RAY_USER_NODE_TYPE] = head_node_type + head_config = config["available_node_types"][head_node_type] + head_node_config.update(head_config["node_config"]) + + # Not necessary to keep in sync with node_launcher.py + # Keep in sync with autoscaler.py _node_resources + head_node_resources = head_config.get("resources") + head_node_labels = head_config.get("labels") + + launch_hash = hash_launch_conf(head_node_config, config["auth"]) + creating_new_head = _should_create_new_head( + head_node, launch_hash, head_node_type, provider + ) + if creating_new_head: + with cli_logger.group("Acquiring an up-to-date head node"): + global_event_system.execute_callback( + CreateClusterEvent.acquiring_new_head_node + ) + if head_node is not None: + cli_logger.confirm(yes, "Relaunching the head node.", _abort=True) + + provider.terminate_node(head_node) + cli_logger.print("Terminated head node {}", head_node) + + head_node_tags[TAG_RAY_LAUNCH_CONFIG] = launch_hash + head_node_tags[TAG_RAY_NODE_NAME] = "ray-{}-head".format( + config["cluster_name"] + ) + head_node_tags[TAG_RAY_NODE_STATUS] = STATUS_UNINITIALIZED + provider.create_node(head_node_config, head_node_tags, 1) + cli_logger.print("Launched a new head node") + + start = time.time() + head_node = None + with cli_logger.group("Fetching the new head node"): + while True: + if time.time() - start > 50: + cli_logger.abort( + "Head node fetch timed out. Failed to create head node." 
+ ) + nodes = provider.non_terminated_nodes(head_node_tags) + if len(nodes) == 1: + head_node = nodes[0] + break + time.sleep(POLL_INTERVAL) + cli_logger.newline() + + global_event_system.execute_callback(CreateClusterEvent.head_node_acquired) + + with cli_logger.group( + "Setting up head node", + _numbered=("<>", 1, 1), + # cf.bold(provider.node_tags(head_node)[TAG_RAY_NODE_NAME]), + _tags=dict(), + ): # add id, ARN to tags? + # TODO(ekl) right now we always update the head node even if the + # hash matches. + # We could prompt the user for what they want to do here. + # No need to pass in cluster_sync_files because we use this + # hash to set up the head node + (runtime_hash, file_mounts_contents_hash) = hash_runtime_conf( + config["file_mounts"], None, config + ) + + if not no_monitor_on_head: + # Return remote_config_file to avoid prematurely closing it. + config, remote_config_file = _set_up_config_for_head_node( + config, provider, no_restart + ) + cli_logger.print("Prepared bootstrap config") + + if restart_only: + # Docker may re-launch nodes, requiring setup + # commands to be rerun. + if config.get("docker", {}).get("container_name"): + setup_commands = config["head_setup_commands"] + else: + setup_commands = [] + ray_start_commands = config["head_start_ray_commands"] + # If user passed in --no-restart and we're not creating a new head, + # omit start commands. 
+ elif no_restart and not creating_new_head: + setup_commands = config["head_setup_commands"] + ray_start_commands = [] + else: + setup_commands = config["head_setup_commands"] + ray_start_commands = config["head_start_ray_commands"] + + if not no_restart: + warn_about_bad_start_command(ray_start_commands, no_monitor_on_head) + + updater = NodeUpdaterThread( + node_id=head_node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=config["initialization_commands"], + setup_commands=setup_commands, + ray_start_commands=ray_start_commands, + process_runner=_runner, + runtime_hash=runtime_hash, + file_mounts_contents_hash=file_mounts_contents_hash, + is_head_node=True, + node_resources=head_node_resources, + node_labels=head_node_labels, + rsync_options={ + "rsync_exclude": config.get("rsync_exclude"), + "rsync_filter": config.get("rsync_filter"), + }, + docker_config=config.get("docker"), + restart_only=restart_only, + ) + updater.start() + updater.join() + + # Refresh the node cache so we see the external ip if available + provider.non_terminated_nodes(head_node_tags) + + if updater.exitcode != 0: + # todo: this does not follow the mockup and is not good enough + cli_logger.abort("Failed to setup head node.") + sys.exit(1) + + global_event_system.execute_callback( + CreateClusterEvent.cluster_booting_completed, + { + "head_node_id": head_node, + }, + ) + + monitor_str = "tail -n 100 -f /tmp/ray/session_latest/logs/monitor*" + if override_cluster_name: + modifiers = " --cluster-name={}".format(quote(override_cluster_name)) + else: + modifiers = "" + + cli_logger.newline() + with cli_logger.group("Useful commands:"): + printable_config_file = os.path.abspath(printable_config_file) + + cli_logger.print("To terminate the cluster:") + cli_logger.print(cf.bold(f" ray down {printable_config_file}{modifiers}")) + cli_logger.newline() + + 
# NOTE(review): reconstructed from a whitespace-mangled diff. This chunk opens
# inside get_or_create_head_node()'s "Useful commands:" cli_logger group; the
# exact spacing inside the displayed command strings could not be recovered
# from the mangled text — TODO confirm against the original file.
        cli_logger.print("To retrieve the IP address of the cluster head:")
        cli_logger.print(
            cf.bold(f" ray get-head-ip {printable_config_file}{modifiers}")
        )
        cli_logger.newline()

        cli_logger.print(
            "To port-forward the cluster's Ray Dashboard to the local machine:"
        )
        cli_logger.print(cf.bold(f" ray dashboard {printable_config_file}{modifiers}"))
        cli_logger.newline()

        cli_logger.print(
            "To submit a job to the cluster, port-forward the "
            "Ray Dashboard in another terminal and run:"
        )
        cli_logger.print(
            cf.bold(
                " ray job submit --address http://localhost: "
                "--working-dir . -- python my_script.py"
            )
        )
        cli_logger.newline()

        cli_logger.print("To connect to a terminal on the cluster head for debugging:")
        cli_logger.print(cf.bold(f" ray attach {printable_config_file}{modifiers}"))
        cli_logger.newline()

        cli_logger.print("To monitor autoscaling:")
        cli_logger.print(
            cf.bold(
                f" ray exec {printable_config_file}{modifiers} {quote(monitor_str)}"
            )
        )
        cli_logger.newline()


def _should_create_new_head(
    head_node_id: Optional[str],
    new_launch_hash: str,
    new_head_node_type: str,
    provider: NodeProvider,
) -> bool:
    """Decides whether a new head node needs to be created.

    We need a new head if at least one of the following holds:
    (a) There isn't an existing head node
    (b) The user-submitted head node_config differs from the existing head
        node's node_config.
    (c) The user-submitted head node_type key differs from the existing head
        node's node_type.

    Args:
        head_node_id (Optional[str]): head node id if a head exists, else None
        new_launch_hash: hash of current user-submitted head config
        new_head_node_type: current user-submitted head node-type key

    Returns:
        bool: True if a new Ray head node should be launched, False otherwise
    """
    if not head_node_id:
        # No head node exists, need to create it.
        return True

    # Pull existing head's data.
    head_tags = provider.node_tags(head_node_id)
    current_launch_hash = head_tags.get(TAG_RAY_LAUNCH_CONFIG)
    current_head_type = head_tags.get(TAG_RAY_USER_NODE_TYPE)

    # Compare to current head: either a config-hash or node-type mismatch
    # means the running head no longer matches what the user asked for.
    hashes_mismatch = new_launch_hash != current_launch_hash
    types_mismatch = new_head_node_type != current_head_type

    new_head_required = hashes_mismatch or types_mismatch

    # Warn user so the relaunch isn't a surprise.
    if new_head_required:
        with cli_logger.group(
            "Currently running head node is out-of-date with cluster configuration"
        ):
            if hashes_mismatch:
                cli_logger.print(
                    "Current hash is {}, expected {}",
                    cf.bold(current_launch_hash),
                    cf.bold(new_launch_hash),
                )

            if types_mismatch:
                cli_logger.print(
                    "Current head node type is {}, expected {}",
                    cf.bold(current_head_type),
                    cf.bold(new_head_node_type),
                )

    return new_head_required


# NOTE(review): _set_up_config_for_head_node()'s body is cut here by the diff
# mangling; it resumes (remote_config.pop(...)) in the next chunk.
def _set_up_config_for_head_node(
    config: Dict[str, Any], provider: NodeProvider, no_restart: bool
) -> Tuple[Dict[str, Any], Any]:
    """Prepares autoscaling config and, if needed, ssh key, to be mounted onto
    the Ray head node for use by the autoscaler.

    Returns the modified config and the temporary config file that will be
    mounted onto the head node.
    """
    # Rewrite the auth config so that the head
    # node can update the workers
    remote_config = copy.deepcopy(config)

    # drop proxy options if they exist, otherwise
    # head node won't be able to connect to workers
    remote_config["auth"].pop("ssh_proxy_command", None)

    # Drop the head_node field if it was introduced. It is technically not a
    # valid field in the config, but it may have been introduced after
    # validation (see _bootstrap_config() call to
    # provider_cls.bootstrap_config(config)). The head node will never try to
    # launch a head node so it doesn't need these defaults.
+ remote_config.pop("head_node", None) + + if "ssh_private_key" in config["auth"]: + remote_key_path = "~/ray_bootstrap_key.pem" + remote_config["auth"]["ssh_private_key"] = remote_key_path + + # Adjust for new file locations + new_mounts = {} + for remote_path in config["file_mounts"]: + new_mounts[remote_path] = remote_path + remote_config["file_mounts"] = new_mounts + remote_config["no_restart"] = no_restart + + remote_config = provider.prepare_for_head_node(remote_config) + + # Now inject the rewritten config and SSH key into the head node + remote_config_file = tempfile.NamedTemporaryFile("w", prefix="ray-bootstrap-") + remote_config_file.write(json.dumps(remote_config)) + remote_config_file.flush() + config["file_mounts"].update( + {"~/ray_bootstrap_config.yaml": remote_config_file.name} + ) + + if "ssh_private_key" in config["auth"]: + config["file_mounts"].update( + { + remote_key_path: config["auth"]["ssh_private_key"], + } + ) + + return config, remote_config_file + + +def attach_cluster( + config_file: str, + start: bool, + use_screen: bool, + use_tmux: bool, + override_cluster_name: Optional[str], + no_config_cache: bool = False, + new: bool = False, + port_forward: Optional[Port_forward] = None, +) -> None: + """Attaches to a screen for the specified cluster. 
+ + Arguments: + config_file: path to the cluster yaml + start: whether to start the cluster if it isn't up + use_screen: whether to use screen as multiplexer + use_tmux: whether to use tmux as multiplexer + override_cluster_name: set the name of the cluster + new: whether to force a new screen + port_forward ( (int,int) or list[(int,int)] ): port(s) to forward + """ + + if use_tmux: + if new: + cmd = "tmux new" + else: + cmd = "tmux attach || tmux new" + elif use_screen: + if new: + cmd = "screen -L" + else: + cmd = "screen -L -xRR" + else: + if new: + raise ValueError("--new only makes sense if passing --screen or --tmux") + cmd = "$SHELL" + + exec_cluster( + config_file, + cmd=cmd, + run_env="auto", + screen=False, + tmux=False, + stop=False, + start=start, + override_cluster_name=override_cluster_name, + no_config_cache=no_config_cache, + port_forward=port_forward, + _allow_uninitialized_state=True, + ) + + +def exec_cluster( + config_file: str, + *, + cmd: Optional[str] = None, + run_env: str = "auto", + screen: bool = False, + tmux: bool = False, + stop: bool = False, + start: bool = False, + override_cluster_name: Optional[str] = None, + no_config_cache: bool = False, + port_forward: Optional[Port_forward] = None, + with_output: bool = False, + _allow_uninitialized_state: bool = False, + extra_screen_args: Optional[str] = None, +) -> str: + """Runs a command on the specified cluster. + + Arguments: + config_file: path to the cluster yaml + cmd: command to run + run_env: whether to run the command on the host or in a container. 
+ Select between "auto", "host" and "docker" + screen: whether to run in a screen + extra_screen_args: optional custom additional args to screen command + tmux: whether to run in a tmux session + stop: whether to stop the cluster after command run + start: whether to start the cluster if it isn't up + override_cluster_name: set the name of the cluster + port_forward ( (int, int) or list[(int, int)] ): port(s) to forward + _allow_uninitialized_state: whether to execute on an uninitialized head + node. + """ + assert not (screen and tmux), "Can specify only one of `screen` or `tmux`." + assert run_env in RUN_ENV_TYPES, "--run_env must be in {}".format(RUN_ENV_TYPES) + # TODO(rliaw): We default this to True to maintain backwards-compat. + # In the future we would want to support disabling login-shells + # and interactivity. + cmd_output_util.set_allow_interactive(True) + + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + config = _bootstrap_config(config, no_config_cache=no_config_cache) + + head_node = _get_running_head_node( + config, + config_file, + override_cluster_name, + create_if_needed=start, + _allow_uninitialized_state=_allow_uninitialized_state, + ) + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + updater = NodeUpdaterThread( + node_id=head_node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + ray_start_commands=[], + runtime_hash="", + file_mounts_contents_hash="", + is_head_node=True, + rsync_options={ + "rsync_exclude": config.get("rsync_exclude"), + "rsync_filter": config.get("rsync_filter"), + }, + docker_config=config.get("docker"), + ) + if cmd and stop: + cmd = "; ".join( + [ + cmd, + "ray stop", + "ray teardown ~/ray_bootstrap_config.yaml --yes 
--workers-only", + "sudo shutdown -h now", + ] + ) + + result = _exec( + updater, + cmd, + screen, + tmux, + port_forward=port_forward, + with_output=with_output, + run_env=run_env, + shutdown_after_run=False, + extra_screen_args=extra_screen_args, + ) + if tmux or screen: + attach_command_parts = ["ray attach", config_file] + if override_cluster_name is not None: + attach_command_parts.append( + "--cluster-name={}".format(override_cluster_name) + ) + if tmux: + attach_command_parts.append("--tmux") + elif screen: + attach_command_parts.append("--screen") + + attach_command = " ".join(attach_command_parts) + cli_logger.print("Run `{}` to check command status.", cf.bold(attach_command)) + return result + + +def _exec( + updater: NodeUpdaterThread, + cmd: Optional[str] = None, + screen: bool = False, + tmux: bool = False, + port_forward: Optional[Port_forward] = None, + with_output: bool = False, + run_env: str = "auto", + shutdown_after_run: bool = False, + extra_screen_args: Optional[str] = None, +) -> str: + if cmd: + if screen: + wrapped_cmd = [ + "screen", + "-L", + "-dm", + ] + + if extra_screen_args is not None and len(extra_screen_args) > 0: + wrapped_cmd += [extra_screen_args] + + wrapped_cmd += [ + "bash", + "-c", + quote(cmd + "; exec bash"), + ] + cmd = " ".join(wrapped_cmd) + elif tmux: + # TODO: Consider providing named session functionality + wrapped_cmd = [ + "tmux", + "new", + "-d", + "bash", + "-c", + quote(cmd + "; exec bash"), + ] + cmd = " ".join(wrapped_cmd) + return updater.cmd_runner.run( + cmd, + exit_on_fail=True, + port_forward=port_forward, + with_output=with_output, + run_env=run_env, + shutdown_after_run=shutdown_after_run, + ) + + +def rsync( + config_file: str, + source: Optional[str], + target: Optional[str], + override_cluster_name: Optional[str], + down: bool, + ip_address: Optional[str] = None, + use_internal_ip: bool = False, + no_config_cache: bool = False, + all_nodes: bool = False, + should_bootstrap: bool = True, + _runner: 
ModuleType = subprocess, +) -> None: + """Rsyncs files. + + Arguments: + config_file: path to the cluster yaml + source: source dir + target: target dir + override_cluster_name: set the name of the cluster + down: whether we're syncing remote -> local + ip_address: Address of node. Raise Exception + if both ip_address and 'all_nodes' are provided. + use_internal_ip: Whether the provided ip_address is + public or private. + all_nodes: whether to sync worker nodes in addition to the head node + should_bootstrap: whether to bootstrap cluster config before syncing + """ + if bool(source) != bool(target): + cli_logger.abort("Expected either both a source and a target, or neither.") + + assert bool(source) == bool( + target + ), "Must either provide both or neither source and target." + + if ip_address and all_nodes: + cli_logger.abort("Cannot provide both ip_address and 'all_nodes'.") + + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + if should_bootstrap: + config = _bootstrap_config(config, no_config_cache=no_config_cache) + + is_file_mount = False + if source and target: + for remote_mount in config.get("file_mounts", {}).keys(): + if (source if down else target).startswith(remote_mount): + is_file_mount = True + break + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + + def rsync_to_node(node_id, is_head_node): + updater = NodeUpdaterThread( + node_id=node_id, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + ray_start_commands=[], + runtime_hash="", + use_internal_ip=use_internal_ip, + process_runner=_runner, + file_mounts_contents_hash="", + is_head_node=is_head_node, + rsync_options={ + "rsync_exclude": config.get("rsync_exclude"), + "rsync_filter": config.get("rsync_filter"), + 
}, + docker_config=config.get("docker"), + ) + if down: + rsync = updater.rsync_down + else: + rsync = updater.rsync_up + + if source and target: + # print rsync progress for single file rsync + if cli_logger.verbosity > 0: + cmd_output_util.set_output_redirected(False) + set_rsync_silent(False) + rsync(source, target, is_file_mount) + else: + updater.sync_file_mounts(rsync) + + nodes = [] + head_node = _get_running_head_node( + config, config_file, override_cluster_name, create_if_needed=False + ) + if ip_address: + nodes = [provider.get_node_id(ip_address, use_internal_ip=use_internal_ip)] + else: + nodes = [head_node] + if all_nodes: + nodes.extend(_get_worker_nodes(config, override_cluster_name)) + + for node_id in nodes: + rsync_to_node(node_id, is_head_node=(node_id == head_node)) + + +def get_head_node_ip( + config_file: str, override_cluster_name: Optional[str] = None +) -> str: + """Returns head node IP for given configuration file if exists.""" + + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + head_node = _get_running_head_node(config, config_file, override_cluster_name) + provider_cfg = config.get("provider", {}) + # Get internal IP if using internal IPs and + # use_external_head_ip is not specified + if provider_cfg.get("use_internal_ips", False) and not provider_cfg.get( + "use_external_head_ip", False + ): + head_node_ip = provider.internal_ip(head_node) + else: + head_node_ip = provider.external_ip(head_node) + + return head_node_ip + + +def get_worker_node_ips( + config_file: str, override_cluster_name: Optional[str] = None +) -> List[str]: + """Returns worker node IPs for given configuration file.""" + + config = yaml.safe_load(open(config_file).read()) + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + + provider = 
_get_node_provider(config["provider"], config["cluster_name"]) + nodes = provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) + + if config.get("provider", {}).get("use_internal_ips", False): + return [provider.internal_ip(node) for node in nodes] + else: + return [provider.external_ip(node) for node in nodes] + + +def _get_worker_nodes( + config: Dict[str, Any], override_cluster_name: Optional[str] +) -> List[str]: + """Returns worker node ids for given configuration.""" + # todo: technically could be reused in get_worker_node_ips + if override_cluster_name is not None: + config["cluster_name"] = override_cluster_name + + provider = _get_node_provider(config["provider"], config["cluster_name"]) + return provider.non_terminated_nodes({TAG_RAY_NODE_KIND: NODE_KIND_WORKER}) + + +def _get_running_head_node( + config: Dict[str, Any], + printable_config_file: str, + override_cluster_name: Optional[str], + create_if_needed: bool = False, + _provider: Optional[NodeProvider] = None, + _allow_uninitialized_state: bool = False, +) -> str: + """Get a valid, running head node. + Args: + config (Dict[str, Any]): Cluster Config dictionary + printable_config_file: Used for printing formatted CLI commands. + override_cluster_name: Passed to `get_or_create_head_node` to + override the cluster name present in `config`. + create_if_needed: Create a head node if one is not present. + _provider: [For testing], a Node Provider to use. + _allow_uninitialized_state: Whether to return a head node that + is not 'UP TO DATE'. This is used to allow `ray attach` and + `ray exec` to debug a cluster in a bad state. 
+ + """ + provider = _provider or _get_node_provider( + config["provider"], config["cluster_name"] + ) + head_node_tags = { + TAG_RAY_NODE_KIND: NODE_KIND_HEAD, + } + nodes = provider.non_terminated_nodes(head_node_tags) + head_node = None + _backup_head_node = None + for node in nodes: + node_state = provider.node_tags(node).get(TAG_RAY_NODE_STATUS) + if node_state == STATUS_UP_TO_DATE: + head_node = node + else: + _backup_head_node = node + cli_logger.warning(f"Head node ({node}) is in state {node_state}.") + + if head_node is not None: + return head_node + elif create_if_needed: + get_or_create_head_node( + config, + printable_config_file=printable_config_file, + restart_only=False, + no_restart=False, + yes=True, + override_cluster_name=override_cluster_name, + ) + # NOTE: `_allow_uninitialized_state` is forced to False if + # `create_if_needed` is set to True. This is to ensure that the + # commands executed after creation occur on an actually running + # cluster. + return _get_running_head_node( + config, + printable_config_file, + override_cluster_name, + create_if_needed=False, + _allow_uninitialized_state=False, + ) + else: + if _allow_uninitialized_state and _backup_head_node is not None: + cli_logger.warning( + f"The head node being returned: {_backup_head_node} is not " + "`up-to-date`. If you are not debugging a startup issue " + "it is recommended to restart this head node with: {}", + cf.bold(f" ray down {printable_config_file}"), + ) + + return _backup_head_node + raise RuntimeError( + "Head node of cluster ({}) not found!".format(config["cluster_name"]) + ) + + +def get_local_dump_archive( + stream: bool = False, + output: Optional[str] = None, + logs: bool = True, + debug_state: bool = True, + pip: bool = True, + processes: bool = True, + processes_verbose: bool = False, + tempfile: Optional[str] = None, +) -> Optional[str]: + if stream and output: + raise ValueError( + "You can only use either `--output` or `--stream`, but not both." 
+ ) + + parameters = GetParameters( + logs=logs, + debug_state=debug_state, + pip=pip, + processes=processes, + processes_verbose=processes_verbose, + ) + + with Archive(file=tempfile) as archive: + get_all_local_data(archive, parameters) + + tmp = archive.file + + if stream: + with open(tmp, "rb") as fp: + os.write(1, fp.read()) + os.remove(tmp) + return None + + target = output or os.path.join(os.getcwd(), os.path.basename(tmp)) + shutil.move(tmp, target) + cli_logger.print(f"Created local data archive at {target}") + + return target + + +def get_cluster_dump_archive( + cluster_config_file: Optional[str] = None, + host: Optional[str] = None, + ssh_user: Optional[str] = None, + ssh_key: Optional[str] = None, + docker: Optional[str] = None, + local: Optional[bool] = None, + output: Optional[str] = None, + logs: bool = True, + debug_state: bool = True, + pip: bool = True, + processes: bool = True, + processes_verbose: bool = False, + tempfile: Optional[str] = None, +) -> Optional[str]: + # Inform the user what kind of logs are collected (before actually + # collecting, so they can abort) + content_str = "" + if logs: + content_str += ( + " - The logfiles of your Ray session\n" + " This usually includes Python outputs (stdout/stderr)\n" + ) + + if debug_state: + content_str += ( + " - Debug state information on your Ray cluster \n" + " e.g. number of workers, drivers, objects, etc.\n" + ) + + if pip: + content_str += " - Your installed Python packages (`pip freeze`)\n" + + if processes: + content_str += ( + " - Information on your running Ray processes\n" + " This includes command line arguments\n" + ) + + cli_logger.warning( + "You are about to create a cluster dump. This will collect data from " + "cluster nodes.\n\n" + "The dump will contain this information:\n\n" + f"{content_str}\n" + f"If you are concerned about leaking private information, extract " + f"the archive and inspect its contents before sharing it with " + f"anyone." + ) + + # Parse arguments (e.g. 
fetch info from cluster config) + ( + cluster_config_file, + hosts, + ssh_user, + ssh_key, + docker, + cluster_name, + ) = _info_from_params(cluster_config_file, host, ssh_user, ssh_key, docker) + + nodes = [ + Node(host=h, ssh_user=ssh_user, ssh_key=ssh_key, docker_container=docker) + for h in hosts + ] + + if not nodes: + cli_logger.error( + "No nodes found. Specify with `--host` or by passing a ray " + "cluster config to `--cluster`." + ) + return None + + if cluster_config_file: + nodes[0].is_head = True + + if local is None: + # If called with a cluster config, this was probably started + # from a laptop + local = not bool(cluster_config_file) + + parameters = GetParameters( + logs=logs, + debug_state=debug_state, + pip=pip, + processes=processes, + processes_verbose=processes_verbose, + ) + + with Archive(file=tempfile) as archive: + if local: + create_archive_for_local_and_remote_nodes( + archive, remote_nodes=nodes, parameters=parameters + ) + else: + create_archive_for_remote_nodes( + archive, remote_nodes=nodes, parameters=parameters + ) + + if not output: + if cluster_name: + filename = ( + f"{cluster_name}_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz" + ) + else: + filename = ( + f"collected_logs_" f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}.tar.gz" + ) + output = os.path.join(os.getcwd(), filename) + else: + output = os.path.expanduser(output) + + shutil.move(archive.file, output) + return output + + +def confirm(msg: str, yes: bool) -> Optional[bool]: + return None if yes else click.confirm(msg, abort=True) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..b515257f45c06f58139ffb7bfd9f926d4b67f8ec --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/constants.py @@ -0,0 +1,140 @@ +import os +import sys + +from ray._private.ray_constants 
import ( # noqa F401 + AUTOSCALER_RESOURCE_REQUEST_CHANNEL, + DEFAULT_OBJECT_STORE_MEMORY_PROPORTION, + LABELS_ENVIRONMENT_VARIABLE, + LOGGER_FORMAT, + RESOURCES_ENVIRONMENT_VARIABLE, +) + + +def env_integer(key, default): + if key in os.environ: + val = os.environ[key] + if val == "inf": + return sys.maxsize + else: + return int(val) + return default + + +# Whether autoscaler cluster status logging is enabled. Set to 0 disable. +AUTOSCALER_STATUS_LOG = env_integer("RAY_ENABLE_CLUSTER_STATUS_LOG", 1) + +# The name of the environment variable for plugging in a utilization scorer. +AUTOSCALER_UTILIZATION_SCORER_KEY = "RAY_AUTOSCALER_UTILIZATION_SCORER" + +# Whether to avoid launching GPU nodes for CPU only tasks. +AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1) + +# How long to wait for a node to start and terminate, in seconds. +AUTOSCALER_NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900) +AUTOSCALER_NODE_TERMINATE_WAIT_S = env_integer("AUTOSCALER_NODE_TERMINATE_WAIT_S", 900) + +# Interval at which to check if node SSH became available. +AUTOSCALER_NODE_SSH_INTERVAL_S = env_integer("AUTOSCALER_NODE_SSH_INTERVAL_S", 5) + +# Abort autoscaling if more than this number of errors are encountered. This +# is a safety feature to prevent e.g. runaway node launches. +AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5) + +# The maximum number of nodes to launch in a single request. +# Multiple requests may be made for this batch size, up to +# the limit of AUTOSCALER_MAX_CONCURRENT_LAUNCHES. +AUTOSCALER_MAX_LAUNCH_BATCH = env_integer("AUTOSCALER_MAX_LAUNCH_BATCH", 5) + +# Max number of nodes to launch at a time. +AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer( + "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10 +) + +# Default upscaling speed for the autoscaler. This specifies how many nodes +# to request at a time, where the desired number to upscale is +# min(1, upscaling_speed * current_num_nodes) +# e.g. 
1.0 means to request enough nodes to double +# the cluster size in each round of requests. +# When the upscaling speed is 0.0, the autoscaler will request 1 node. +DEFAULT_UPSCALING_SPEED = 0.0 + +# Interval at which to perform autoscaling updates. +AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5) + +# The autoscaler will attempt to restart Ray on nodes it hasn't heard from +# in more than this interval. +AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30) +# The maximum number of nodes (including failed nodes) that the autoscaler will +# track for logging purposes. +AUTOSCALER_MAX_NODES_TRACKED = 1500 + +AUTOSCALER_MAX_FAILURES_DISPLAYED = 20 + +AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S = env_integer( + "AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S", 30 * 60 +) + +AUTOSCALER_REPORT_PER_NODE_STATUS = ( + env_integer("AUTOSCALER_REPORT_PER_NODE_STATUS", 1) == 1 +) + +# The maximum allowed resource demand vector size to guarantee the resource +# demand scheduler bin packing algorithm takes a reasonable amount of time +# to run. +AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000 + +# Port that autoscaler prometheus metrics will be exported to +AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217) + +# Max number of retries to AWS (default is 5, time increases exponentially) +BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12) +# Max number of retries to create an EC2 node (retry different subnet) +BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5) + +# ray home path in the container image +RAY_HOME = "/home/ray" + +# The order of this list matters! `scripts.py` kills the ray processes in order of this +# list. Think twice when you add to this list. +# Invariants: +# RAYLET must be the first in the list. +# GCS SERVER must be the last in the list. +RAY_PROCESSES = [ + # The first element is the substring to filter. 
+ # The second element, if True, is to filter ps results by command name + # (only the first 15 charactors of the executable name on Linux); + # if False, is to filter ps results by command with all its arguments. + # See STANDARD FORMAT SPECIFIERS section of + # http://man7.org/linux/man-pages/man1/ps.1.html + # about comm and args. This can help avoid killing non-ray processes. + # Format: + # Keyword to filter, filter by command (True)/filter by args (False) + ["raylet", True], + ["plasma_store", True], + ["monitor.py", False], + ["ray.util.client.server", False], + ["default_worker.py", False], # Python worker. + ["setup_worker.py", False], # Python environment setup worker. + # For mac osx, setproctitle doesn't change the process name returned + # by psutil but only cmdline. + [ + "ray::", + sys.platform != "darwin", + ], # Python worker. TODO(mehrdadn): Fix for Windows + ["io.ray.runtime.runner.worker.DefaultWorker", False], # Java worker. + ["log_monitor.py", False], + ["reporter.py", False], + [os.path.join("dashboard", "agent.py"), False], + [os.path.join("dashboard", "dashboard.py"), False], + [os.path.join("runtime_env", "agent", "main.py"), False], + ["ray_process_reaper.py", False], + ["gcs_server", True], +] + +# Max Concurrent SSH Calls to stop Docker +MAX_PARALLEL_SHUTDOWN_WORKERS = env_integer("MAX_PARALLEL_SHUTDOWN_WORKERS", 50) + +DISABLE_NODE_UPDATERS_KEY = "disable_node_updaters" +DISABLE_LAUNCH_CONFIG_CHECK_KEY = "disable_launch_config_check" +FOREGROUND_NODE_LAUNCH_KEY = "foreground_node_launch" +WORKER_LIVENESS_CHECK_KEY = "worker_liveness_check" diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py new file mode 100644 index 0000000000000000000000000000000000000000..116ad69610f9dd8d8ef35e6ceb636bd11c80200f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/docker.py @@ -0,0 +1,129 @@ +from pathlib import Path +from 
typing import Any, Dict + +from ray.autoscaler._private.cli_logger import cli_logger + +try: # py3 + from shlex import quote +except ImportError: # py2 + from pipes import quote + + +def _check_docker_file_mounts(file_mounts: Dict[str, str]) -> None: + """Checks if files are passed as file_mounts. This is a problem for Docker + based clusters because when a file is bind-mounted in Docker, updates to + the file on the host do not always propagate to the container. Using + directories is recommended. + """ + for remote, local in file_mounts.items(): + if Path(local).is_file(): + cli_logger.warning( + f"File Mount: ({remote}:{local}) refers to a file.\n To ensure" + " this mount updates properly, please use a directory." + ) + + +def validate_docker_config(config: Dict[str, Any]) -> None: + """Checks whether the Docker configuration is valid.""" + if "docker" not in config: + return + + _check_docker_file_mounts(config.get("file_mounts", {})) + + docker_image = config["docker"].get("image") + cname = config["docker"].get("container_name") + + head_docker_image = config["docker"].get("head_image", docker_image) + + worker_docker_image = config["docker"].get("worker_image", docker_image) + + image_present = docker_image or (head_docker_image and worker_docker_image) + if (not cname) and (not image_present): + return + else: + assert cname and image_present, "Must provide a container & image name" + + return None + + +def with_docker_exec( + cmds, container_name, docker_cmd, env_vars=None, with_interactive=False +): + assert docker_cmd, "Must provide docker command" + env_str = "" + if env_vars: + env_str = " ".join(["-e {env}=${env}".format(env=env) for env in env_vars]) + return [ + "docker exec {interactive} {env} {container} /bin/bash -c {cmd} ".format( + interactive="-it" if with_interactive else "", + env=env_str, + container=container_name, + cmd=quote(cmd), + ) + for cmd in cmds + ] + + +def _check_helper(cname, template, docker_cmd): + return " ".join( + 
[docker_cmd, "inspect", "-f", "'{{" + template + "}}'", cname, "||", "true"] + ) + + +def check_docker_running_cmd(cname, docker_cmd): + return _check_helper(cname, ".State.Running", docker_cmd) + + +def check_bind_mounts_cmd(cname, docker_cmd): + return _check_helper(cname, "json .Mounts", docker_cmd) + + +def check_docker_image(cname, docker_cmd): + return _check_helper(cname, ".Config.Image", docker_cmd) + + +def docker_start_cmds( + user, + image, + mount_dict, + container_name, + user_options, + cluster_name, + home_directory, + docker_cmd, +): + # Imported here due to circular dependency. + from ray.autoscaler.sdk import get_docker_host_mount_location + + docker_mount_prefix = get_docker_host_mount_location(cluster_name) + mount = {f"{docker_mount_prefix}/{dst}": dst for dst in mount_dict} + + mount_flags = " ".join( + [ + "-v {src}:{dest}".format(src=k, dest=v.replace("~/", home_directory + "/")) + for k, v in mount.items() + ] + ) + + # for click, used in ray cli + env_vars = {"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"} + env_flags = " ".join( + ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()] + ) + + user_options_str = " ".join(user_options) + docker_run = [ + docker_cmd, + "run", + "--rm", + "--name {}".format(container_name), + "-d", + "-it", + mount_flags, + env_flags, + user_options_str, + "--net=host", + image, + "bash", + ] + return " ".join(docker_run) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8d3494d191b7b5908b84ccf3481e273ec4e129 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_summarizer.py @@ -0,0 +1,75 @@ +import time +from threading import RLock +from typing import Any, Callable, Dict, List + + +class EventSummarizer: + """Utility that aggregates related log messages to reduce log spam.""" + 
+ def __init__(self): + self.events_by_key: Dict[str, int] = {} + # Messages to send in next summary batch. + self.messages_to_send: List[str] = [] + # Tracks TTL of messages. A message will not be re-sent once it is + # added here, until its TTL expires. + self.throttled_messages: Dict[str, float] = {} + + # Event summarizer is used by the main thread and + # by node launcher child threads. + self.lock = RLock() + + def add( + self, template: str, *, quantity: Any, aggregate: Callable[[Any, Any], Any] + ) -> None: + """Add a log message, which will be combined by template. + + Args: + template: Format string with one placeholder for quantity. + quantity: Quantity to aggregate. + aggregate: Aggregation function used to combine the + quantities. The result is inserted into the template to + produce the final log message. + """ + with self.lock: + # Enforce proper sentence structure. + if not template.endswith("."): + template += "." + if template in self.events_by_key: + self.events_by_key[template] = aggregate( + self.events_by_key[template], quantity + ) + else: + self.events_by_key[template] = quantity + + def add_once_per_interval(self, message: str, key: str, interval_s: int): + """Add a log message, which is throttled once per interval by a key. + + Args: + message: The message to log. + key: The key to use to deduplicate the message. + interval_s: Throttling interval in seconds. 
+ """ + with self.lock: + if key not in self.throttled_messages: + self.throttled_messages[key] = time.time() + interval_s + self.messages_to_send.append(message) + + def summary(self) -> List[str]: + """Generate the aggregated log summary of all added events.""" + with self.lock: + out = [] + for template, quantity in self.events_by_key.items(): + out.append(template.format(quantity)) + out.extend(self.messages_to_send) + return out + + def clear(self) -> None: + """Clear the events added.""" + with self.lock: + self.events_by_key.clear() + self.messages_to_send.clear() + # Expire any messages that have reached their TTL. This allows them + # to be sent again. + for k, t in list(self.throttled_messages.items()): + if time.time() > t: + del self.throttled_messages[k] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py new file mode 100644 index 0000000000000000000000000000000000000000..db4bfc1ce101e3558a4996ef7fb94e64aee1fbb9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/event_system.py @@ -0,0 +1,106 @@ +from enum import Enum, auto +from typing import Any, Callable, Dict, List, Optional, Union + +from ray.autoscaler._private.cli_logger import cli_logger + + +class CreateClusterEvent(Enum): + """Events to track in ray.autoscaler.sdk.create_or_update_cluster. + + Attributes: + up_started : Invoked at the beginning of create_or_update_cluster. + ssh_keypair_downloaded : Invoked when the ssh keypair is downloaded. + cluster_booting_started : Invoked when when the cluster booting starts. + acquiring_new_head_node : Invoked before the head node is acquired. + head_node_acquired : Invoked after the head node is acquired. + ssh_control_acquired : Invoked when the node is being updated. + run_initialization_cmd : Invoked before all initialization + commands are called and again before each initialization command. 
+ run_setup_cmd : Invoked before all setup commands are + called and again before each setup command. + start_ray_runtime : Invoked before ray start commands are run. + start_ray_runtime_completed : Invoked after ray start commands + are run. + cluster_booting_completed : Invoked after cluster booting + is completed. + """ + + up_started = auto() + ssh_keypair_downloaded = auto() + cluster_booting_started = auto() + acquiring_new_head_node = auto() + head_node_acquired = auto() + ssh_control_acquired = auto() + run_initialization_cmd = auto() + run_setup_cmd = auto() + start_ray_runtime = auto() + start_ray_runtime_completed = auto() + cluster_booting_completed = auto() + + +class _EventSystem: + """Event system that handles storing and calling callbacks for events. + + Attributes: + callback_map (Dict[str, List[Callable]]) : Stores list of callbacks + for events when registered. + """ + + def __init__(self): + self.callback_map = {} + + def add_callback_handler( + self, + event: str, + callback: Union[Callable[[Dict], None], List[Callable[[Dict], None]]], + ): + """Stores callback handler for event. + + Args: + event: Event that callback should be called on. See + CreateClusterEvent for details on the events available to be + registered against. + callback (Callable[[Dict], None]): Callable object that is invoked + when specified event occurs. + """ + if event not in CreateClusterEvent.__members__.values(): + cli_logger.warning( + f"{event} is not currently tracked, and this" + " callback will not be invoked." + ) + + self.callback_map.setdefault(event, []).extend( + [callback] if type(callback) is not list else callback + ) + + def execute_callback( + self, event: CreateClusterEvent, event_data: Optional[Dict[str, Any]] = None + ): + """Executes all callbacks for event. + + Args: + event: Event that is invoked. See CreateClusterEvent + for details on the available events. 
+ event_data (Dict[str, Any]): Argument that is passed to each + callable object stored for this particular event. + """ + if event_data is None: + event_data = {} + + event_data["event_name"] = event + if event in self.callback_map: + for callback in self.callback_map[event]: + callback(event_data) + + def clear_callbacks_for_event(self, event: str): + """Clears stored callable objects for event. + + Args: + event: Event that has callable objects stored in map. + See CreateClusterEvent for details on the available events. + """ + if event in self.callback_map: + del self.callback_map[event] + + +global_event_system = _EventSystem() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..e1444eef7ee1b1196de9c2e6b6a2699b4a60eb9c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/command_runner.py @@ -0,0 +1,91 @@ +import os +import subprocess +from typing import Dict, List, Tuple + +from ray.autoscaler._private.docker import with_docker_exec +from ray.autoscaler.command_runner import CommandRunnerInterface + + +class FakeDockerCommandRunner(CommandRunnerInterface): + """Command runner for the fke docker multinode cluster. + + This command runner uses ``docker exec`` and ``docker cp`` to + run commands and copy files, respectively. + + The regular ``DockerCommandRunner`` is made for use in SSH settings + where Docker runs on a remote hose. In contrast, this command runner + does not wrap the docker commands in ssh calls. 
+ """ + + def __init__(self, docker_config, **common_args): + self.container_name = docker_config["container_name"] + self.docker_config = docker_config + self.home_dir = None + self.initialized = False + # Optionally use 'podman' instead of 'docker' + use_podman = docker_config.get("use_podman", False) + self.docker_cmd = "podman" if use_podman else "docker" + + def _run_shell(self, cmd: str, timeout: int = 120) -> str: + return subprocess.check_output( + cmd, shell=True, timeout=timeout, encoding="utf-8" + ) + + def run( + self, + cmd: str = None, + timeout: int = 120, + exit_on_fail: bool = False, + port_forward: List[Tuple[int, int]] = None, + with_output: bool = False, + environment_variables: Dict[str, object] = None, + run_env: str = "auto", + ssh_options_override_ssh_key: str = "", + shutdown_after_run: bool = False, + ) -> str: + prefix = with_docker_exec( + [cmd], + container_name=self.container_name, + with_interactive=False, + docker_cmd=self.docker_cmd, + )[0] + return self._run_shell(prefix) + + def run_init( + self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool + ): + pass + + def remote_shell_command_str(self): + return "{} exec -it {} bash".format(self.docker_cmd, self.container_name) + + def run_rsync_down(self, source, target, options=None): + docker_dir = os.path.dirname(self._docker_expand_user(source)) + + self._run_shell(f"docker cp {self.container_name}:{docker_dir} {target}") + + def run_rsync_up(self, source, target, options=None): + docker_dir = os.path.dirname(self._docker_expand_user(target)) + self.run(cmd=f"mkdir -p {docker_dir}") + + self._run_shell(f"docker cp {source} {self.container_name}:{docker_dir}") + + def _docker_expand_user(self, string, any_char=False): + user_pos = string.find("~") + if user_pos > -1: + if self.home_dir is None: + self.home_dir = self._run_shell( + with_docker_exec( + ["printenv HOME"], + container_name=self.container_name, + docker_cmd=self.docker_cmd, + ) + ).strip() + + if any_char: 
+ return string.replace("~/", self.home_dir + "/") + + elif not any_char and user_pos == 0: + return string.replace("~", self.home_dir, 1) + + return string diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..14c5e4646cf4625677259cf82c218de70c5982f6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/fake_multi_node/docker_monitor.py @@ -0,0 +1,246 @@ +"""Fake multinode docker monitoring script. + +This script is the "docker compose server" for the fake_multinode +provider using Docker compose. It should be started before running +`RAY_FAKE_CLUSTER=1 ray up `. + +This script reads the volume directory from a supplied fake multinode +docker cluster config file. +It then waits until a docker-compose.yaml file is created in the same +directory, which is done by the `ray up` command. + +It then watches for changes in the docker-compose.yaml file and runs +`docker compose up` whenever changes are detected. This will start docker +containers as requested by the autoscaler. + +Generally, the docker-compose.yaml will be mounted in the head node of the +cluster, which will then continue to change it according to the autoscaler +requirements. + +Additionally, this script monitors the docker container status using +`docker status` and writes it into a `status.json`. This information is +again used by the autoscaler to determine if any nodes have died. 
+""" +import argparse +import json +import os +import shutil +import subprocess +import time +from typing import Any, Dict, List, Optional + +import yaml + + +def _read_yaml(path: str): + with open(path, "rt") as f: + return yaml.safe_load(f) + + +def _update_docker_compose( + docker_compose_path: str, project_name: str, status: Optional[Dict[str, Any]] +) -> bool: + docker_compose_config = _read_yaml(docker_compose_path) + + if not docker_compose_config: + print("Docker compose currently empty") + return False + + cmd = ["up", "-d"] + if status and len(status) > 0: + cmd += ["--no-recreate"] + + shutdown = False + if not docker_compose_config["services"]: + # If no more nodes, run `down` instead of `up` + print("Shutting down nodes") + cmd = ["down"] + shutdown = True + try: + subprocess.check_call( + ["docker", "compose", "-f", docker_compose_path, "-p", project_name] + + cmd + + [ + "--remove-orphans", + ] + ) + except Exception as e: + print(f"Ran into error when updating docker compose: {e}") + # Ignore error + + return shutdown + + +def _get_ip( + project_name: str, + container_name: str, + override_network: Optional[str] = None, + retry_times: int = 3, +) -> Optional[str]: + network = override_network or f"{project_name}_ray_local" + + cmd = [ + "docker", + "inspect", + "-f", + '"{{ .NetworkSettings.Networks' f".{network}.IPAddress" ' }}"', + f"{container_name}", + ] + for i in range(retry_times): + try: + ip_address = subprocess.check_output(cmd, encoding="utf-8") + except Exception: + time.sleep(1) + else: + return ip_address.strip().strip('"').strip('\\"') + return None + + +def _update_docker_status( + docker_compose_path: str, project_name: str, docker_status_path: str +): + data_str = "" + try: + data_str = ( + subprocess.check_output( + [ + "docker", + "compose", + "-f", + docker_compose_path, + "-p", + project_name, + "ps", + "--format", + "json", + ] + ) + .decode("utf-8") + .strip() + .split("\n") + ) + data: List[Dict[str, str]] = [] + for line in 
data_str: + line = line.strip() + if line: + data.append(json.loads(line)) + except Exception as e: + print(f"Ran into error when fetching status: {e}") + print(f"docker compose ps output: {data_str}") + return None + + status = {} + for container in data: + node_id = container["Service"] + container_name = container["Name"] + if container["State"] == "running": + ip = _get_ip(project_name, container_name) + else: + ip = "" + container["IP"] = ip + status[node_id] = container + + with open(docker_status_path, "wt") as f: + json.dump(status, f) + + return status + + +def monitor_docker( + docker_compose_path: str, + status_path: str, + project_name: str, + update_interval: float = 1.0, +): + while not os.path.exists(docker_compose_path): + # Wait until cluster is created + time.sleep(0.5) + + print("Docker compose config detected, starting status monitoring") + + # Make sure this is always writeable from inside the containers + os.chmod(docker_compose_path, 0o777) + + docker_config = {"force_update": True} + + # Force update + next_update = time.monotonic() - 1.0 + shutdown = False + status = None + + # Loop: + # If the config changed, update cluster. + # Every `update_interval` seconds, update docker status. 
+ while not shutdown: + new_docker_config = _read_yaml(docker_compose_path) + if new_docker_config != docker_config: + # Update cluster + shutdown = _update_docker_compose(docker_compose_path, project_name, status) + + # Force status update + next_update = time.monotonic() - 1.0 + + if time.monotonic() > next_update: + # Update docker status + status = _update_docker_status( + docker_compose_path, project_name, status_path + ) + next_update = time.monotonic() + update_interval + + docker_config = new_docker_config + time.sleep(0.1) + + print("Cluster shut down, terminating monitoring script.") + + +def start_monitor(config_file: str): + cluster_config = _read_yaml(config_file) + + provider_config = cluster_config["provider"] + assert provider_config["type"] == "fake_multinode_docker", ( + f"The docker monitor only works with providers of type " + f"`fake_multinode_docker`, got `{provider_config['type']}`" + ) + + project_name = provider_config["project_name"] + + volume_dir = provider_config["shared_volume_dir"] + os.makedirs(volume_dir, mode=0o755, exist_ok=True) + + # Create bootstrap config + bootstrap_config_path = os.path.join(volume_dir, "bootstrap_config.yaml") + shutil.copy(config_file, bootstrap_config_path) + + # These two files usually don't exist, yet + docker_compose_config_path = os.path.join(volume_dir, "docker-compose.yaml") + + docker_status_path = os.path.join(volume_dir, "status.json") + + if os.path.exists(docker_compose_config_path): + # We wait until this file exists, so remove it if it exists + # from a previous run. + os.remove(docker_compose_config_path) + + if os.path.exists(docker_status_path): + os.remove(docker_status_path) + # Create empty file so it can be mounted + with open(docker_status_path, "wt") as f: + f.write("{}") + + print( + f"Starting monitor process. 
logger = logging.getLogger(__name__)

DEFAULT_DOCKER_IMAGE = "rayproject/ray:nightly-py{major}{minor}-cpu"


class ResourcesNotReadyError(RuntimeError):
    """Raised while waiting for cluster resources that are not yet up."""

    pass


class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self._base_config_file = os.path.join(
            os.path.dirname(__file__), "example_docker.yaml"
        )
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(
            os.path.dirname(__file__), "docker_monitor.py"
        )
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        return self._config_file

    @property
    def cluster_config(self):
        return self._cluster_config

    @property
    def cluster_dir(self):
        return self._tempdir

    @property
    def gcs_port(self):
        return self._cluster_config.get("provider", {}).get(
            "host_gcs_port", FAKE_DOCKER_DEFAULT_GCS_PORT
        )

    @property
    def client_port(self):
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT
        )

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.

        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                # BUGFIX: disconnect before retrying -- ray.init was
                # already called successfully, and calling it a second
                # time without ray.shutdown() raises RuntimeError.
                ray.shutdown()
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            # Poll the queue until teardown sets the stop event.
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue

                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()

        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        Raises:
            ResourcesNotReadyError: If the resources did not become
                available within ``timeout`` seconds.
        """
        timeout = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > timeout:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}"
                )
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current
                config with.

        """
        assert self._tempdir, "Call setup() first"

        config = config or {}

        if config:
            self._partial_config = config

        if not config.get("provider", {}).get("image"):
            # No image specified, trying to parse from buildkite
            docker_image = os.environ.get("RAY_DOCKER_IMAGE", None)

            if not docker_image:
                # If still no docker image, use one according to Python version
                mj = sys.version_info.major
                mi = sys.version_info.minor

                docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi)

            self._docker_image = docker_image

        with open(self._base_config_file, "rt") as f:
            cluster_config = yaml.safe_load(f)

        if self._partial_config:
            deep_update(cluster_config, self._partial_config, new_keys_allowed=True)

        if self._docker_image:
            cluster_config["provider"]["image"] = self._docker_image

        cluster_config["provider"]["shared_volume_dir"] = self._tempdir

        self._cluster_config = cluster_config

        with open(self._config_file, "wt") as f:
            yaml.safe_dump(self._cluster_config, f)

        # Use the module logger (lazy formatting) rather than the root
        # logger for consistency with the rest of this module.
        logger.info("Updated cluster config to: %s", self._cluster_config)

    def maybe_pull_image(self):
        """Pull the configured docker image if it is not present locally."""
        if self._docker_image:
            try:
                images_str = subprocess.check_output(
                    f"docker image inspect {self._docker_image}", shell=True
                )
                images = json.loads(images_str)
            except Exception as e:
                logger.error(f"Error inspecting image {self._docker_image}: {e}")
                return

            if not images:
                try:
                    subprocess.check_call(
                        f"docker pull {self._docker_image}", shell=True
                    )
                except Exception as e:
                    logger.error(f"Error pulling image {self._docker_image}: {e}")

    def setup(self):
        """Setup docker compose cluster environment.

        Creates the temporary directory, writes the initial config file,
        and pulls the docker image, if required.
        """
        self._tempdir = tempfile.mkdtemp(dir=os.environ.get("RAY_TEMPDIR", None))
        os.chmod(self._tempdir, 0o777)
        self._config_file = os.path.join(self._tempdir, "cluster.yaml")
        self._nodes_file = os.path.join(self._tempdir, "nodes.json")
        self._status_file = os.path.join(self._tempdir, "status.json")
        self.update_config()
        self.maybe_pull_image()

    def teardown(self, keep_dir: bool = False):
        """Tear down docker compose cluster environment.

        Args:
            keep_dir: If True, cluster directory
                will not be removed after termination.
        """
        if not keep_dir:
            # Guard so a second teardown() call is a no-op instead of
            # passing None to shutil.rmtree.
            if self._tempdir:
                shutil.rmtree(self._tempdir)
        self._tempdir = None
        self._config_file = None

    def _start_monitor(self):
        # Launch the docker compose monitor script in a subprocess and
        # give it a moment to come up.
        self._monitor_process = subprocess.Popen(
            [sys.executable, self._monitor_script, self.config_file]
        )
        time.sleep(2)

    def _stop_monitor(self):
        if self._monitor_process:
            # BUGFIX: wait() raises TimeoutExpired when the process does
            # not exit in time; swallow it so we fall through to
            # terminate() instead of crashing teardown.
            try:
                self._monitor_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                pass
            if self._monitor_process.poll() is None:
                self._monitor_process.terminate()
            self._monitor_process = None

    def start(self):
        """Start docker compose cluster.

        Starts the monitor process and runs ``ray up``.
        """
        self._start_monitor()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True
        )

    def stop(self):
        """Stop docker compose cluster.

        Runs ``ray down`` and stops the monitor process.
        """
        # BUGFIX: `ray.is_initialized` is a function; without the call
        # parentheses the bound method is always truthy.
        if ray.is_initialized():
            ray.shutdown()

        subprocess.check_call(
            f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True
        )

        self._stop_monitor()
        self._execution_event.set()

    def _update_nodes(self):
        # Refresh the node map written by the fake multinode provider.
        with open(self._nodes_file, "rt") as f:
            self._nodes = json.load(f)

    def _update_status(self):
        # Refresh the container status written by the monitor script.
        with open(self._status_file, "rt") as f:
            self._status = json.load(f)

    def _get_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ) -> str:
        """Resolve a node id from one of `node_id`, `num`, or `rand`."""
        self._update_nodes()
        if node_id:
            assert (
                not num and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
        elif num is not None:
            # BUGFIX: test `is not None` so num=0 (the head node) is a
            # valid selector instead of silently falling through.
            assert (
                not node_id and not rand
            ), "Only provide either `node_id`, `num`, or `random`."
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (
                not node_id and not num
            ), "Only provide either `node_id`, `num`, or `random`."
            assert rand in [
                "worker",
                "any",
            ], "`random` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000"
                )
            # Else: any
            node_id = random.choice(choices)

        assert node_id in self._nodes, f"Node with ID {node_id} is not in active nodes."
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        """Return the docker container name for `node_id`, if running."""
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None

        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct node_id from this number, and kill
        that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a random
        node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_call(f"docker kill {container}", shell=True)


class RemoteAPI:
    """Remote API to control cluster state from within cluster tasks.

    This API uses a Ray queue to interact with an execution thread on the
    host machine that will execute commands passed to the queue.

    Instances of this class can be serialized and passed to Ray remote actors
    to interact with cluster state (but they can also be used outside actors).

    The API subset is limited to specific commands.

    Args:
        queue: Ray queue to push command instructions to.

    """

    # NOTE: the Queue annotation is quoted so it is not evaluated at class
    # definition time (it is a type-only name here).
    def __init__(self, queue: "Queue"):
        self._queue = queue

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        self._queue.put(("kill_node", dict(node_id=node_id, num=num, rand=rand)))
+""" + +logger = logging.getLogger(__name__) + + +def legacy_log_info_string(autoscaler, nodes): + tmp = "Cluster status: " + tmp += info_string(autoscaler, nodes) + tmp += "\n" + tmp += autoscaler.load_metrics.info_string() + tmp += "\n" + tmp += autoscaler.resource_demand_scheduler.debug_string( + nodes, + autoscaler.pending_launches.breakdown(), + autoscaler.load_metrics.get_resource_utilization(), + ) + if _internal_kv_initialized(): + _internal_kv_put(DEBUG_AUTOSCALING_STATUS_LEGACY, tmp, overwrite=True) + logger.debug(tmp) + + +def info_string(autoscaler, nodes): + suffix = "" + if autoscaler.updaters: + suffix += " ({} updating)".format(len(autoscaler.updaters)) + if autoscaler.num_failed_updates: + suffix += " ({} failed to update)".format(len(autoscaler.num_failed_updates)) + + return "{} nodes{}".format(len(nodes), suffix) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..07192084d89b4d83c9feeac17d6f3b56c4875766 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/load_metrics.py @@ -0,0 +1,375 @@ +import logging +import time +from collections import Counter +from functools import reduce +from typing import Dict, List + +from ray._private.gcs_utils import PlacementGroupTableData +from ray.autoscaler._private.constants import ( + AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE, + AUTOSCALER_REPORT_PER_NODE_STATUS, +) +from ray.autoscaler._private.util import ( + DictCount, + LoadMetricsSummary, + NodeIP, + ResourceDict, +) +from ray.core.generated.common_pb2 import PlacementStrategy + +logger = logging.getLogger(__name__) + + +def add_resources(dict1: Dict[str, float], dict2: Dict[str, float]) -> Dict[str, float]: + """Add the values in two dictionaries. + + Returns: + dict: A new dictionary (inputs remain unmodified). 
+ """ + new_dict = dict1.copy() + for k, v in dict2.items(): + new_dict[k] = v + new_dict.get(k, 0) + return new_dict + + +def freq_of_dicts(dicts: List[Dict], serializer=None, deserializer=dict) -> DictCount: + """Count a list of dictionaries (or unhashable types). + + This is somewhat annoying because mutable data structures aren't hashable, + and set/dict keys must be hashable. + + Args: + dicts (List[D]): A list of dictionaries to be counted. + serializer (D -> S): A custom serialization function. The output type S + must be hashable. The default serializer converts a dictionary into + a frozenset of KV pairs. + deserializer (S -> U): A custom deserialization function. See the + serializer for information about type S. For dictionaries U := D. + + Returns: + List[Tuple[U, int]]: Returns a list of tuples. Each entry in the list + is a tuple containing a unique entry from `dicts` and its + corresponding frequency count. + """ + if serializer is None: + serializer = lambda d: frozenset(d.items()) # noqa: E731 + + freqs = Counter(serializer(d) for d in dicts) + as_list = [] + for as_set, count in freqs.items(): + as_list.append((deserializer(as_set), count)) + return as_list + + +class LoadMetrics: + """Container for cluster load metrics. + + Metrics here are updated from raylet heartbeats. The autoscaler + queries these metrics to determine when to scale up, and which nodes + can be removed. + """ + + def __init__(self): + self.last_heartbeat_time_by_ip = {} + self.static_resources_by_ip = {} + self.dynamic_resources_by_ip = {} + self.raylet_id_by_ip = {} + self.waiting_bundles = [] + self.infeasible_bundles = [] + self.pending_placement_groups = [] + self.resource_requests = [] + self.cluster_full_of_actors_detected = False + self.ray_nodes_last_used_time_by_ip = {} + + def __bool__(self): + """A load metrics instance is Falsey iff the autoscaler process + has not received a resource message from the GCS. 
+ """ + return bool(self.raylet_id_by_ip) + + def update( + self, + ip: str, + raylet_id: bytes, + static_resources: Dict[str, Dict], + dynamic_resources: Dict[str, Dict], + node_idle_duration_s: float, + waiting_bundles: List[Dict[str, float]] = None, + infeasible_bundles: List[Dict[str, float]] = None, + pending_placement_groups: List[PlacementGroupTableData] = None, + cluster_full_of_actors_detected: bool = False, + ): + self.static_resources_by_ip[ip] = static_resources + self.raylet_id_by_ip[ip] = raylet_id + self.cluster_full_of_actors_detected = cluster_full_of_actors_detected + + if not waiting_bundles: + waiting_bundles = [] + if not infeasible_bundles: + infeasible_bundles = [] + if not pending_placement_groups: + pending_placement_groups = [] + + # We are not guaranteed to have a corresponding dynamic resource + # for every static resource because dynamic resources are based on + # the available resources in the heartbeat, which does not exist + # if it is zero. Thus, we have to update dynamic resources here. 
+ dynamic_resources_update = dynamic_resources.copy() + for resource_name, capacity in self.static_resources_by_ip[ip].items(): + if resource_name not in dynamic_resources_update: + dynamic_resources_update[resource_name] = 0.0 + self.dynamic_resources_by_ip[ip] = dynamic_resources_update + + now = time.time() + self.ray_nodes_last_used_time_by_ip[ip] = now - node_idle_duration_s + self.last_heartbeat_time_by_ip[ip] = now + self.waiting_bundles = waiting_bundles + self.infeasible_bundles = infeasible_bundles + self.pending_placement_groups = pending_placement_groups + + def mark_active(self, ip): + assert ip is not None, "IP should be known at this time" + logger.debug("Node {} is newly setup, treating as active".format(ip)) + self.last_heartbeat_time_by_ip[ip] = time.time() + + def is_active(self, ip): + return ip in self.last_heartbeat_time_by_ip + + def prune_active_ips(self, active_ips: List[str]): + """The Raylet ips stored by LoadMetrics are obtained by polling + the GCS in Monitor.update_load_metrics(). + + On the other hand, the autoscaler gets a list of node ips from + its NodeProvider. + + This method removes from LoadMetrics the ips unknown to the autoscaler. + + Args: + active_ips (List[str]): The node ips known to the autoscaler. 
+ """ + active_ips = set(active_ips) + + def prune(mapping, should_log): + unwanted_ips = set(mapping) - active_ips + for unwanted_ip in unwanted_ips: + if should_log: + logger.info("LoadMetrics: " f"Removed ip: {unwanted_ip}.") + del mapping[unwanted_ip] + if unwanted_ips and should_log: + logger.info( + "LoadMetrics: " + "Removed {} stale ip mappings: {} not in {}".format( + len(unwanted_ips), unwanted_ips, active_ips + ) + ) + assert not (unwanted_ips & set(mapping)) + + prune(self.ray_nodes_last_used_time_by_ip, should_log=True) + prune(self.static_resources_by_ip, should_log=False) + prune(self.raylet_id_by_ip, should_log=False) + prune(self.dynamic_resources_by_ip, should_log=False) + prune(self.last_heartbeat_time_by_ip, should_log=False) + + def get_node_resources(self): + """Return a list of node resources (static resource sizes). + + Example: + >>> from ray.autoscaler._private.load_metrics import LoadMetrics + >>> metrics = LoadMetrics(...) # doctest: +SKIP + >>> metrics.get_node_resources() # doctest: +SKIP + [{"CPU": 1}, {"CPU": 4, "GPU": 8}] # for two different nodes + """ + return self.static_resources_by_ip.values() + + def get_static_node_resources_by_ip(self) -> Dict[NodeIP, ResourceDict]: + """Return a dict of node resources for every node ip. + + Example: + >>> from ray.autoscaler._private.load_metrics import LoadMetrics + >>> metrics = LoadMetrics(...) 
# doctest: +SKIP + >>> metrics.get_static_node_resources_by_ip() # doctest: +SKIP + {127.0.0.1: {"CPU": 1}, 127.0.0.2: {"CPU": 4, "GPU": 8}} + """ + return self.static_resources_by_ip + + def get_resource_utilization(self): + return self.dynamic_resources_by_ip + + def _get_resource_usage(self): + resources_used = {} + resources_total = {} + for ip, max_resources in self.static_resources_by_ip.items(): + avail_resources = self.dynamic_resources_by_ip[ip] + for resource_id, amount in max_resources.items(): + used = amount - avail_resources[resource_id] + if resource_id not in resources_used: + resources_used[resource_id] = 0.0 + resources_total[resource_id] = 0.0 + resources_used[resource_id] += used + resources_total[resource_id] += amount + used = max(0, used) + + return resources_used, resources_total + + def get_resource_demand_vector(self, clip=True): + if clip: + # Bound the total number of bundles to + # 2xMAX_RESOURCE_DEMAND_VECTOR_SIZE. This guarantees the resource + # demand scheduler bin packing algorithm takes a reasonable amount + # of time to run. + return ( + self.waiting_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE] + + self.infeasible_bundles[:AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE] + ) + else: + return self.waiting_bundles + self.infeasible_bundles + + def get_resource_requests(self): + return self.resource_requests + + def get_pending_placement_groups(self): + return self.pending_placement_groups + + def resources_avail_summary(self) -> str: + """Return a concise string of cluster size to report to event logs. + + For example, "3 CPUs, 4 GPUs". 
+ """ + total_resources = ( + reduce(add_resources, self.static_resources_by_ip.values()) + if self.static_resources_by_ip + else {} + ) + out = "{} CPUs".format(int(total_resources.get("CPU", 0))) + if "GPU" in total_resources: + out += ", {} GPUs".format(int(total_resources["GPU"])) + if "TPU" in total_resources: + out += ", {} TPUs".format(int(total_resources["TPU"])) + return out + + def summary(self): + available_resources = ( + reduce(add_resources, self.dynamic_resources_by_ip.values()) + if self.dynamic_resources_by_ip + else {} + ) + total_resources = ( + reduce(add_resources, self.static_resources_by_ip.values()) + if self.static_resources_by_ip + else {} + ) + usage_dict = {} + for key in total_resources: + if key in ["memory", "object_store_memory"]: + total = total_resources[key] + available = available_resources[key] + usage_dict[key] = (total - available, total) + else: + total = total_resources[key] + usage_dict[key] = (total - available_resources[key], total) + + summarized_demand_vector = freq_of_dicts( + self.get_resource_demand_vector(clip=False) + ) + summarized_resource_requests = freq_of_dicts(self.get_resource_requests()) + + def placement_group_serializer(pg): + bundles = tuple( + frozenset(bundle.unit_resources.items()) for bundle in pg.bundles + ) + return (bundles, pg.strategy) + + def placement_group_deserializer(pg_tuple): + # We marshal this as a dictionary so that we can easily json.dumps + # it later. + # TODO (Alex): Would there be a benefit to properly + # marshalling this (into a protobuf)? 
+ bundles = list(map(dict, pg_tuple[0])) + return { + "bundles": freq_of_dicts(bundles), + "strategy": PlacementStrategy.Name(pg_tuple[1]), + } + + summarized_placement_groups = freq_of_dicts( + self.get_pending_placement_groups(), + serializer=placement_group_serializer, + deserializer=placement_group_deserializer, + ) + nodes_summary = freq_of_dicts(self.static_resources_by_ip.values()) + + usage_by_node = None + if AUTOSCALER_REPORT_PER_NODE_STATUS: + usage_by_node = {} + for ip, totals in self.static_resources_by_ip.items(): + available = self.dynamic_resources_by_ip.get(ip, {}) + usage_by_node[ip] = {} + for resource, total in totals.items(): + usage_by_node[ip][resource] = ( + total - available.get(resource, 0), + total, + ) + + return LoadMetricsSummary( + usage=usage_dict, + resource_demand=summarized_demand_vector, + pg_demand=summarized_placement_groups, + request_demand=summarized_resource_requests, + node_types=nodes_summary, + usage_by_node=usage_by_node, + ) + + def set_resource_requests(self, requested_resources): + if requested_resources is not None: + assert isinstance(requested_resources, list), requested_resources + self.resource_requests = [ + request for request in requested_resources if len(request) > 0 + ] + + def info_string(self): + return " - " + "\n - ".join( + ["{}: {}".format(k, v) for k, v in sorted(self._info().items())] + ) + + def _info(self): + resources_used, resources_total = self._get_resource_usage() + + now = time.time() + idle_times = [now - t for t in self.ray_nodes_last_used_time_by_ip.values()] + heartbeat_times = [now - t for t in self.last_heartbeat_time_by_ip.values()] + most_delayed_heartbeats = sorted( + self.last_heartbeat_time_by_ip.items(), key=lambda pair: pair[1] + )[:5] + most_delayed_heartbeats = {ip: (now - t) for ip, t in most_delayed_heartbeats} + + def format_resource(key, value): + if key in ["object_store_memory", "memory"]: + return "{} GiB".format(round(value / (1024 * 1024 * 1024), 2)) + else: + return 
def load_function_or_class(path):
    """Load a function or class at runtime given a full path.

    Example of the path: mypkg.mysubpkg.myclass

    Args:
        path: Fully qualified dotted name, e.g. ``"mymodule.MyProvider"``.

    Returns:
        The attribute named by the final path segment.

    Raises:
        ValueError: If ``path`` has no module component.
    """
    module_path, _, attr_name = path.rpartition(".")
    if not module_path:
        raise ValueError("You need to pass a valid path like mymodule.provider_class")
    module = importlib.import_module(module_path)
    return getattr(module, attr_name)
b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/coordinator_node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99e66b023ebb680b50831f770dae92686a2e1013 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/__pycache__/node_provider.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a461c4cfbeb04d5b908ec2544711c204117eefb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/config.py @@ -0,0 +1,121 @@ +import copy +import os +from typing import Any, Dict + +from ray._private.utils import get_ray_temp_dir +from ray.autoscaler._private.cli_logger import cli_logger + +unsupported_field_message = "The field {} is not supported for on-premise clusters." + +LOCAL_CLUSTER_NODE_TYPE = "local.cluster.node" + + +def prepare_local(config: Dict[str, Any]) -> Dict[str, Any]: + """ + Prepare local cluster config for ingestion by cluster launcher and + autoscaler. + """ + config = copy.deepcopy(config) + for field in "head_node", "worker_nodes", "available_node_types": + if config.get(field): + err_msg = unsupported_field_message.format(field) + cli_logger.abort(err_msg) + # We use a config with a single node type for on-prem clusters. 
+ # Resources internally detected by Ray are not overridden by the autoscaler + # (see NodeProvider.do_update) + config["available_node_types"] = { + LOCAL_CLUSTER_NODE_TYPE: {"node_config": {}, "resources": {}} + } + config["head_node_type"] = LOCAL_CLUSTER_NODE_TYPE + if "coordinator_address" in config["provider"]: + config = prepare_coordinator(config) + else: + config = prepare_manual(config) + return config + + +def prepare_coordinator(config: Dict[str, Any]) -> Dict[str, Any]: + config = copy.deepcopy(config) + # User should explicitly set the max number of workers for the coordinator + # to allocate. + if "max_workers" not in config: + cli_logger.abort( + "The field `max_workers` is required when using an " + "automatically managed on-premise cluster." + ) + node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE] + # The autoscaler no longer uses global `min_workers`. + # Move `min_workers` to the node_type config. + node_type["min_workers"] = config.pop("min_workers", 0) + node_type["max_workers"] = config["max_workers"] + return config + + +def prepare_manual(config: Dict[str, Any]) -> Dict[str, Any]: + """Validates and sets defaults for configs of manually managed on-prem + clusters. + + - Checks for presence of required `worker_ips` and `head_ips` fields. + - Defaults min and max workers to the number of `worker_ips`. + - Caps min and max workers at the number of `worker_ips`. + - Writes min and max worker info into the single worker node type. + """ + config = copy.deepcopy(config) + if ("worker_ips" not in config["provider"]) or ( + "head_ip" not in config["provider"] + ): + cli_logger.abort( + "Please supply a `head_ip` and list of `worker_ips`. " + "Alternatively, supply a `coordinator_address`." + ) + num_ips = len(config["provider"]["worker_ips"]) + node_type = config["available_node_types"][LOCAL_CLUSTER_NODE_TYPE] + # Default to keeping all provided ips in the cluster. 
+ config.setdefault("max_workers", num_ips) + + # The autoscaler no longer uses global `min_workers`. + # We will move `min_workers` to the node_type config. + min_workers = config.pop("min_workers", num_ips) + max_workers = config["max_workers"] + + if min_workers > num_ips: + cli_logger.warning( + f"The value of `min_workers` supplied ({min_workers}) is greater" + f" than the number of available worker ips ({num_ips})." + f" Setting `min_workers={num_ips}`." + ) + node_type["min_workers"] = num_ips + else: + node_type["min_workers"] = min_workers + + if max_workers > num_ips: + cli_logger.warning( + f"The value of `max_workers` supplied ({max_workers}) is greater" + f" than the number of available worker ips ({num_ips})." + f" Setting `max_workers={num_ips}`." + ) + node_type["max_workers"] = num_ips + config["max_workers"] = num_ips + else: + node_type["max_workers"] = max_workers + + if max_workers < num_ips: + cli_logger.warning( + f"The value of `max_workers` supplied ({max_workers}) is less" + f" than the number of available worker ips ({num_ips})." + f" At most {max_workers} Ray worker nodes will connect to the cluster." 
+ ) + + return config + + +def get_lock_path(cluster_name: str) -> str: + return os.path.join(get_ray_temp_dir(), "cluster-{}.lock".format(cluster_name)) + + +def get_state_path(cluster_name: str) -> str: + return os.path.join(get_ray_temp_dir(), "cluster-{}.state".format(cluster_name)) + + +def bootstrap_local(config: Dict[str, Any]) -> Dict[str, Any]: + return config diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..a74d593d36c732243d80a57944f938167576c057 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/coordinator_node_provider.py @@ -0,0 +1,110 @@ +import json +import logging +from http.client import RemoteDisconnected + +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME + +logger = logging.getLogger(__name__) + + +class CoordinatorSenderNodeProvider(NodeProvider): + """NodeProvider for automatically managed private/local clusters. + + The cluster management is handled by a remote coordinating server. + The server listens on , therefore, the address + should be provided in the provider section in the cluster config. + The server receieves HTTP requests from this class and uses + LocalNodeProvider to get their responses. + """ + + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + self.coordinator_address = provider_config["coordinator_address"] + + def _get_http_response(self, request): + headers = { + "Content-Type": "application/json", + } + request_message = json.dumps(request).encode() + http_coordinator_address = "http://" + self.coordinator_address + + try: + import requests # `requests` is not part of stdlib. 
+ from requests.exceptions import ConnectionError + + r = requests.get( + http_coordinator_address, + data=request_message, + headers=headers, + timeout=None, + ) + except (RemoteDisconnected, ConnectionError): + logger.exception( + "Could not connect to: " + + http_coordinator_address + + ". Did you run python coordinator_server.py" + + " --ips --port ?" + ) + raise + except ImportError: + logger.exception( + "Not all Ray Autoscaler dependencies were found. " + "In Ray 1.4+, the Ray CLI, autoscaler, and dashboard will " + 'only be usable via `pip install "ray[default]"`. Please ' + "update your install command." + ) + raise + + response = r.json() + return response + + def non_terminated_nodes(self, tag_filters): + # Only get the non terminated nodes associated with this cluster name. + tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name + request = {"type": "non_terminated_nodes", "args": (tag_filters,)} + return self._get_http_response(request) + + def is_running(self, node_id): + request = {"type": "is_running", "args": (node_id,)} + return self._get_http_response(request) + + def is_terminated(self, node_id): + request = {"type": "is_terminated", "args": (node_id,)} + return self._get_http_response(request) + + def node_tags(self, node_id): + request = {"type": "node_tags", "args": (node_id,)} + return self._get_http_response(request) + + def external_ip(self, node_id): + request = {"type": "external_ip", "args": (node_id,)} + response = self._get_http_response(request) + return response + + def internal_ip(self, node_id): + request = {"type": "internal_ip", "args": (node_id,)} + response = self._get_http_response(request) + return response + + def create_node(self, node_config, tags, count): + # Tag the newly created node with this cluster name. Helps to get + # the right nodes when calling non_terminated_nodes. 
+ tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name + request = { + "type": "create_node", + "args": (node_config, tags, count), + } + self._get_http_response(request) + + def set_node_tags(self, node_id, tags): + request = {"type": "set_node_tags", "args": (node_id, tags)} + self._get_http_response(request) + + def terminate_node(self, node_id): + request = {"type": "terminate_node", "args": (node_id,)} + self._get_http_response(request) + + def terminate_nodes(self, node_ids): + request = {"type": "terminate_nodes", "args": (node_ids,)} + self._get_http_response(request) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..cd3b7b64166fab3766cfe29a762b845c1b372398 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/local/node_provider.py @@ -0,0 +1,304 @@ +import json +import logging +import os +import socket +from threading import RLock + +from filelock import FileLock + +from ray.autoscaler._private.local.config import ( + LOCAL_CLUSTER_NODE_TYPE, + bootstrap_local, + get_lock_path, + get_state_path, +) +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_WORKER, + STATUS_UP_TO_DATE, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) + +logger = logging.getLogger(__name__) + +filelock_logger = logging.getLogger("filelock") +filelock_logger.setLevel(logging.WARNING) + + +class ClusterState: + def __init__(self, lock_path, save_path, provider_config): + self.lock = RLock() + os.makedirs(os.path.dirname(lock_path), exist_ok=True) + self.file_lock = FileLock(lock_path) + self.save_path = save_path + + with self.lock: + with self.file_lock: + if os.path.exists(self.save_path): + workers = json.loads(open(self.save_path).read()) + head_config = 
workers.get(provider_config["head_ip"]) + if ( + not head_config + or head_config.get("tags", {}).get(TAG_RAY_NODE_KIND) + != NODE_KIND_HEAD + ): + workers = {} + logger.info("Head IP changed - recreating cluster.") + else: + workers = {} + logger.info( + "ClusterState: Loaded cluster state: {}".format(list(workers)) + ) + for worker_ip in provider_config["worker_ips"]: + if worker_ip not in workers: + workers[worker_ip] = { + "tags": {TAG_RAY_NODE_KIND: NODE_KIND_WORKER}, + "state": "terminated", + } + else: + assert ( + workers[worker_ip]["tags"][TAG_RAY_NODE_KIND] + == NODE_KIND_WORKER + ) + if provider_config["head_ip"] not in workers: + workers[provider_config["head_ip"]] = { + "tags": {TAG_RAY_NODE_KIND: NODE_KIND_HEAD}, + "state": "terminated", + } + else: + assert ( + workers[provider_config["head_ip"]]["tags"][TAG_RAY_NODE_KIND] + == NODE_KIND_HEAD + ) + # Relevant when a user reduces the number of workers + # without changing the headnode. + list_of_node_ips = list(provider_config["worker_ips"]) + list_of_node_ips.append(provider_config["head_ip"]) + for worker_ip in list(workers): + if worker_ip not in list_of_node_ips: + del workers[worker_ip] + + # Set external head ip, if provided by user. + # Necessary if calling `ray up` from outside the network. + # Refer to LocalNodeProvider.external_ip function. 
+ external_head_ip = provider_config.get("external_head_ip") + if external_head_ip: + head = workers[provider_config["head_ip"]] + head["external_ip"] = external_head_ip + + assert len(workers) == len(provider_config["worker_ips"]) + 1 + with open(self.save_path, "w") as f: + logger.debug( + "ClusterState: Writing cluster state: {}".format(workers) + ) + f.write(json.dumps(workers)) + + def get(self): + with self.lock: + with self.file_lock: + workers = json.loads(open(self.save_path).read()) + return workers + + def put(self, worker_id, info): + assert "tags" in info + assert "state" in info + with self.lock: + with self.file_lock: + workers = self.get() + workers[worker_id] = info + with open(self.save_path, "w") as f: + logger.info( + "ClusterState: " + "Writing cluster state: {}".format(list(workers)) + ) + f.write(json.dumps(workers)) + + +class OnPremCoordinatorState(ClusterState): + """Generates & updates the state file of CoordinatorSenderNodeProvider. + + Unlike ClusterState, which generates a cluster specific file with + predefined head and worker ips, OnPremCoordinatorState overwrites + ClusterState's __init__ function to generate and manage a unified + file of the status of all the nodes for multiple clusters. + """ + + def __init__(self, lock_path, save_path, list_of_node_ips): + self.lock = RLock() + self.file_lock = FileLock(lock_path) + self.save_path = save_path + + with self.lock: + with self.file_lock: + if os.path.exists(self.save_path): + nodes = json.loads(open(self.save_path).read()) + else: + nodes = {} + logger.info( + "OnPremCoordinatorState: " + "Loaded on prem coordinator state: {}".format(nodes) + ) + + # Filter removed node ips. 
+ for node_ip in list(nodes): + if node_ip not in list_of_node_ips: + del nodes[node_ip] + + for node_ip in list_of_node_ips: + if node_ip not in nodes: + nodes[node_ip] = { + "tags": {}, + "state": "terminated", + } + assert len(nodes) == len(list_of_node_ips) + with open(self.save_path, "w") as f: + logger.info( + "OnPremCoordinatorState: " + "Writing on prem coordinator state: {}".format(nodes) + ) + f.write(json.dumps(nodes)) + + +class LocalNodeProvider(NodeProvider): + """NodeProvider for private/local clusters. + + `node_id` is overloaded to also be `node_ip` in this class. + + When `cluster_name` is provided, it manages a single cluster in a cluster + specific state file. But when `cluster_name` is None, it manages multiple + clusters in a unified state file that requires each node to be tagged with + TAG_RAY_CLUSTER_NAME in create and non_terminated_nodes function calls to + associate each node with the right cluster. + + The current use case of managing multiple clusters is by + OnPremCoordinatorServer which receives node provider HTTP requests + from CoordinatorSenderNodeProvider and uses LocalNodeProvider to get + the responses. + """ + + def __init__(self, provider_config, cluster_name): + NodeProvider.__init__(self, provider_config, cluster_name) + + if cluster_name: + lock_path = get_lock_path(cluster_name) + state_path = get_state_path(cluster_name) + self.state = ClusterState( + lock_path, + state_path, + provider_config, + ) + self.use_coordinator = False + else: + # LocalNodeProvider with a coordinator server. 
+ self.state = OnPremCoordinatorState( + "/tmp/coordinator.lock", + "/tmp/coordinator.state", + provider_config["list_of_node_ips"], + ) + self.use_coordinator = True + + def non_terminated_nodes(self, tag_filters): + workers = self.state.get() + matching_ips = [] + for worker_ip, info in workers.items(): + if info["state"] == "terminated": + continue + ok = True + for k, v in tag_filters.items(): + if info["tags"].get(k) != v: + ok = False + break + if ok: + matching_ips.append(worker_ip) + return matching_ips + + def is_running(self, node_id): + return self.state.get()[node_id]["state"] == "running" + + def is_terminated(self, node_id): + return not self.is_running(node_id) + + def node_tags(self, node_id): + return self.state.get()[node_id]["tags"] + + def external_ip(self, node_id): + """Returns an external ip if the user has supplied one. + Otherwise, use the same logic as internal_ip below. + + This can be used to call ray up from outside the network, for example + if the Ray cluster exists in an AWS VPC and we're interacting with + the cluster from a laptop (where using an internal_ip will not work). 
+ + Useful for debugging the local node provider with cloud VMs.""" + + node_state = self.state.get()[node_id] + ext_ip = node_state.get("external_ip") + if ext_ip: + return ext_ip + else: + return socket.gethostbyname(node_id) + + def internal_ip(self, node_id): + return socket.gethostbyname(node_id) + + def set_node_tags(self, node_id, tags): + with self.state.file_lock: + info = self.state.get()[node_id] + info["tags"].update(tags) + self.state.put(node_id, info) + + def create_node(self, node_config, tags, count): + """Creates min(count, currently available) nodes.""" + node_type = tags[TAG_RAY_NODE_KIND] + with self.state.file_lock: + workers = self.state.get() + for node_id, info in workers.items(): + if info["state"] == "terminated" and ( + self.use_coordinator or info["tags"][TAG_RAY_NODE_KIND] == node_type + ): + info["tags"] = tags + info["state"] = "running" + self.state.put(node_id, info) + count = count - 1 + if count == 0: + return + + def terminate_node(self, node_id): + workers = self.state.get() + info = workers[node_id] + info["state"] = "terminated" + self.state.put(node_id, info) + + @staticmethod + def bootstrap_config(cluster_config): + return bootstrap_local(cluster_config) + + +def record_local_head_state_if_needed(local_provider: LocalNodeProvider) -> None: + """This function is called on the Ray head from StandardAutoscaler.reset + to record the head node's own existence in the cluster state file. + + This is necessary because `provider.create_node` in + `commands.get_or_create_head_node` records the head state on the + cluster-launching machine but not on the head. + """ + head_ip = local_provider.provider_config["head_ip"] + cluster_name = local_provider.cluster_name + # If the head node is not marked as created in the cluster state file, + if head_ip not in local_provider.non_terminated_nodes({}): + # These tags are based on the ones in commands.get_or_create_head_node; + # keep in sync. 
+ head_tags = { + TAG_RAY_NODE_KIND: NODE_KIND_HEAD, + TAG_RAY_USER_NODE_TYPE: LOCAL_CLUSTER_NODE_TYPE, + TAG_RAY_NODE_NAME: "ray-{}-head".format(cluster_name), + TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, + } + # Mark the head node as created in the cluster state file. + local_provider.create_node(node_config={}, tags=head_tags, count=1) + + assert head_ip in local_provider.non_terminated_nodes({}) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py new file mode 100644 index 0000000000000000000000000000000000000000..9e43d4692201c257c9971634c64633908a228f6e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/log_timer.py @@ -0,0 +1,33 @@ +import datetime +import logging + +from ray.autoscaler._private.cli_logger import cli_logger + +logger = logging.getLogger(__name__) + + +class LogTimer: + def __init__(self, message, show_status=False): + self._message = message + self._show_status = show_status + + def __enter__(self): + self._start_time = datetime.datetime.utcnow() + + def __exit__(self, *error_vals): + if cli_logger.log_style != "record": + return + + td = datetime.datetime.utcnow() - self._start_time + status = "" + if self._show_status: + status = "failed" if any(error_vals) else "succeeded" + cli_logger.print( + " ".join( + [ + self._message, + status, + "[LogTimer={:.0f}ms]".format(td.total_seconds() * 1000), + ] + ) + ) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..ebede2890a6f1417f9f979e209da79d3641eca26 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/monitor.py @@ -0,0 +1,719 @@ +"""Autoscaler monitoring loop daemon.""" + +import argparse +import json +import logging +import os +import signal +import sys +import time 
+import traceback +from collections import Counter +from dataclasses import asdict +from typing import Any, Callable, Dict, Optional, Union + +import ray +import ray._private.ray_constants as ray_constants +import ray._private.utils +from ray._private.event.event_logger import get_event_logger +from ray._private.ray_logging import setup_component_logger +from ray._raylet import GcsClient +from ray.autoscaler._private.autoscaler import StandardAutoscaler +from ray.autoscaler._private.commands import teardown_cluster +from ray.autoscaler._private.constants import ( + AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE, + AUTOSCALER_METRIC_PORT, + AUTOSCALER_UPDATE_INTERVAL_S, + DISABLE_LAUNCH_CONFIG_CHECK_KEY, +) +from ray.autoscaler._private.event_summarizer import EventSummarizer +from ray.autoscaler._private.load_metrics import LoadMetrics +from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics +from ray.autoscaler._private.util import format_readonly_node_type +from ray.autoscaler.v2.sdk import get_cluster_resource_state +from ray.core.generated import gcs_pb2 +from ray.core.generated.event_pb2 import Event as RayEvent +from ray.experimental.internal_kv import ( + _initialize_internal_kv, + _internal_kv_del, + _internal_kv_get, + _internal_kv_initialized, + _internal_kv_put, +) + +try: + import prometheus_client +except ImportError: + prometheus_client = None + + +logger = logging.getLogger(__name__) + + +def parse_resource_demands(resource_load_by_shape): + """Handle the message.resource_load_by_shape protobuf for the demand + based autoscaling. Catch and log all exceptions so this doesn't + interfere with the utilization based autoscaler until we're confident + this is stable. Worker queue backlogs are added to the appropriate + resource demand vector. + + Args: + resource_load_by_shape (pb2.gcs.ResourceLoad): The resource demands + in protobuf form or None. + + Returns: + List[ResourceDict]: Waiting bundles (ready and feasible). 
+ List[ResourceDict]: Infeasible bundles. + """ + waiting_bundles, infeasible_bundles = [], [] + try: + for resource_demand_pb in list(resource_load_by_shape.resource_demands): + request_shape = dict(resource_demand_pb.shape) + for _ in range(resource_demand_pb.num_ready_requests_queued): + waiting_bundles.append(request_shape) + for _ in range(resource_demand_pb.num_infeasible_requests_queued): + infeasible_bundles.append(request_shape) + + # Infeasible and ready states for tasks are (logically) + # mutually exclusive. + if resource_demand_pb.num_infeasible_requests_queued > 0: + backlog_queue = infeasible_bundles + else: + backlog_queue = waiting_bundles + for _ in range(resource_demand_pb.backlog_size): + backlog_queue.append(request_shape) + if ( + len(waiting_bundles + infeasible_bundles) + > AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE + ): + break + except Exception: + logger.exception("Failed to parse resource demands.") + + return waiting_bundles, infeasible_bundles + + +# Readonly provider config (e.g., for laptop mode, manually setup clusters). +BASE_READONLY_CONFIG = { + "cluster_name": "default", + "max_workers": 0, + "upscaling_speed": 1.0, + "docker": {}, + "idle_timeout_minutes": 0, + "provider": { + "type": "readonly", + "use_node_id_as_ip": True, # For emulated multi-node on laptop. + DISABLE_LAUNCH_CONFIG_CHECK_KEY: True, # No launch check. + }, + "auth": {}, + "available_node_types": { + "ray.head.default": {"resources": {}, "node_config": {}, "max_workers": 0} + }, + "head_node_type": "ray.head.default", + "file_mounts": {}, + "cluster_synced_files": [], + "file_mounts_sync_continuously": False, + "rsync_exclude": [], + "rsync_filter": [], + "initialization_commands": [], + "setup_commands": [], + "head_setup_commands": [], + "worker_setup_commands": [], + "head_start_ray_commands": [], + "worker_start_ray_commands": [], +} + + +class Monitor: + """Autoscaling monitor. 
+ + This process periodically collects stats from the GCS and triggers + autoscaler updates. + """ + + def __init__( + self, + address: str, + autoscaling_config: Union[str, Callable[[], Dict[str, Any]]], + log_dir: str = None, + prefix_cluster_info: bool = False, + monitor_ip: Optional[str] = None, + retry_on_failure: bool = True, + ): + self.gcs_address = address + worker = ray._private.worker.global_worker + # TODO: eventually plumb ClusterID through to here + self.gcs_client = GcsClient(address=self.gcs_address) + + if monitor_ip: + monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}" + self.gcs_client.internal_kv_put( + b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None + ) + _initialize_internal_kv(self.gcs_client) + if monitor_ip: + monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}" + self.gcs_client.internal_kv_put( + b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None + ) + self._session_name = self.get_session_name(self.gcs_client) + logger.info(f"session_name: {self._session_name}") + worker.mode = 0 + head_node_ip = self.gcs_address.split(":")[0] + + self.load_metrics = LoadMetrics() + self.last_avail_resources = None + self.event_summarizer = EventSummarizer() + self.prefix_cluster_info = prefix_cluster_info + self.retry_on_failure = retry_on_failure + self.autoscaling_config = autoscaling_config + self.autoscaler = None + # If set, we are in a manually created cluster (non-autoscaling) and + # simply mirroring what the GCS tells us the cluster node types are. 
+ self.readonly_config = None + + if log_dir: + try: + self.event_logger = get_event_logger( + RayEvent.SourceType.AUTOSCALER, log_dir + ) + except Exception: + self.event_logger = None + else: + self.event_logger = None + + self.prom_metrics = AutoscalerPrometheusMetrics(session_name=self._session_name) + + if monitor_ip and prometheus_client: + # If monitor_ip wasn't passed in, then don't attempt to start the + # metric server to keep behavior identical to before metrics were + # introduced + try: + logger.info( + "Starting autoscaler metrics server on port {}".format( + AUTOSCALER_METRIC_PORT + ) + ) + kwargs = {"addr": "127.0.0.1"} if head_node_ip == "127.0.0.1" else {} + prometheus_client.start_http_server( + port=AUTOSCALER_METRIC_PORT, + registry=self.prom_metrics.registry, + **kwargs, + ) + + # Reset some gauges, since we don't know which labels have + # leaked if the autoscaler was restarted. + self.prom_metrics.pending_nodes.clear() + self.prom_metrics.active_nodes.clear() + except Exception: + logger.exception( + "An exception occurred while starting the metrics server." + ) + elif not prometheus_client: + logger.warning( + "`prometheus_client` not found, so metrics will not be exported." + ) + + logger.info("Monitor: Started") + + def _initialize_autoscaler(self): + if self.autoscaling_config: + autoscaling_config = self.autoscaling_config + else: + # This config mirrors the current setup of the manually created + # cluster. Each node gets its own unique node type. + self.readonly_config = BASE_READONLY_CONFIG + + # Note that the "available_node_types" of the config can change. 
+ def get_latest_readonly_config(): + return self.readonly_config + + autoscaling_config = get_latest_readonly_config + self.autoscaler = StandardAutoscaler( + autoscaling_config, + self.load_metrics, + self.gcs_client, + self._session_name, + prefix_cluster_info=self.prefix_cluster_info, + event_summarizer=self.event_summarizer, + prom_metrics=self.prom_metrics, + ) + + def update_load_metrics(self): + """Fetches resource usage data from GCS and updates load metrics.""" + + response = self.gcs_client.get_all_resource_usage(timeout=60) + resources_batch_data = response.resource_usage_data + log_resource_batch_data_if_desired(resources_batch_data) + + # This is a workaround to get correct idle_duration_ms + # from "get_cluster_resource_state" + # ref: https://github.com/ray-project/ray/pull/48519#issuecomment-2481659346 + cluster_resource_state = get_cluster_resource_state(self.gcs_client) + ray_node_states = cluster_resource_state.node_states + ray_nodes_idle_duration_ms_by_id = { + node.node_id: node.idle_duration_ms for node in ray_node_states + } + + # Tell the readonly node provider what nodes to report. + if self.readonly_config: + new_nodes = [] + for msg in list(resources_batch_data.batch): + node_id = msg.node_id.hex() + new_nodes.append((node_id, msg.node_manager_address)) + self.autoscaler.provider._set_nodes(new_nodes) + + mirror_node_types = {} + cluster_full = False + if ( + hasattr(response, "cluster_full_of_actors_detected_by_gcs") + and response.cluster_full_of_actors_detected_by_gcs + ): + # GCS has detected the cluster full of actors. + cluster_full = True + for resource_message in resources_batch_data.batch: + node_id = resource_message.node_id + # Generate node type config based on GCS reported node list. + if self.readonly_config: + # Keep prefix in sync with ReadonlyNodeProvider. 
+ node_type = format_readonly_node_type(node_id.hex()) + resources = {} + for k, v in resource_message.resources_total.items(): + resources[k] = v + mirror_node_types[node_type] = { + "resources": resources, + "node_config": {}, + "max_workers": 1, + } + if ( + hasattr(resource_message, "cluster_full_of_actors_detected") + and resource_message.cluster_full_of_actors_detected + ): + # A worker node has detected the cluster full of actors. + cluster_full = True + total_resources = dict(resource_message.resources_total) + available_resources = dict(resource_message.resources_available) + + waiting_bundles, infeasible_bundles = parse_resource_demands( + resources_batch_data.resource_load_by_shape + ) + + pending_placement_groups = list( + resources_batch_data.placement_group_load.placement_group_data + ) + + use_node_id_as_ip = self.autoscaler is not None and self.autoscaler.config[ + "provider" + ].get("use_node_id_as_ip", False) + + # "use_node_id_as_ip" is a hack meant to address situations in + # which there's more than one Ray node residing at a given ip. + # TODO (Dmitri): Stop using ips as node identifiers. 
+ # https://github.com/ray-project/ray/issues/19086 + if use_node_id_as_ip: + peloton_id = total_resources.get("NODE_ID_AS_RESOURCE") + # Legacy support https://github.com/ray-project/ray/pull/17312 + if peloton_id is not None: + ip = str(int(peloton_id)) + else: + ip = node_id.hex() + else: + ip = resource_message.node_manager_address + + idle_duration_s = 0.0 + if node_id in ray_nodes_idle_duration_ms_by_id: + idle_duration_s = ray_nodes_idle_duration_ms_by_id[node_id] / 1000 + else: + logger.warning( + f"node_id {node_id} not found in ray_nodes_idle_duration_ms_by_id" + ) + + self.load_metrics.update( + ip, + node_id, + total_resources, + available_resources, + idle_duration_s, + waiting_bundles, + infeasible_bundles, + pending_placement_groups, + cluster_full, + ) + if self.readonly_config: + self.readonly_config["available_node_types"].update(mirror_node_types) + + def get_session_name(self, gcs_client: GcsClient) -> Optional[str]: + """Obtain the session name from the GCS. + + If the GCS doesn't respond, session name is considered None. + In this case, the metrics reported from the monitor won't have + the correct session name. 
+ """ + if not _internal_kv_initialized(): + return None + + session_name = gcs_client.internal_kv_get( + b"session_name", + ray_constants.KV_NAMESPACE_SESSION, + timeout=10, + ) + + if session_name: + session_name = session_name.decode() + + return session_name + + def update_resource_requests(self): + """Fetches resource requests from the internal KV and updates load.""" + if not _internal_kv_initialized(): + return + data = _internal_kv_get( + ray._private.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL + ) + if data: + try: + resource_request = json.loads(data) + self.load_metrics.set_resource_requests(resource_request) + except Exception: + logger.exception("Error parsing resource requests") + + def _run(self): + """Run the monitor loop.""" + + while True: + try: + gcs_request_start_time = time.time() + self.update_load_metrics() + gcs_request_time = time.time() - gcs_request_start_time + self.update_resource_requests() + self.update_event_summary() + load_metrics_summary = self.load_metrics.summary() + status = { + "gcs_request_time": gcs_request_time, + "time": time.time(), + "monitor_pid": os.getpid(), + } + + if self.autoscaler and not self.load_metrics: + # load_metrics is Falsey iff we haven't collected any + # resource messages from the GCS, which can happen at startup if + # the GCS hasn't yet received data from the Raylets. + # In this case, do not do an autoscaler update. + # Wait to get load metrics. + logger.info( + "Autoscaler has not yet received load metrics. Waiting." 
+ ) + elif self.autoscaler: + # Process autoscaling actions + update_start_time = time.time() + self.autoscaler.update() + status["autoscaler_update_time"] = time.time() - update_start_time + autoscaler_summary = self.autoscaler.summary() + try: + self.emit_metrics( + load_metrics_summary, + autoscaler_summary, + self.autoscaler.all_node_types, + ) + except Exception: + logger.exception("Error emitting metrics") + + if autoscaler_summary: + status["autoscaler_report"] = asdict(autoscaler_summary) + status[ + "non_terminated_nodes_time" + ] = ( + self.autoscaler.non_terminated_nodes.non_terminated_nodes_time # noqa: E501 + ) + + for msg in self.event_summarizer.summary(): + # Need to prefix each line of the message for the lines to + # get pushed to the driver logs. + for line in msg.split("\n"): + logger.info( + "{}{}".format( + ray_constants.LOG_PREFIX_EVENT_SUMMARY, line + ) + ) + if self.event_logger: + self.event_logger.info(line) + + self.event_summarizer.clear() + + status["load_metrics_report"] = asdict(load_metrics_summary) + as_json = json.dumps(status) + if _internal_kv_initialized(): + _internal_kv_put( + ray_constants.DEBUG_AUTOSCALING_STATUS, as_json, overwrite=True + ) + except Exception: + # By default, do not exit the monitor on failure. + if self.retry_on_failure: + logger.exception("Monitor: Execution exception. Trying again...") + else: + raise + + # Wait for a autoscaler update interval before processing the next + # round of messages. 
+ time.sleep(AUTOSCALER_UPDATE_INTERVAL_S) + + def emit_metrics(self, load_metrics_summary, autoscaler_summary, node_types): + if autoscaler_summary is None: + return None + + for resource_name in ["CPU", "GPU", "TPU"]: + _, total = load_metrics_summary.usage.get(resource_name, (0, 0)) + pending = autoscaler_summary.pending_resources.get(resource_name, 0) + self.prom_metrics.cluster_resources.labels( + resource=resource_name, + SessionName=self.prom_metrics.session_name, + ).set(total) + self.prom_metrics.pending_resources.labels( + resource=resource_name, + SessionName=self.prom_metrics.session_name, + ).set(pending) + + pending_node_count = Counter() + for _, node_type, _ in autoscaler_summary.pending_nodes: + pending_node_count[node_type] += 1 + + for node_type, count in autoscaler_summary.pending_launches.items(): + pending_node_count[node_type] += count + + for node_type in node_types: + count = pending_node_count[node_type] + self.prom_metrics.pending_nodes.labels( + SessionName=self.prom_metrics.session_name, + NodeType=node_type, + ).set(count) + + for node_type in node_types: + count = autoscaler_summary.active_nodes.get(node_type, 0) + self.prom_metrics.active_nodes.labels( + SessionName=self.prom_metrics.session_name, + NodeType=node_type, + ).set(count) + + failed_node_counts = Counter() + for _, node_type in autoscaler_summary.failed_nodes: + failed_node_counts[node_type] += 1 + + # NOTE: This metric isn't reset with monitor resets. This means it will + # only be updated when the autoscaler' node tracker remembers failed + # nodes. If the node type failure is evicted from the autoscaler, the + # metric may not update for a while. + for node_type, count in failed_node_counts.items(): + self.prom_metrics.recently_failed_nodes.labels( + SessionName=self.prom_metrics.session_name, + NodeType=node_type, + ).set(count) + + def update_event_summary(self): + """Report the current size of the cluster. 
+ + To avoid log spam, only cluster size changes (CPU, GPU or TPU count change) + are reported to the event summarizer. The event summarizer will report + only the latest cluster size per batch. + """ + avail_resources = self.load_metrics.resources_avail_summary() + if not self.readonly_config and avail_resources != self.last_avail_resources: + self.event_summarizer.add( + "Resized to {}.", # e.g., Resized to 100 CPUs, 4 GPUs, 4 TPUs. + quantity=avail_resources, + aggregate=lambda old, new: new, + ) + self.last_avail_resources = avail_resources + + def destroy_autoscaler_workers(self): + """Cleanup the autoscaler, in case of an exception in the run() method. + + We kill the worker nodes, but retain the head node in order to keep + logs around, keeping costs minimal. This monitor process runs on the + head node anyway, so this is more reliable.""" + + if self.autoscaler is None: + return # Nothing to clean up. + + if self.autoscaling_config is None: + # This is a logic error in the program. Can't do anything. + logger.error("Monitor: Cleanup failed due to lack of autoscaler config.") + return + + logger.info("Monitor: Exception caught. Taking down workers...") + clean = False + while not clean: + try: + teardown_cluster( + config_file=self.autoscaling_config, + yes=True, # Non-interactive. + workers_only=True, # Retain head node for logs. + override_cluster_name=None, + keep_min_workers=True, # Retain minimal amount of workers. + ) + clean = True + logger.info("Monitor: Workers taken down.") + except Exception: + logger.error("Monitor: Cleanup exception. Trying again...") + time.sleep(2) + + def _handle_failure(self, error): + if ( + self.autoscaler is not None + and os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1" + ): + self.autoscaler.kill_workers() + # Take down autoscaler workers if necessary. + self.destroy_autoscaler_workers() + + # Something went wrong, so push an error to all current and future + # drivers. 
+ message = f"The autoscaler failed with the following error:\n{error}" + if _internal_kv_initialized(): + _internal_kv_put( + ray_constants.DEBUG_AUTOSCALING_ERROR, message, overwrite=True + ) + gcs_publisher = ray._raylet.GcsPublisher(address=self.gcs_address) + from ray._private.utils import publish_error_to_driver + + publish_error_to_driver( + ray_constants.MONITOR_DIED_ERROR, + message, + gcs_publisher=gcs_publisher, + ) + + def _signal_handler(self, sig, frame): + try: + self._handle_failure( + f"Terminated with signal {sig}\n" + + "".join(traceback.format_stack(frame)) + ) + except Exception: + logger.exception("Monitor: Failure in signal handler.") + sys.exit(sig + 128) + + def run(self): + # Register signal handlers for autoscaler termination. + # Signals will not be received on windows + signal.signal(signal.SIGINT, self._signal_handler) + signal.signal(signal.SIGTERM, self._signal_handler) + try: + if _internal_kv_initialized(): + # Delete any previous autoscaling errors. + _internal_kv_del(ray_constants.DEBUG_AUTOSCALING_ERROR) + self._initialize_autoscaler() + self._run() + except Exception: + logger.exception("Error in monitor loop") + self._handle_failure(traceback.format_exc()) + raise + + +def log_resource_batch_data_if_desired( + resources_batch_data: gcs_pb2.ResourceUsageBatchData, +) -> None: + if os.getenv("AUTOSCALER_LOG_RESOURCE_BATCH_DATA") == "1": + logger.info("Logging raw resource message pulled from GCS.") + logger.info(resources_batch_data) + logger.info("Done logging raw resource message.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=("Parse GCS server for the monitor to connect to.") + ) + parser.add_argument( + "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS." 
+ ) + parser.add_argument( + "--autoscaling-config", + required=False, + type=str, + help="the path to the autoscaling config file", + ) + parser.add_argument( + "--logging-level", + required=False, + type=str, + default=ray_constants.LOGGER_LEVEL, + choices=ray_constants.LOGGER_LEVEL_CHOICES, + help=ray_constants.LOGGER_LEVEL_HELP, + ) + parser.add_argument( + "--logging-format", + required=False, + type=str, + default=ray_constants.LOGGER_FORMAT, + help=ray_constants.LOGGER_FORMAT_HELP, + ) + parser.add_argument( + "--logging-filename", + required=False, + type=str, + default=ray_constants.MONITOR_LOG_FILE_NAME, + help="Specify the name of log file, " + "log to stdout if set empty, default is " + f'"{ray_constants.MONITOR_LOG_FILE_NAME}"', + ) + parser.add_argument( + "--logs-dir", + required=True, + type=str, + help="Specify the path of the temporary directory used by Ray processes.", + ) + parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.", + ) + parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.", + ) + parser.add_argument( + "--monitor-ip", + required=False, + type=str, + default=None, + help="The IP address of the machine hosting the monitor process.", + ) + + args = parser.parse_args() + setup_component_logger( + logging_level=args.logging_level, + logging_format=args.logging_format, + log_dir=args.logs_dir, + filename=args.logging_filename, + max_bytes=args.logging_rotate_bytes, + backup_count=args.logging_rotate_backup_count, + ) + + logger.info(f"Starting monitor using ray installation: {ray.__file__}") + logger.info(f"Ray version: {ray.__version__}") + 
logger.info(f"Ray commit: {ray.__commit__}") + logger.info(f"Monitor started with command: {sys.argv}") + + if args.autoscaling_config: + autoscaling_config = os.path.expanduser(args.autoscaling_config) + else: + autoscaling_config = None + + bootstrap_address = args.gcs_address + if bootstrap_address is None: + raise ValueError("--gcs-address must be set!") + + monitor = Monitor( + bootstrap_address, + autoscaling_config, + log_dir=args.logs_dir, + monitor_ip=args.monitor_ip, + ) + + monitor.run() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..a457a7703c1c43ff65feba155044bf83b7b71ae0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_launcher.py @@ -0,0 +1,221 @@ +import copy +import logging +import operator +import threading +import time +import traceback +from typing import Any, Dict, Optional + +from ray.autoscaler._private.node_provider_availability_tracker import ( + NodeProviderAvailabilityTracker, +) +from ray.autoscaler._private.prom_metrics import AutoscalerPrometheusMetrics +from ray.autoscaler._private.util import hash_launch_conf +from ray.autoscaler.node_launch_exception import NodeLaunchException +from ray.autoscaler.tags import ( + NODE_KIND_WORKER, + STATUS_UNINITIALIZED, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_NODE_STATUS, + TAG_RAY_USER_NODE_TYPE, +) + +logger = logging.getLogger(__name__) + + +class BaseNodeLauncher: + """Launches Ray nodes in the main thread using + `BaseNodeLauncher.launch_node()`. + + This is a superclass of NodeLauncher, which launches nodes asynchronously + in the background. + + By default, the subclass NodeLauncher is used to launch nodes in subthreads. 
+ That behavior can be flagged off in the provider config by setting + `foreground_node_launch: True`; the autoscaler will then makes blocking calls to + BaseNodeLauncher.launch_node() in the main thread. + """ + + def __init__( + self, + provider, + pending, + event_summarizer, + node_provider_availability_tracker: NodeProviderAvailabilityTracker, + session_name: Optional[str] = None, + prom_metrics=None, + node_types=None, + index=None, + *args, + **kwargs, + ): + self.pending = pending + self.event_summarizer = event_summarizer + self.node_provider_availability_tracker = node_provider_availability_tracker + self.prom_metrics = prom_metrics or AutoscalerPrometheusMetrics( + session_name=session_name + ) + self.provider = provider + self.node_types = node_types + self.index = str(index) if index is not None else "" + + def launch_node( + self, config: Dict[str, Any], count: int, node_type: str + ) -> Optional[Dict]: + self.log("Got {} nodes to launch.".format(count)) + created_nodes = self._launch_node(config, count, node_type) + self.pending.dec(node_type, count) + return created_nodes + + def _launch_node( + self, config: Dict[str, Any], count: int, node_type: str + ) -> Optional[Dict]: + if self.node_types: + assert node_type, node_type + + # The `worker_nodes` field is deprecated in favor of per-node-type + # node_configs. We allow it for backwards-compatibility. 
+ launch_config = copy.deepcopy(config.get("worker_nodes", {})) + if node_type: + launch_config.update( + config["available_node_types"][node_type]["node_config"] + ) + resources = copy.deepcopy( + config["available_node_types"][node_type]["resources"] + ) + labels = copy.deepcopy( + config["available_node_types"][node_type].get("labels", {}) + ) + launch_hash = hash_launch_conf(launch_config, config["auth"]) + node_config = copy.deepcopy(config.get("worker_nodes", {})) + node_tags = { + TAG_RAY_NODE_NAME: "ray-{}-worker".format(config["cluster_name"]), + TAG_RAY_NODE_KIND: NODE_KIND_WORKER, + TAG_RAY_NODE_STATUS: STATUS_UNINITIALIZED, + TAG_RAY_LAUNCH_CONFIG: launch_hash, + } + # A custom node type is specified; set the tag in this case, and also + # merge the configs. We merge the configs instead of overriding, so + # that the bootstrapped per-cloud properties are preserved. + # TODO(ekl) this logic is duplicated in commands.py (keep in sync) + if node_type: + node_tags[TAG_RAY_USER_NODE_TYPE] = node_type + node_config.update(launch_config) + + node_launch_start_time = time.time() + + error_msg = None + full_exception = None + created_nodes = {} + try: + created_nodes = self.provider.create_node_with_resources_and_labels( + node_config, node_tags, count, resources, labels + ) + except NodeLaunchException as node_launch_exception: + self.node_provider_availability_tracker.update_node_availability( + node_type, int(node_launch_start_time), node_launch_exception + ) + + if node_launch_exception.src_exc_info is not None: + full_exception = "\n".join( + traceback.format_exception(*node_launch_exception.src_exc_info) + ) + + error_msg = ( + f"Failed to launch {{}} node(s) of type {node_type}. " + f"({node_launch_exception.category}): " + f"{node_launch_exception.description}" + ) + except Exception: + error_msg = f"Failed to launch {{}} node(s) of type {node_type}." 
+ full_exception = traceback.format_exc() + else: + # Record some metrics/observability information when a node is launched. + launch_time = time.time() - node_launch_start_time + for _ in range(count): + # Note: when launching multiple nodes we observe the time it + # took all nodes to launch for each node. For example, if 4 + # nodes were created in 25 seconds, we would observe the 25 + # second create time 4 times. + self.prom_metrics.worker_create_node_time.observe(launch_time) + self.prom_metrics.started_nodes.inc(count) + self.node_provider_availability_tracker.update_node_availability( + node_type=node_type, + timestamp=int(node_launch_start_time), + node_launch_exception=None, + ) + + if error_msg is not None: + self.event_summarizer.add( + error_msg, + quantity=count, + aggregate=operator.add, + ) + self.log(error_msg) + self.prom_metrics.node_launch_exceptions.inc() + self.prom_metrics.failed_create_nodes.inc(count) + else: + self.log("Launching {} nodes, type {}.".format(count, node_type)) + self.event_summarizer.add( + "Adding {} node(s) of type " + str(node_type) + ".", + quantity=count, + aggregate=operator.add, + ) + + if full_exception is not None: + self.log(full_exception) + + return created_nodes + + def log(self, statement): + # launcher_class is "BaseNodeLauncher", or "NodeLauncher" if called + # from that subclass. 
+ launcher_class: str = type(self).__name__ + prefix = "{}{}:".format(launcher_class, self.index) + logger.info(prefix + " {}".format(statement)) + + +class NodeLauncher(BaseNodeLauncher, threading.Thread): + """Launches nodes asynchronously in the background.""" + + def __init__( + self, + provider, + queue, + pending, + event_summarizer, + node_provider_availability_tracker, + session_name: Optional[str] = None, + prom_metrics=None, + node_types=None, + index=None, + *thread_args, + **thread_kwargs, + ): + self.queue = queue + BaseNodeLauncher.__init__( + self, + provider=provider, + pending=pending, + event_summarizer=event_summarizer, + session_name=session_name, + node_provider_availability_tracker=node_provider_availability_tracker, + prom_metrics=prom_metrics, + node_types=node_types, + index=index, + ) + threading.Thread.__init__(self, *thread_args, **thread_kwargs) + + def run(self): + """Collects launch data from queue populated by StandardAutoscaler. + Launches nodes in a background thread. + + Overrides threading.Thread.run(). + NodeLauncher.start() executes this loop in a background thread. 
+ """ + while True: + config, count, node_type = self.queue.get() + # launch_node is implemented in BaseNodeLauncher + self.launch_node(config, count, node_type) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..9126ab4b0fea9d0e90a041915ab7d21991dbde50 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_provider_availability_tracker.py @@ -0,0 +1,165 @@ +import threading +import time +from dataclasses import dataclass +from typing import Callable, Dict, Optional, Tuple + +from ray.autoscaler._private.constants import ( + AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S, +) +from ray.autoscaler.node_launch_exception import NodeLaunchException + + +@dataclass +class UnavailableNodeInformation: + category: str + description: str + + +@dataclass +class NodeAvailabilityRecord: + node_type: str + is_available: bool + last_checked_timestamp: float + unavailable_node_information: Optional[UnavailableNodeInformation] + + +@dataclass +class NodeAvailabilitySummary: + node_availabilities: Dict[ + str, NodeAvailabilityRecord + ] # Mapping from node type to node availability record. + + @classmethod + def from_fields(cls, **fields) -> Optional["NodeAvailabilitySummary"]: + """Implement marshalling from nested fields. 
pydantic isn't a core dependency + so we're implementing this by hand instead.""" + parsed = {} + + node_availabilites_dict = fields.get("node_availabilities", {}) + + for node_type, node_availability_record_dict in node_availabilites_dict.items(): + unavailable_information_dict = node_availability_record_dict.pop( + "unavailable_node_information", None + ) + unavaiable_information = None + if unavailable_information_dict is not None: + unavaiable_information = UnavailableNodeInformation( + **unavailable_information_dict + ) + + parsed[node_type] = NodeAvailabilityRecord( + unavailable_node_information=unavaiable_information, + **node_availability_record_dict, + ) + + return NodeAvailabilitySummary(node_availabilities=parsed) + + def __eq__(self, other: "NodeAvailabilitySummary"): + return self.node_availabilities == other.node_availabilities + + def __bool__(self) -> bool: + return bool(self.node_availabilities) + + +class NodeProviderAvailabilityTracker: + """A thread safe, TTL cache of node provider availability. We don't use + cachetools.TTLCache because it always sets the expiration time relative to + insertion time, but in our case, we want entries to expire relative to when + the node creation was attempted (and entries aren't necessarily added in + order). We want the entries to expire because the information grows stale + over time. + """ + + def __init__( + self, + timer: Callable[[], float] = time.time, + ttl: float = AUTOSCALER_NODE_AVAILABILITY_MAX_STALENESS_S, + ): + """A cache that tracks the availability of nodes and throw away + entries which have grown too stale. + + Args: + timer: A function that returns the current time in seconds. + ttl: The ttl from the insertion timestamp of an entry. + """ + self.timer = timer + self.ttl = ttl + # Mapping from node type to (eviction_time, record) + self.store: Dict[str, Tuple[float, NodeAvailabilityRecord]] = {} + # A global lock to simplify thread safety handling. 
+ self.lock = threading.RLock() + + def _update_node_availability_requires_lock( + self, + node_type: str, + timestamp: int, + node_launch_exception: Optional[NodeLaunchException], + ) -> None: + if node_launch_exception is None: + record = NodeAvailabilityRecord( + node_type=node_type, + is_available=True, + last_checked_timestamp=timestamp, + unavailable_node_information=None, + ) + else: + info = UnavailableNodeInformation( + category=node_launch_exception.category, + description=node_launch_exception.description, + ) + record = NodeAvailabilityRecord( + node_type=node_type, + is_available=False, + last_checked_timestamp=timestamp, + unavailable_node_information=info, + ) + + expiration_time = timestamp + self.ttl + + # TODO (Alex): In theory it would be nice to make this dictionary + # ordered by expiration time, unfortunately that's a bit difficult + # since `update_node_availability` can be called with out of order + # timestamps. + self.store[node_type] = (expiration_time, record) + + self._remove_old_entries() + + def update_node_availability( + self, + node_type: str, + timestamp: int, + node_launch_exception: Optional[NodeLaunchException], + ) -> None: + """ + Update the availability and details of a single ndoe type. + + Args: + node_type: The node type. + timestamp: The timestamp that this information is accurate as of. + node_launch_exception: Details about why the node launch failed. If + empty, the node type will be considered available.""" + with self.lock: + self._update_node_availability_requires_lock( + node_type, timestamp, node_launch_exception + ) + + def summary(self) -> NodeAvailabilitySummary: + """ + Returns a summary of node availabilities and their staleness. + + Returns + A summary of node availabilities and their staleness. 
+ """ + with self.lock: + self._remove_old_entries() + return NodeAvailabilitySummary( + {node_type: record for node_type, (_, record) in self.store.items()} + ) + + def _remove_old_entries(self): + """Remove any expired entries from the cache.""" + cur_time = self.timer() + with self.lock: + for key, (expiration_time, _) in list(self.store.items()): + if expiration_time < cur_time: + del self.store[key] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..1b25117fc3bea6c81110fc9677ebd172e7eff789 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/node_tracker.py @@ -0,0 +1,77 @@ +from typing import List, Set, Tuple + +from ray.autoscaler._private import constants + + +class NodeTracker: + """Map nodes to their corresponding logs. + + We need to be a little careful here. At an given point in time, node_id <-> + ip can be interchangeably used, but the node_id -> ip relation is not + bijective _across time_ since IP addresses can be reused. Therefore, we + should treat node_id as the only unique identifier. + """ + + def __init__(self): + # Mapping from node_id -> (ip, node type, stdout_path, process runner) + self.node_mapping = {} + + # A quick, inefficient FIFO cache implementation. + self.lru_order = [] + + def _add_node_mapping(self, node_id: str, value: str): + if node_id in self.node_mapping: + return + + assert len(self.lru_order) == len(self.node_mapping) + if len(self.lru_order) >= constants.AUTOSCALER_MAX_NODES_TRACKED: + # The LRU eviction case + node_id = self.lru_order.pop(0) + del self.node_mapping[node_id] + + self.node_mapping[node_id] = value + self.lru_order.append(node_id) + + def track(self, node_id: str, ip: str, node_type: str): + """ + Begin to track a new node. + + Args: + node_id: The node id. + ip: The node ip address. 
+ node_type: The node type. + """ + if node_id not in self.node_mapping: + self._add_node_mapping(node_id, (ip, node_type)) + + def untrack(self, node_id: str): + """Gracefully stop tracking a node. If a node is intentionally removed from + the cluster, we should stop tracking it so we don't mistakenly mark it + as failed. + + Args: + node_id: The node id which failed. + """ + if node_id in self.node_mapping: + self.lru_order.remove(node_id) + del self.node_mapping[node_id] + + def get_all_failed_node_info( + self, non_failed_ids: Set[str] + ) -> List[Tuple[str, str]]: + """Get the information about all failed nodes. A failed node is any node which + we began to track that is not pending or alive (i.e. not failed). + + Args: + non_failed_ids: Nodes are failed unless they are in this set. + + Returns: + List[Tuple[str, str]]: A list of tuples. Each tuple is the ip + address and type of a failed node. + """ + failed_nodes = self.node_mapping.keys() - non_failed_ids + failed_info = [] + # Returning the list in order is important for display purposes. 
+ for node_id in filter(lambda node_id: node_id in failed_nodes, self.lru_order): + failed_info.append(self.node_mapping[node_id]) + return failed_info diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..a86407f3e030a54381961f33cd281b4479eff78f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/prom_metrics.py @@ -0,0 +1,292 @@ +from typing import Optional + + +class NullMetric: + """Mock metric class to be used in case of prometheus_client import error.""" + + def set(self, *args, **kwargs): + pass + + def observe(self, *args, **kwargs): + pass + + def inc(self, *args, **kwargs): + pass + + def labels(self, *args, **kwargs): + return self + + def clear(self): + pass + + +try: + + from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram + + # The metrics in this class should be kept in sync with + # python/ray/tests/test_metrics_agent.py + class AutoscalerPrometheusMetrics: + def __init__( + self, session_name: str = None, registry: Optional[CollectorRegistry] = None + ): + self.registry: CollectorRegistry = registry or CollectorRegistry( + auto_describe=True + ) + self._session_name = session_name + # Buckets: 5 seconds, 10 seconds, 20 seconds, 30 seconds, + # 45 seconds, 1 minute, 1.5 minutes, 2 minutes, + # 3 minutes, 4 minutes, 5 minutes, 6 minutes, + # 8 minutes, 10 minutes, 12 minutes, 15 minutes + # 20 minutes, 25 minutes, 30 minutes + # used for both worker launch time and worker update time + histogram_buckets = [ + 5, + 10, + 20, + 30, + 45, + 60, + 90, + 120, + 180, + 240, + 300, + 360, + 480, + 600, + 720, + 900, + 1200, + 1500, + 1800, + ] + # Buckets: .01 seconds to 1000 seconds. + # Used for autoscaler update time. 
+ update_time_buckets = [0.01, 0.1, 1, 10, 100, 1000] + self.worker_create_node_time: Histogram = Histogram( + "worker_create_node_time_seconds", + "Worker launch time. This is the time it takes for a call to " + "a node provider's create_node method to return. Note that " + "when nodes are launched in batches, the launch time for that " + "batch will be observed once for *each* node in that batch. " + "For example, if 8 nodes are launched in 3 minutes, a launch " + "time of 3 minutes will be observed 8 times.", + labelnames=("SessionName",), + unit="seconds", + namespace="autoscaler", + registry=self.registry, + buckets=histogram_buckets, + ).labels(SessionName=session_name) + self.worker_update_time: Histogram = Histogram( + "worker_update_time_seconds", + "Worker update time. This is the time between when an updater " + "thread begins executing and when it exits successfully. This " + "metric only observes times for successful updates.", + labelnames=("SessionName",), + unit="seconds", + namespace="autoscaler", + registry=self.registry, + buckets=histogram_buckets, + ).labels(SessionName=session_name) + self.update_time: Histogram = Histogram( + "update_time", + "Autoscaler update time. This is the time for an autoscaler " + "update iteration to complete.", + labelnames=("SessionName",), + unit="seconds", + namespace="autoscaler", + registry=self.registry, + buckets=update_time_buckets, + ).labels(SessionName=session_name) + self.pending_nodes: Gauge = Gauge( + "pending_nodes", + "Number of nodes pending to be started.", + labelnames=( + "NodeType", + "SessionName", + ), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ) + self.active_nodes: Gauge = Gauge( + "active_nodes", + "Number of nodes in the cluster.", + labelnames=( + "NodeType", + "SessionName", + ), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ) + self.recently_failed_nodes = Gauge( + "recently_failed_nodes", + "The number of recently failed nodes. 
This count could reset " + "at undefined times.", + labelnames=( + "NodeType", + "SessionName", + ), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ) + self.started_nodes: Counter = Counter( + "started_nodes", + "Number of nodes started.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.stopped_nodes: Counter = Counter( + "stopped_nodes", + "Number of nodes stopped.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.updating_nodes: Gauge = Gauge( + "updating_nodes", + "Number of nodes in the process of updating.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.recovering_nodes: Gauge = Gauge( + "recovering_nodes", + "Number of nodes in the process of recovering.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.running_workers: Gauge = Gauge( + "running_workers", + "Number of worker nodes running.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.failed_create_nodes: Counter = Counter( + "failed_create_nodes", + "Number of nodes that failed to be created due to an " + "exception in the node provider's create_node method.", + labelnames=("SessionName",), + unit="nodes", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.failed_updates: Counter = Counter( + "failed_updates", + "Number of failed worker node updates.", + labelnames=("SessionName",), + unit="updates", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.successful_updates: Counter = Counter( + "successful_updates", + "Number of 
succesfful worker node updates.", + labelnames=("SessionName",), + unit="updates", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.failed_recoveries: Counter = Counter( + "failed_recoveries", + "Number of failed node recoveries.", + labelnames=("SessionName",), + unit="recoveries", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.successful_recoveries: Counter = Counter( + "successful_recoveries", + "Number of successful node recoveries.", + labelnames=("SessionName",), + unit="recoveries", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.update_loop_exceptions: Counter = Counter( + "update_loop_exceptions", + "Number of exceptions raised in the update loop of the autoscaler.", + labelnames=("SessionName",), + unit="exceptions", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.node_launch_exceptions: Counter = Counter( + "node_launch_exceptions", + "Number of exceptions raised while launching nodes.", + labelnames=("SessionName",), + unit="exceptions", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.reset_exceptions: Counter = Counter( + "reset_exceptions", + "Number of exceptions raised while resetting the autoscaler.", + labelnames=("SessionName",), + unit="exceptions", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.config_validation_exceptions: Counter = Counter( + "config_validation_exceptions", + "Number of exceptions raised while validating the config " + "during a reset.", + labelnames=("SessionName",), + unit="exceptions", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + self.drain_node_exceptions: Counter = Counter( + "drain_node_exceptions", + "Number of exceptions raised when making a DrainNode rpc" + "prior to node termination.", + 
labelnames=("SessionName",), + unit="exceptions", + namespace="autoscaler", + registry=self.registry, + ).labels(SessionName=session_name) + # This represents the autoscaler's view of essentially + # `ray.cluster_resources()`, it may be slightly different from the + # core metric from an eventual consistency perspective. + self.cluster_resources: Gauge = Gauge( + "cluster_resources", + "Total logical resources in the cluster.", + labelnames=("resource", "SessionName"), + unit="resources", + namespace="autoscaler", + registry=self.registry, + ) + # This represents the pending launches + nodes being set up for the + # autoscaler. + self.pending_resources: Gauge = Gauge( + "pending_resources", + "Pending logical resources in the cluster.", + labelnames=("resource", "SessionName"), + unit="resources", + namespace="autoscaler", + registry=self.registry, + ) + + @property + def session_name(self): + return self._session_name + +except ImportError: + + class AutoscalerPrometheusMetrics(object): + def __init__(self, session_name: str = None): + pass + + def __getattr__(self, attr): + return NullMetric() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/providers.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/providers.py new file mode 100644 index 0000000000000000000000000000000000000000..930fc52bb3fc155d69cbe175b83ae0621b61563c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/providers.py @@ -0,0 +1,313 @@ +import copy +import json +import logging +import os +from typing import Any, Dict + +import yaml + +from ray.autoscaler._private.loader import load_function_or_class + +logger = logging.getLogger(__name__) + +# For caching provider instantiations across API calls of one python session +_provider_instances = {} + +# Minimal config for compatibility with legacy-style external configs. 
+MINIMAL_EXTERNAL_CONFIG = { + "available_node_types": { + "ray.head.default": {}, + "ray.worker.default": {}, + }, + "head_node_type": "ray.head.default", + "head_node": {}, + "worker_nodes": {}, +} + + +def _import_aws(provider_config): + try: + # boto3 and botocore are imported in multiple places in the codebase, + # so we just import them here to ensure that they are installed. + import boto3 # noqa: F401 + except ImportError as e: + raise ImportError( + "The Ray AWS VM launcher requires the AWS SDK for Python (Boto3) " + "to be installed. You can install it with `pip install boto3`." + ) from e + + from ray.autoscaler._private.aws.node_provider import AWSNodeProvider + + return AWSNodeProvider + + +def _import_gcp(provider_config): + try: + import googleapiclient # noqa: F401 + except ImportError as e: + raise ImportError( + "The Ray GCP VM launcher requires the Google API Client to be installed. " + "You can install it with `pip install google-api-python-client`." + ) from e + + from ray.autoscaler._private.gcp.node_provider import GCPNodeProvider + + return GCPNodeProvider + + +def _import_azure(provider_config): + from ray.autoscaler._private._azure.node_provider import AzureNodeProvider + + return AzureNodeProvider + + +def _import_vsphere(provider_config): + from ray.autoscaler._private.vsphere.node_provider import VsphereNodeProvider + + return VsphereNodeProvider + + +def _import_local(provider_config): + if "coordinator_address" in provider_config: + from ray.autoscaler._private.local.coordinator_node_provider import ( + CoordinatorSenderNodeProvider, + ) + + return CoordinatorSenderNodeProvider + else: + from ray.autoscaler._private.local.node_provider import LocalNodeProvider + + return LocalNodeProvider + + +def _import_readonly(provider_config): + from ray.autoscaler._private.readonly.node_provider import ReadOnlyNodeProvider + + return ReadOnlyNodeProvider + + +def _import_fake_multinode(provider_config): + from 
ray.autoscaler._private.fake_multi_node.node_provider import ( + FakeMultiNodeProvider, + ) + + return FakeMultiNodeProvider + + +def _import_fake_multinode_docker(provider_config): + from ray.autoscaler._private.fake_multi_node.node_provider import ( + FakeMultiNodeDockerProvider, + ) + + return FakeMultiNodeDockerProvider + + +def _import_kubernetes(provider_config): + from ray.autoscaler._private._kubernetes.node_provider import KubernetesNodeProvider + + return KubernetesNodeProvider + + +def _import_kuberay(provider_config): + from ray.autoscaler._private.kuberay.node_provider import KubeRayNodeProvider + + return KubeRayNodeProvider + + +def _import_aliyun(provider_config): + from ray.autoscaler._private.aliyun.node_provider import AliyunNodeProvider + + return AliyunNodeProvider + + +def _import_spark(provider_config): + from ray.autoscaler._private.spark.node_provider import SparkNodeProvider + + return SparkNodeProvider + + +def _load_fake_multinode_defaults_config(): + import ray.autoscaler._private.fake_multi_node as ray_fake_multinode + + return os.path.join(os.path.dirname(ray_fake_multinode.__file__), "example.yaml") + + +def _load_read_only_defaults_config(): + import ray.autoscaler._private.readonly as ray_readonly + + return os.path.join(os.path.dirname(ray_readonly.__file__), "example.yaml") + + +def _load_fake_multinode_docker_defaults_config(): + import ray.autoscaler._private.fake_multi_node as ray_fake_multinode + + return os.path.join( + os.path.dirname(ray_fake_multinode.__file__), "example_docker.yaml" + ) + + +def _load_local_defaults_config(): + import ray.autoscaler.local as ray_local + + return os.path.join(os.path.dirname(ray_local.__file__), "defaults.yaml") + + +def _load_kubernetes_defaults_config(): + import ray.autoscaler.kubernetes as ray_kubernetes + + return os.path.join(os.path.dirname(ray_kubernetes.__file__), "defaults.yaml") + + +def _load_aws_defaults_config(): + import ray.autoscaler.aws as ray_aws + + return 
os.path.join(os.path.dirname(ray_aws.__file__), "defaults.yaml") + + +def _load_vsphere_defaults_config(): + import ray.autoscaler.vsphere as ray_vsphere + + return os.path.join(os.path.dirname(ray_vsphere.__file__), "defaults.yaml") + + +def _load_gcp_defaults_config(): + import ray.autoscaler.gcp as ray_gcp + + return os.path.join(os.path.dirname(ray_gcp.__file__), "defaults.yaml") + + +def _load_azure_defaults_config(): + import ray.autoscaler.azure as ray_azure + + return os.path.join(os.path.dirname(ray_azure.__file__), "defaults.yaml") + + +def _load_aliyun_defaults_config(): + import ray.autoscaler.aliyun as ray_aliyun + + return os.path.join(os.path.dirname(ray_aliyun.__file__), "defaults.yaml") + + +def _import_external(provider_config): + provider_cls = load_function_or_class(path=provider_config["module"]) + return provider_cls + + +_NODE_PROVIDERS = { + "local": _import_local, + "fake_multinode": _import_fake_multinode, + "fake_multinode_docker": _import_fake_multinode_docker, + "readonly": _import_readonly, + "aws": _import_aws, + "gcp": _import_gcp, + "vsphere": _import_vsphere, + "azure": _import_azure, + "kubernetes": _import_kubernetes, + "kuberay": _import_kuberay, + "aliyun": _import_aliyun, + "external": _import_external, # Import an external module + "spark": _import_spark, +} + +_PROVIDER_PRETTY_NAMES = { + "readonly": "Readonly (Manual Cluster Setup)", + "fake_multinode": "Fake Multinode", + "fake_multinode_docker": "Fake Multinode Docker", + "local": "Local", + "aws": "AWS", + "gcp": "GCP", + "azure": "Azure", + "kubernetes": "Kubernetes", + "kuberay": "KubeRay", + "aliyun": "Aliyun", + "external": "External", + "vsphere": "vSphere", +} + +_DEFAULT_CONFIGS = { + "fake_multinode": _load_fake_multinode_defaults_config, + "fake_multinode_docker": _load_fake_multinode_docker_defaults_config, + "local": _load_local_defaults_config, + "aws": _load_aws_defaults_config, + "gcp": _load_gcp_defaults_config, + "azure": _load_azure_defaults_config, + 
"aliyun": _load_aliyun_defaults_config, + "kubernetes": _load_kubernetes_defaults_config, + "vsphere": _load_vsphere_defaults_config, + "readonly": _load_read_only_defaults_config, +} + + +def _get_node_provider_cls(provider_config: Dict[str, Any]): + """Get the node provider class for a given provider config. + + Note that this may be used by private node providers that proxy methods to + built-in node providers, so we should maintain backwards compatibility. + + Args: + provider_config: provider section of the autoscaler config. + + Returns: + NodeProvider class + """ + importer = _NODE_PROVIDERS.get(provider_config["type"]) + if importer is None: + raise NotImplementedError( + "Unsupported node provider: {}".format(provider_config["type"]) + ) + return importer(provider_config) + + +def _get_node_provider( + provider_config: Dict[str, Any], cluster_name: str, use_cache: bool = True +) -> Any: + """Get the instantiated node provider for a given provider config. + + Note that this may be used by private node providers that proxy methods to + built-in node providers, so we should maintain backwards compatibility. + + Args: + provider_config: provider section of the autoscaler config. + cluster_name: cluster name from the autoscaler config. + use_cache: whether or not to use a cached definition if available. If + False, the returned object will also not be stored in the cache. + + Returns: + NodeProvider + """ + provider_key = (json.dumps(provider_config, sort_keys=True), cluster_name) + if use_cache and provider_key in _provider_instances: + return _provider_instances[provider_key] + + provider_cls = _get_node_provider_cls(provider_config) + new_provider = provider_cls(provider_config, cluster_name) + + if use_cache: + _provider_instances[provider_key] = new_provider + + return new_provider + + +def _clear_provider_cache(): + global _provider_instances + _provider_instances = {} + + +def _get_default_config(provider_config): + """Retrieve a node provider. 
+ + This is an INTERNAL API. It is not allowed to call this from any Ray + package outside the autoscaler. + """ + if provider_config["type"] == "external": + return copy.deepcopy(MINIMAL_EXTERNAL_CONFIG) + load_config = _DEFAULT_CONFIGS.get(provider_config["type"]) + if load_config is None: + raise NotImplementedError( + "Unsupported node provider: {}".format(provider_config["type"]) + ) + path_to_default = load_config() + with open(path_to_default) as f: + defaults = yaml.safe_load(f) + + return defaults diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..1db20be69c62244cd27b84a178131ea1f339e1bb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/resource_demand_scheduler.py @@ -0,0 +1,1009 @@ +"""Implements multi-node-type autoscaling. + +This file implements an autoscaling algorithm that is aware of multiple node +types (e.g., example-multi-node-type.yaml). The Ray autoscaler will pass in +a vector of resource shape demands, and the resource demand scheduler will +return a list of node types that can satisfy the demands given constraints +(i.e., reverse bin packing). 
+""" + +import collections +import copy +import logging +import os +from abc import abstractmethod +from functools import partial +from typing import Callable, Dict, List, Optional, Tuple + +import ray +from ray._private.gcs_utils import PlacementGroupTableData +from ray.autoscaler._private.constants import ( + AUTOSCALER_CONSERVE_GPU_NODES, + AUTOSCALER_UTILIZATION_SCORER_KEY, +) +from ray.autoscaler._private.loader import load_function_or_class +from ray.autoscaler._private.node_provider_availability_tracker import ( + NodeAvailabilitySummary, +) +from ray.autoscaler._private.util import ( + NodeID, + NodeIP, + NodeType, + NodeTypeConfigDict, + ResourceDict, + is_placement_group_resource, +) +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + NODE_KIND_UNMANAGED, + NODE_KIND_WORKER, + TAG_RAY_NODE_KIND, + TAG_RAY_USER_NODE_TYPE, +) +from ray.core.generated.common_pb2 import PlacementStrategy + +logger = logging.getLogger(__name__) + +# The minimum number of nodes to launch concurrently. +UPSCALING_INITIAL_NUM_NODES = 5 + +NodeResources = ResourceDict +ResourceDemands = List[ResourceDict] + + +class UtilizationScore: + """This fancy class just defines the `UtilizationScore` protocol to be + some type that is a "totally ordered set" (i.e. things that can be sorted). + + What we're really trying to express is + + ``` + UtilizationScore = TypeVar("UtilizationScore", bound=Comparable["UtilizationScore"]) + ``` + + but Comparable isn't a real type and, and a bound with a type argument + can't be enforced (f-bounded polymorphism with contravariance). See Guido's + comment for more details: https://github.com/python/typing/issues/59. + + This isn't just a `float`. In the case of the default scorer, it's a + `Tuple[float, float]` which is quite difficult to map to a single number. 
+ + """ + + @abstractmethod + def __eq__(self, other: "UtilizationScore") -> bool: + pass + + @abstractmethod + def __lt__(self: "UtilizationScore", other: "UtilizationScore") -> bool: + pass + + def __gt__(self: "UtilizationScore", other: "UtilizationScore") -> bool: + return (not self < other) and self != other + + def __le__(self: "UtilizationScore", other: "UtilizationScore") -> bool: + return self < other or self == other + + def __ge__(self: "UtilizationScore", other: "UtilizationScore") -> bool: + return not self < other + + +class UtilizationScorer: + def __call__( + node_resources: NodeResources, + resource_demands: ResourceDemands, + *, + node_availability_summary: NodeAvailabilitySummary, + ) -> Optional[UtilizationScore]: + pass + + +class ResourceDemandScheduler: + def __init__( + self, + provider: NodeProvider, + node_types: Dict[NodeType, NodeTypeConfigDict], + max_workers: int, + head_node_type: NodeType, + upscaling_speed: float, + ) -> None: + self.provider = provider + self.node_types = copy.deepcopy(node_types) + self.node_resource_updated = set() + self.max_workers = max_workers + self.head_node_type = head_node_type + self.upscaling_speed = upscaling_speed + + utilization_scorer_str = os.environ.get( + AUTOSCALER_UTILIZATION_SCORER_KEY, + "ray.autoscaler._private.resource_demand_scheduler" + "._default_utilization_scorer", + ) + self.utilization_scorer: UtilizationScorer = load_function_or_class( + utilization_scorer_str + ) + + def _get_head_and_workers(self, nodes: List[NodeID]) -> Tuple[NodeID, List[NodeID]]: + """Returns the head node's id and the list of all worker node ids, + given a list `nodes` of all node ids in the cluster. 
+ """ + head_id, worker_ids = None, [] + for node in nodes: + tags = self.provider.node_tags(node) + if tags[TAG_RAY_NODE_KIND] == NODE_KIND_HEAD: + head_id = node + elif tags[TAG_RAY_NODE_KIND] == NODE_KIND_WORKER: + worker_ids.append(node) + return head_id, worker_ids + + def reset_config( + self, + provider: NodeProvider, + node_types: Dict[NodeType, NodeTypeConfigDict], + max_workers: int, + head_node_type: NodeType, + upscaling_speed: float = 1, + ) -> None: + """Updates the class state variables. + + For legacy yamls, it merges previous state and new state to make sure + inferered resources are not lost. + """ + self.provider = provider + self.node_types = copy.deepcopy(node_types) + self.node_resource_updated = set() + self.max_workers = max_workers + self.head_node_type = head_node_type + self.upscaling_speed = upscaling_speed + + def is_feasible(self, bundle: ResourceDict) -> bool: + for node_type, config in self.node_types.items(): + max_of_type = config.get("max_workers", 0) + node_resources = config["resources"] + if (node_type == self.head_node_type or max_of_type > 0) and _fits( + node_resources, bundle + ): + return True + return False + + def get_nodes_to_launch( + self, + nodes: List[NodeID], + launching_nodes: Dict[NodeType, int], + resource_demands: List[ResourceDict], + unused_resources_by_ip: Dict[NodeIP, ResourceDict], + pending_placement_groups: List[PlacementGroupTableData], + max_resources_by_ip: Dict[NodeIP, ResourceDict], + ensure_min_cluster_size: List[ResourceDict], + node_availability_summary: NodeAvailabilitySummary, + ) -> (Dict[NodeType, int], List[ResourceDict]): + """Given resource demands, return node types to add to the cluster. + + This method: + (1) calculates the resources present in the cluster. + (2) calculates the remaining nodes to add to respect min_workers + constraint per node type. + (3) for each strict spread placement group, reserve space on + available nodes and launch new nodes if necessary. 
+ (4) calculates the unfulfilled resource bundles. + (5) calculates which nodes need to be launched to fulfill all + the bundle requests, subject to max_worker constraints. + + Args: + nodes: List of existing nodes in the cluster. + launching_nodes: Summary of node types currently being launched. + resource_demands: Vector of resource demands from the scheduler. + unused_resources_by_ip: Mapping from ip to available resources. + pending_placement_groups: Placement group demands. + max_resources_by_ip: Mapping from ip to static node resources. + ensure_min_cluster_size: Try to ensure the cluster can fit at least + this set of resources. This differs from resources_demands in + that we don't take into account existing usage. + + node_availability_summary: A snapshot of the current + NodeAvailabilitySummary. + + Returns: + Dict of count to add for each node type, and residual of resources + that still cannot be fulfilled. + """ + utilization_scorer = partial( + self.utilization_scorer, node_availability_summary=node_availability_summary + ) + self._update_node_resources_from_runtime(nodes, max_resources_by_ip) + + node_resources: List[ResourceDict] + node_type_counts: Dict[NodeType, int] + node_resources, node_type_counts = self.calculate_node_resources( + nodes, launching_nodes, unused_resources_by_ip + ) + + logger.debug("Cluster resources: {}".format(node_resources)) + logger.debug("Node counts: {}".format(node_type_counts)) + # Step 2: add nodes to add to satisfy min_workers for each type + ( + node_resources, + node_type_counts, + adjusted_min_workers, + ) = _add_min_workers_nodes( + node_resources, + node_type_counts, + self.node_types, + self.max_workers, + self.head_node_type, + ensure_min_cluster_size, + utilization_scorer=utilization_scorer, + ) + + # Step 3: get resource demands of placement groups and return the + # groups that should be strictly spread. 
+ logger.debug(f"Placement group demands: {pending_placement_groups}") + # TODO(Clark): Refactor placement group bundle demands such that their placement + # group provenance is mantained, since we need to keep an accounting of the + # cumulative CPU cores allocated as fulfilled during bin packing in order to + # ensure that a placement group's cumulative allocation is under the placement + # group's max CPU fraction per node. Without this, and placement group with many + # bundles might not be schedulable, but will fail to trigger scale-up since the + # max CPU fraction is properly applied to the cumulative bundle requests for a + # single node. + # + # placement_group_demand_vector: List[Tuple[List[ResourceDict], double]] + # + # bin_pack_residual() can keep it's packing priority; we just need to account + # for (1) the running CPU allocation for the bundle's placement group for that + # particular node, and (2) the max CPU cores allocatable for a single placement + # group for that particular node. + ( + placement_group_demand_vector, + strict_spreads, + ) = placement_groups_to_resource_demands(pending_placement_groups) + # Place placement groups demand vector at the beginning of the resource + # demands vector to make it consistent (results in the same types of + # nodes to add) with pg_demands_nodes_max_launch_limit calculated later + resource_demands = placement_group_demand_vector + resource_demands + + ( + spread_pg_nodes_to_add, + node_resources, + node_type_counts, + ) = self.reserve_and_allocate_spread( + strict_spreads, + node_resources, + node_type_counts, + utilization_scorer, + ) + + # Calculate the nodes to add for bypassing max launch limit for + # placement groups and spreads. + unfulfilled_placement_groups_demands, _ = get_bin_pack_residual( + node_resources, + placement_group_demand_vector, + ) + # Add 1 to account for the head node. 
+ max_to_add = self.max_workers + 1 - sum(node_type_counts.values()) + pg_demands_nodes_max_launch_limit, _ = get_nodes_for( + self.node_types, + node_type_counts, + self.head_node_type, + max_to_add, + unfulfilled_placement_groups_demands, + utilization_scorer=utilization_scorer, + ) + placement_groups_nodes_max_limit = { + node_type: spread_pg_nodes_to_add.get(node_type, 0) + + pg_demands_nodes_max_launch_limit.get(node_type, 0) + for node_type in self.node_types + } + + # Step 4/5: add nodes for pending tasks, actors, and non-strict spread + # groups + unfulfilled, _ = get_bin_pack_residual(node_resources, resource_demands) + logger.debug("Resource demands: {}".format(resource_demands)) + logger.debug("Unfulfilled demands: {}".format(unfulfilled)) + nodes_to_add_based_on_demand, final_unfulfilled = get_nodes_for( + self.node_types, + node_type_counts, + self.head_node_type, + max_to_add, + unfulfilled, + utilization_scorer=utilization_scorer, + ) + logger.debug("Final unfulfilled: {}".format(final_unfulfilled)) + # Merge nodes to add based on demand and nodes to add based on + # min_workers constraint. We add them because nodes to add based on + # demand was calculated after the min_workers constraint was respected. 
+ total_nodes_to_add = {} + + for node_type in self.node_types: + nodes_to_add = ( + adjusted_min_workers.get(node_type, 0) + + spread_pg_nodes_to_add.get(node_type, 0) + + nodes_to_add_based_on_demand.get(node_type, 0) + ) + if nodes_to_add > 0: + total_nodes_to_add[node_type] = nodes_to_add + + # Limit the number of concurrent launches + total_nodes_to_add = self._get_concurrent_resource_demand_to_launch( + total_nodes_to_add, + unused_resources_by_ip.keys(), + nodes, + launching_nodes, + adjusted_min_workers, + placement_groups_nodes_max_limit, + ) + + logger.debug("Node requests: {}".format(total_nodes_to_add)) + return total_nodes_to_add, final_unfulfilled + + def _update_node_resources_from_runtime( + self, nodes: List[NodeID], max_resources_by_ip: Dict[NodeIP, ResourceDict] + ): + """Update static node type resources with runtime resources + + This will update the cached static node type resources with the runtime + resources. Because we can not know the exact autofilled memory or + object_store_memory from config file. 
+ """ + need_update = len(self.node_types) != len(self.node_resource_updated) + + if not need_update: + return + for node_id in nodes: + tags = self.provider.node_tags(node_id) + + if TAG_RAY_USER_NODE_TYPE not in tags: + continue + + node_type = tags[TAG_RAY_USER_NODE_TYPE] + if ( + node_type in self.node_resource_updated + or node_type not in self.node_types + ): + # continue if the node type has been updated or is not an known + # node type + continue + ip = self.provider.internal_ip(node_id) + runtime_resources = max_resources_by_ip.get(ip) + if runtime_resources: + runtime_resources = copy.deepcopy(runtime_resources) + resources = self.node_types[node_type].get("resources", {}) + for key in ["CPU", "GPU", "memory", "object_store_memory"]: + if key in runtime_resources: + resources[key] = runtime_resources[key] + self.node_types[node_type]["resources"] = resources + + node_kind = tags[TAG_RAY_NODE_KIND] + if node_kind == NODE_KIND_WORKER: + # Here, we do not record the resources have been updated + # if it is the head node kind. Because it need be updated + # by worker kind runtime resource. The most difference + # between head and worker is the memory resources. The head + # node needs to configure redis memory which is not needed + # for worker nodes. + self.node_resource_updated.add(node_type) + + def _get_concurrent_resource_demand_to_launch( + self, + to_launch: Dict[NodeType, int], + connected_nodes: List[NodeIP], + non_terminated_nodes: List[NodeID], + pending_launches_nodes: Dict[NodeType, int], + adjusted_min_workers: Dict[NodeType, int], + placement_group_nodes: Dict[NodeType, int], + ) -> Dict[NodeType, int]: + """Updates the max concurrent resources to launch for each node type. + + Given the current nodes that should be launched, the non terminated + nodes (running and pending) and the pending to be launched nodes. 
This + method calculates the maximum number of nodes to launch concurrently + for each node type as follows: + 1) Calculates the running nodes. + 2) Calculates the pending nodes and gets the launching nodes. + 3) Limits the total number of pending + currently-launching + + to-be-launched nodes to: + max( + 5, + self.upscaling_speed * max(running_nodes[node_type], 1) + ). + + Args: + to_launch: List of number of nodes to launch based on resource + demand for every node type. + connected_nodes: Running nodes (from LoadMetrics). + non_terminated_nodes: Non terminated nodes (pending/running). + pending_launches_nodes: Nodes that are in the launch queue. + adjusted_min_workers: Nodes to launch to satisfy + min_workers and request_resources(). This overrides the launch + limits since the user is hinting to immediately scale up to + this size. + placement_group_nodes: Nodes to launch for placement groups. + This overrides the launch concurrency limits. + Returns: + Dict[NodeType, int]: Maximum number of nodes to launch for each + node type. + """ + updated_nodes_to_launch = {} + running_nodes, pending_nodes = self._separate_running_and_pending_nodes( + non_terminated_nodes, + connected_nodes, + ) + for node_type in to_launch: + # Enforce here max allowed pending nodes to be frac of total + # running nodes. + max_allowed_pending_nodes = max( + UPSCALING_INITIAL_NUM_NODES, + int(self.upscaling_speed * max(running_nodes[node_type], 1)), + ) + total_pending_nodes = ( + pending_launches_nodes.get(node_type, 0) + pending_nodes[node_type] + ) + + upper_bound = max( + max_allowed_pending_nodes - total_pending_nodes, + # Allow more nodes if this is to respect min_workers or + # request_resources() or placement groups. 
+ adjusted_min_workers.get(node_type, 0) + + placement_group_nodes.get(node_type, 0), + ) + + if upper_bound > 0: + updated_nodes_to_launch[node_type] = min( + upper_bound, to_launch[node_type] + ) + + return updated_nodes_to_launch + + def _separate_running_and_pending_nodes( + self, + non_terminated_nodes: List[NodeID], + connected_nodes: List[NodeIP], + ) -> (Dict[NodeType, int], Dict[NodeType, int]): + """Splits connected and non terminated nodes to pending & running.""" + + running_nodes = collections.defaultdict(int) + pending_nodes = collections.defaultdict(int) + for node_id in non_terminated_nodes: + tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in tags: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + node_ip = self.provider.internal_ip(node_id) + if node_ip in connected_nodes: + running_nodes[node_type] += 1 + else: + pending_nodes[node_type] += 1 + return running_nodes, pending_nodes + + def calculate_node_resources( + self, + nodes: List[NodeID], + pending_nodes: Dict[NodeID, int], + unused_resources_by_ip: Dict[str, ResourceDict], + ) -> (List[ResourceDict], Dict[NodeType, int]): + """Returns node resource list and node type counts. + + Counts the running nodes, pending nodes. + Args: + nodes: Existing nodes. + pending_nodes: Pending nodes. + Returns: + node_resources: a list of running + pending resources. + E.g., [{"CPU": 4}, {"GPU": 2}]. + node_type_counts: running + pending workers per node type. + """ + + node_resources = [] + node_type_counts = collections.defaultdict(int) + + def add_node(node_type, available_resources=None): + if node_type not in self.node_types: + # We should not get here, but if for some reason we do, log an + # error and skip the errant node_type. + logger.error( + f"Missing entry for node_type {node_type} in " + f"cluster config: {self.node_types} under entry " + "available_node_types. This node's resources will be " + "ignored. 
If you are using an unmanaged node, manually " + f"set the {TAG_RAY_NODE_KIND} tag to " + f'"{NODE_KIND_UNMANAGED}" in your cloud provider\'s ' + "management console." + ) + return None + # Careful not to include the same dict object multiple times. + available = copy.deepcopy(self.node_types[node_type]["resources"]) + # If available_resources is None this might be because the node is + # no longer pending, but the raylet hasn't sent a heartbeat to gcs + # yet. + if available_resources is not None: + available = copy.deepcopy(available_resources) + + node_resources.append(available) + node_type_counts[node_type] += 1 + + for node_id in nodes: + tags = self.provider.node_tags(node_id) + if TAG_RAY_USER_NODE_TYPE in tags: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + ip = self.provider.internal_ip(node_id) + available_resources = unused_resources_by_ip.get(ip) + add_node(node_type, available_resources) + + for node_type, count in pending_nodes.items(): + for _ in range(count): + add_node(node_type) + + return node_resources, node_type_counts + + def reserve_and_allocate_spread( + self, + strict_spreads: List[List[ResourceDict]], + node_resources: List[ResourceDict], + node_type_counts: Dict[NodeType, int], + utilization_scorer: Callable[ + [NodeResources, ResourceDemands], Optional[UtilizationScore] + ], + ): + """For each strict spread, attempt to reserve as much space as possible + on the node, then allocate new nodes for the unfulfilled portion. + + Args: + strict_spreads (List[List[ResourceDict]]): A list of placement + groups which must be spread out. + node_resources (List[ResourceDict]): Available node resources in + the cluster. + node_type_counts (Dict[NodeType, int]): The amount of each type of + node pending or in the cluster. + utilization_scorer: A function that, given a node + type, its resources, and resource demands, returns what its + utilization would be. + + Returns: + Dict[NodeType, int]: Nodes to add. 
+ List[ResourceDict]: The updated node_resources after the method. + Dict[NodeType, int]: The updated node_type_counts. + + """ + to_add = collections.defaultdict(int) + for bundles in strict_spreads: + # Try to pack as many bundles of this group as possible on existing + # nodes. The remaining will be allocated on new nodes. + unfulfilled, node_resources = get_bin_pack_residual( + node_resources, bundles, strict_spread=True + ) + max_to_add = self.max_workers + 1 - sum(node_type_counts.values()) + # Allocate new nodes for the remaining bundles that don't fit. + to_launch, _ = get_nodes_for( + self.node_types, + node_type_counts, + self.head_node_type, + max_to_add, + unfulfilled, + utilization_scorer=utilization_scorer, + strict_spread=True, + ) + _inplace_add(node_type_counts, to_launch) + _inplace_add(to_add, to_launch) + new_node_resources = _node_type_counts_to_node_resources( + self.node_types, to_launch + ) + # Update node resources to include newly launched nodes and their + # bundles. 
+ unfulfilled, including_reserved = get_bin_pack_residual( + new_node_resources, unfulfilled, strict_spread=True + ) + assert not unfulfilled + node_resources += including_reserved + return to_add, node_resources, node_type_counts + + def debug_string( + self, + nodes: List[NodeID], + pending_nodes: Dict[NodeID, int], + unused_resources_by_ip: Dict[str, ResourceDict], + ) -> str: + node_resources, node_type_counts = self.calculate_node_resources( + nodes, pending_nodes, unused_resources_by_ip + ) + + out = "Worker node types:" + for node_type, count in node_type_counts.items(): + out += "\n - {}: {}".format(node_type, count) + if pending_nodes.get(node_type): + out += " ({} pending)".format(pending_nodes[node_type]) + + return out + + +def _node_type_counts_to_node_resources( + node_types: Dict[NodeType, NodeTypeConfigDict], + node_type_counts: Dict[NodeType, int], +) -> List[ResourceDict]: + """Converts a node_type_counts dict into a list of node_resources.""" + resources = [] + for node_type, count in node_type_counts.items(): + # Be careful, each entry in the list must be deep copied! + resources += [node_types[node_type]["resources"].copy() for _ in range(count)] + return resources + + +def _add_min_workers_nodes( + node_resources: List[ResourceDict], + node_type_counts: Dict[NodeType, int], + node_types: Dict[NodeType, NodeTypeConfigDict], + max_workers: int, + head_node_type: NodeType, + ensure_min_cluster_size: List[ResourceDict], + utilization_scorer: Callable[ + [NodeResources, ResourceDemands, str], Optional[UtilizationScore] + ], +) -> (List[ResourceDict], Dict[NodeType, int], Dict[NodeType, int]): + """Updates resource demands to respect the min_workers and + request_resources() constraints. + + Args: + node_resources: Resources of exisiting nodes already launched/pending. + node_type_counts: Counts of existing nodes already launched/pending. + node_types: Node types config. + max_workers: global max_workers constaint. 
+ ensure_min_cluster_size: resource demands from request_resources(). + utilization_scorer: A function that, given a node + type, its resources, and resource demands, returns what its + utilization would be. + + Returns: + node_resources: The updated node resources after adding min_workers + and request_resources() constraints per node type. + node_type_counts: The updated node counts after adding min_workers + and request_resources() constraints per node type. + total_nodes_to_add_dict: The nodes to add to respect min_workers and + request_resources() constraints. + """ + total_nodes_to_add_dict = {} + for node_type, config in node_types.items(): + existing = node_type_counts.get(node_type, 0) + target = min(config.get("min_workers", 0), config.get("max_workers", 0)) + if node_type == head_node_type: + # Add 1 to account for head node. + target = target + 1 + if existing < target: + total_nodes_to_add_dict[node_type] = target - existing + node_type_counts[node_type] = target + node_resources.extend( + [ + copy.deepcopy(node_types[node_type]["resources"]) + for _ in range(total_nodes_to_add_dict[node_type]) + ] + ) + + if ensure_min_cluster_size: + max_to_add = max_workers + 1 - sum(node_type_counts.values()) + max_node_resources = [] + # Fit request_resources() on all the resources as if they are idle. + for node_type in node_type_counts: + max_node_resources.extend( + [ + copy.deepcopy(node_types[node_type]["resources"]) + for _ in range(node_type_counts[node_type]) + ] + ) + # Get the unfulfilled to ensure min cluster size. + resource_requests_unfulfilled, _ = get_bin_pack_residual( + max_node_resources, ensure_min_cluster_size + ) + # Get the nodes to meet the unfulfilled. + nodes_to_add_request_resources, _ = get_nodes_for( + node_types, + node_type_counts, + head_node_type, + max_to_add, + resource_requests_unfulfilled, + utilization_scorer=utilization_scorer, + ) + # Update the resources, counts and total nodes to add. 
+ for node_type in nodes_to_add_request_resources: + nodes_to_add = nodes_to_add_request_resources.get(node_type, 0) + if nodes_to_add > 0: + node_type_counts[node_type] = nodes_to_add + node_type_counts.get( + node_type, 0 + ) + node_resources.extend( + [ + copy.deepcopy(node_types[node_type]["resources"]) + for _ in range(nodes_to_add) + ] + ) + total_nodes_to_add_dict[ + node_type + ] = nodes_to_add + total_nodes_to_add_dict.get(node_type, 0) + return node_resources, node_type_counts, total_nodes_to_add_dict + + +def get_nodes_for( + node_types: Dict[NodeType, NodeTypeConfigDict], + existing_nodes: Dict[NodeType, int], + head_node_type: NodeType, + max_to_add: int, + resources: List[ResourceDict], + utilization_scorer: Callable[ + [NodeResources, ResourceDemands, str], Optional[UtilizationScore] + ], + strict_spread: bool = False, +) -> (Dict[NodeType, int], List[ResourceDict]): + """Determine nodes to add given resource demands and constraints. + + Args: + node_types: node types config. + existing_nodes: counts of existing nodes already launched. + This sets constraints on the number of new nodes to add. + max_to_add: global constraint on nodes to add. + resources: resource demands to fulfill. + strict_spread: If true, each element in `resources` must be placed on a + different node. + utilization_scorer: A function that, given a node + type, its resources, and resource demands, returns what its + utilization would be. + + Returns: + Dict of count to add for each node type, and residual of resources + that still cannot be fulfilled. + """ + nodes_to_add: Dict[NodeType, int] = collections.defaultdict(int) + + while resources and sum(nodes_to_add.values()) < max_to_add: + utilization_scores = [] + for node_type in node_types: + max_workers_of_node_type = node_types[node_type].get("max_workers", 0) + if head_node_type == node_type: + # Add 1 to account for head node. 
+ max_workers_of_node_type = max_workers_of_node_type + 1 + if ( + existing_nodes.get(node_type, 0) + nodes_to_add.get(node_type, 0) + >= max_workers_of_node_type + ): + continue + node_resources = node_types[node_type]["resources"] + if strict_spread: + # If handling strict spread, only one bundle can be placed on + # the node. + score = utilization_scorer(node_resources, [resources[0]], node_type) + else: + score = utilization_scorer(node_resources, resources, node_type) + if score is not None: + utilization_scores.append((score, node_type)) + + # Give up, no feasible node. + if not utilization_scores: + if not any( + is_placement_group_resource(resource) + for resources_dict in resources + for resource in resources_dict + ): + logger.warning( + f"The autoscaler could not find a node type to satisfy the " + f"request: {resources}. Please specify a node type with the " + f"necessary resources." + ) + break + + utilization_scores = sorted(utilization_scores, reverse=True) + best_node_type = utilization_scores[0][1] + nodes_to_add[best_node_type] += 1 + if strict_spread: + resources = resources[1:] + else: + allocated_resource = node_types[best_node_type]["resources"] + residual, _ = get_bin_pack_residual([allocated_resource], resources) + assert len(residual) < len(resources), (resources, residual) + resources = residual + + return nodes_to_add, resources + + +def _resource_based_utilization_scorer( + node_resources: ResourceDict, + resources: List[ResourceDict], + *, + node_availability_summary: NodeAvailabilitySummary, +) -> Optional[Tuple[bool, int, float, float]]: + remaining = copy.deepcopy(node_resources) + fittable = [] + resource_types = set() + for r in resources: + for k, v in r.items(): + if v > 0: + resource_types.add(k) + if _fits(remaining, r): + fittable.append(r) + _inplace_subtract(remaining, r) + if not fittable: + return None + + util_by_resources = [] + num_matching_resource_types = 0 + for k, v in node_resources.items(): + # Don't divide by 
zero. + if v < 1: + # Could test v == 0 on the nose, but v < 1 feels safer. + # (Note that node resources are integers.) + continue + if k in resource_types: + num_matching_resource_types += 1 + util = (v - remaining[k]) / v + util_by_resources.append(v * (util**3)) + + # Could happen if node_resources has only zero values. + if not util_by_resources: + return None + + # Prefer not to launch a GPU node if there aren't any GPU requirements in the + # resource bundle. + gpu_ok = True + if AUTOSCALER_CONSERVE_GPU_NODES: + is_gpu_node = "GPU" in node_resources and node_resources["GPU"] > 0 + any_gpu_task = any("GPU" in r for r in resources) + if is_gpu_node and not any_gpu_task: + gpu_ok = False + + # Prioritize avoiding gpu nodes for non-gpu workloads first, + # then prioritize matching multiple resource types, + # then prioritize using all resources, + # then prioritize overall balance of multiple resources. + return ( + gpu_ok, + num_matching_resource_types, + min(util_by_resources), + # util_by_resources should be non empty + float(sum(util_by_resources)) / len(util_by_resources), + ) + + +def _default_utilization_scorer( + node_resources: ResourceDict, + resources: List[ResourceDict], + node_type: str, + *, + node_availability_summary: NodeAvailabilitySummary, +): + return _resource_based_utilization_scorer( + node_resources, resources, node_availability_summary=node_availability_summary + ) + + +def get_bin_pack_residual( + node_resources: List[ResourceDict], + resource_demands: List[ResourceDict], + strict_spread: bool = False, +) -> (List[ResourceDict], List[ResourceDict]): + """Return a subset of resource_demands that cannot fit in the cluster. + + TODO(ekl): this currently does not guarantee the resources will be packed + correctly by the Ray scheduler. This is only possible once the Ray backend + supports a placement groups API. + + Args: + node_resources (List[ResourceDict]): List of resources per node. 
+ resource_demands (List[ResourceDict]): List of resource bundles that + need to be bin packed onto the nodes. + strict_spread: If true, each element in resource_demands must be + placed on a different entry in `node_resources`. + + Returns: + List[ResourceDict]: the residual list resources that do not fit. + List[ResourceDict]: The updated node_resources after the method. + """ + + unfulfilled = [] + + # A most naive bin packing algorithm. + nodes = copy.deepcopy(node_resources) + # List of nodes that cannot be used again due to strict spread. + used = [] + # We order the resource demands in the following way: + # More complex demands first. + # Break ties: heavier demands first. + # Break ties: lexicographically (to ensure stable ordering). + for demand in sorted( + resource_demands, + key=lambda demand: ( + len(demand.values()), + sum(demand.values()), + sorted(demand.items()), + ), + reverse=True, + ): + found = False + node = None + for i in range(len(nodes)): + node = nodes[i] + if _fits(node, demand): + found = True + # In the strict_spread case, we can't reuse nodes. + if strict_spread: + used.append(node) + del nodes[i] + break + if found and node: + _inplace_subtract(node, demand) + else: + unfulfilled.append(demand) + + return unfulfilled, nodes + used + + +def _fits(node: ResourceDict, resources: ResourceDict) -> bool: + for k, v in resources.items(): + # TODO(jjyao): Change ResourceDict to a class so we can + # hide the implicit resource handling. + if v > node.get( + k, 1.0 if k.startswith(ray._raylet.IMPLICIT_RESOURCE_PREFIX) else 0.0 + ): + return False + return True + + +def _inplace_subtract(node: ResourceDict, resources: ResourceDict) -> None: + for k, v in resources.items(): + if v == 0: + # This is an edge case since some reasonable programs/computers can + # do `ray.autoscaler.sdk.request_resources({"GPU": 0}"})`. 
+ continue + if k not in node: + assert k.startswith(ray._raylet.IMPLICIT_RESOURCE_PREFIX), (k, node) + node[k] = 1 + assert k in node, (k, node) + node[k] -= v + assert node[k] >= 0.0, (node, k, v) + + +def _inplace_add(a: collections.defaultdict, b: Dict) -> None: + """Generically adds values in `b` to `a`. + a[k] should be defined for all k in b.keys()""" + for k, v in b.items(): + a[k] += v + + +def placement_groups_to_resource_demands( + pending_placement_groups: List[PlacementGroupTableData], +): + """Preprocess placement group requests into regular resource demand vectors + when possible. The policy is: + * STRICT_PACK - Convert to a single bundle. + * PACK - Flatten into a resource demand vector. + * STRICT_SPREAD - Cannot be converted. + * SPREAD - Flatten into a resource demand vector. + + Args: + pending_placement_groups (List[PlacementGroupData]): List of + PlacementGroupLoad's. + + Returns: + List[ResourceDict]: The placement groups which were converted to a + resource demand vector. + List[List[ResourceDict]]: The placement groups which should be strictly + spread. + """ + resource_demand_vector = [] + unconverted = [] + for placement_group in pending_placement_groups: + shapes = [dict(bundle.unit_resources) for bundle in placement_group.bundles] + if ( + placement_group.strategy == PlacementStrategy.PACK + or placement_group.strategy == PlacementStrategy.SPREAD + ): + resource_demand_vector.extend(shapes) + elif placement_group.strategy == PlacementStrategy.STRICT_PACK: + combined = collections.defaultdict(float) + for shape in shapes: + for label, quantity in shape.items(): + combined[label] += quantity + resource_demand_vector.append(combined) + elif placement_group.strategy == PlacementStrategy.STRICT_SPREAD: + unconverted.append(shapes) + else: + logger.error( + f"Unknown placement group request type: {placement_group}. " + f"Please file a bug report " + f"https://github.com/ray-project/ray/issues/new." 
# How long to keep retrying while sshd refuses connections before giving up.
CONN_REFUSED_PATIENCE = 30  # how long to wait for sshd to run

# Module-level output settings, managed via the setters below.
_redirect_output = False  # Whether to log command output to a temporary file
_allow_interactive = True  # whether to pass on stdin to running commands.


def is_output_redirected():
    """Return whether subprocess output is being redirected to a temp file."""
    return _redirect_output


def set_output_redirected(val: bool):
    """Choose between logging to a temporary file and to `sys.stdout`.

    The default is to log to a file.

    Args:
        val: If true, subprocess output will be redirected to
            a temporary file.
    """
    global _redirect_output
    _redirect_output = val


def does_allow_interactive():
    """Return whether stdin is passed through to running commands."""
    return _allow_interactive
+ """ + global _allow_interactive + _allow_interactive = val + + +class ProcessRunnerError(Exception): + def __init__(self, msg, msg_type, code=None, command=None, special_case=None): + super(ProcessRunnerError, self).__init__( + "{} (discovered={}): type={}, code={}, command={}".format( + msg, special_case, msg_type, code, command + ) + ) + + self.msg_type = msg_type + self.code = code + self.command = command + + self.special_case = special_case + + +_ssh_output_regexes = { + "known_host_update": re.compile( + r"\s*Warning: Permanently added '.+' \(.+\) " r"to the list of known hosts.\s*" + ), + "connection_closed": re.compile(r"\s*Shared connection to .+ closed.\s*"), + "timeout": re.compile( + r"\s*ssh: connect to host .+ port .+: " r"Operation timed out\s*" + ), + "conn_refused": re.compile( + r"\s*ssh: connect to host .+ port .+: Connection refused\s*" + ) + # todo: check for other connection failures for better error messages? +} + + +def _read_subprocess_stream(f, output_file, is_stdout=False): + """Read and process a subprocess output stream. + + The goal is to find error messages and respond to them in a clever way. + Currently just used for SSH messages (CONN_REFUSED, TIMEOUT, etc.), so + the user does not get confused by these. + + Ran in a thread each for both `stdout` and `stderr` to + allow for cross-platform asynchronous IO. + + Note: `select`-based IO is another option, but Windows has + no support for `select`ing pipes, and Linux support varies somewhat. + Spefically, Older *nix systems might also have quirks in how they + handle `select` on pipes. + + Args: + f: File object for the stream. + output_file: File object to which filtered output is written. + is_stdout (bool): + When `is_stdout` is `False`, the stream is assumed to + be `stderr`. Different error message detectors are used, + and the output is displayed to the user unless it matches + a special case (e.g. SSH timeout), in which case this is + left up to the caller. 
+ """ + + detected_special_case = None + while True: + # ! Readline here is crucial. + # ! Normal `read()` will block until EOF instead of until + # something is available. + line = f.readline() + + if line is None or line == "": + # EOF + break + + if line[-1] == "\n": + line = line[:-1] + + if not is_stdout: + if _ssh_output_regexes["connection_closed"].fullmatch(line) is not None: + # Do not log "connection closed" messages which SSH + # puts in stderr for no reason. + # + # They are never errors since the connection will + # close no matter whether the command succeeds or not. + continue + + if _ssh_output_regexes["timeout"].fullmatch(line) is not None: + # Timeout is not really an error but rather a special + # condition. It should be handled by the caller, since + # network conditions/nodes in the early stages of boot + # are expected to sometimes cause connection timeouts. + if detected_special_case is not None: + raise ValueError( + "Bug: ssh_timeout conflicts with another " + "special codition: " + detected_special_case + ) + + detected_special_case = "ssh_timeout" + continue + + if _ssh_output_regexes["conn_refused"].fullmatch(line) is not None: + # Connection refused is not really an error but + # rather a special condition. It should be handled by + # the caller, since network conditions/nodes in the + # early stages of boot are expected to sometimes cause + # CONN_REFUSED. + if detected_special_case is not None: + raise ValueError( + "Bug: ssh_conn_refused conflicts with another " + "special codition: " + detected_special_case + ) + + detected_special_case = "ssh_conn_refused" + continue + + if _ssh_output_regexes["known_host_update"].fullmatch(line) is not None: + # Since we ignore SSH host control anyway + # (-o UserKnownHostsFile=/dev/null), + # we should silence the host control warnings. 
+ continue + + cli_logger.error(line) + + if output_file is not None and output_file != subprocess.DEVNULL: + output_file.write(line + "\n") + + return detected_special_case + + +def _run_and_process_output( + cmd, + stdout_file, + process_runner=subprocess, + stderr_file=None, + use_login_shells=False, +): + """Run a command and process its output for special cases. + + Calls a standard 'check_call' if process_runner is not subprocess. + + Specifically, run all command output through regex to detect + error conditions and filter out non-error messages that went to stderr + anyway (SSH writes ALL of its "system" messages to stderr even if they + are not actually errors). + + Args: + cmd (List[str]): Command to run. + process_runner: Used for command execution. Assumed to have + 'check_call' and 'check_output' inplemented. + stdout_file: File to redirect stdout to. + stderr_file: File to redirect stderr to. + + Implementation notes: + 1. `use_login_shells` disables special processing + If we run interactive apps, output processing will likely get + overwhelmed with the interactive output elements. + Thus, we disable output processing for login shells. This makes + the logging experience considerably worse, but it only degrades + to old-style logging. + + For example, `pip install` outputs HUNDREDS of progress-bar lines + when downloading a package, and we have to + read + regex + write all of them. + + After all, even just printing output to console can often slow + down a fast-printing app, and we do more than just print, and + all that from Python, which is much slower than C regarding + stream processing. + + 2. `stdin=PIPE` for subprocesses + Do not inherit stdin as it messes with bash signals + (ctrl-C for SIGINT) and these commands aren't supposed to + take input anyway. + + 3. `ThreadPoolExecutor` without the `Pool` + We use `ThreadPoolExecutor` to create futures from threads. + Threads are never reused. 
+ + This approach allows us to have no custom synchronization by + off-loading the return value and exception passing to the + standard library (`ThreadPoolExecutor` internals). + + This instance will be `shutdown()` ASAP so it's fine to + create one in such a weird place. + + The code is thus 100% thread-safe as long as the stream readers + are read-only except for return values and possible exceptions. + """ + stdin_overwrite = subprocess.PIPE + # This already should be validated in a higher place of the stack. + assert not ( + does_allow_interactive() and is_output_redirected() + ), "Cannot redirect output while in interactive mode." + if process_runner != subprocess or ( + does_allow_interactive() and not is_output_redirected() + ): + stdin_overwrite = None + + # See implementation note #1 + + if use_login_shells or process_runner != subprocess: + return process_runner.check_call( + cmd, + # See implementation note #2 + stdin=stdin_overwrite, + stdout=stdout_file, + stderr=stderr_file, + ) + + with subprocess.Popen( + cmd, + # See implementation note #2 + stdin=stdin_overwrite, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=1, # line buffering + universal_newlines=True, # text mode outputs + ) as p: + from concurrent.futures import ThreadPoolExecutor + + # Closing stdin might be necessary to signal EOF to some + # apps (they might get stuck waiting for input forever otherwise). + p.stdin.close() + + # See implementation note #3 + with ThreadPoolExecutor(max_workers=2) as executor: + stdout_future = executor.submit( + _read_subprocess_stream, p.stdout, stdout_file, is_stdout=True + ) + stderr_future = executor.submit( + _read_subprocess_stream, p.stderr, stderr_file, is_stdout=False + ) + # Wait for completion. + executor.shutdown() + + # Update `p.returncode` + p.poll() + + detected_special_case = stdout_future.result() + if stderr_future.result() is not None: + if detected_special_case is not None: + # This might some day need to be changed. 
+ # We should probably make sure the two special cases + # are compatible then and that we can handle both by + # e.g. reporting both to the caller. + raise ValueError( + "Bug: found a special case in both stdout and " + "stderr. This is not valid behavior at the time " + "of writing this code." + ) + detected_special_case = stderr_future.result() + + if p.returncode > 0: + # Process failed, but not due to a signal, since signals + # set the exit code to a negative value. + raise ProcessRunnerError( + "Command failed", + "ssh_command_failed", + code=p.returncode, + command=cmd, + special_case=detected_special_case, + ) + elif p.returncode < 0: + # Process failed due to a signal, since signals + # set the exit code to a negative value. + raise ProcessRunnerError( + "Command failed", + "ssh_command_failed", + code=p.returncode, + command=cmd, + special_case="died_to_signal", + ) + + return p.returncode + + +def run_cmd_redirected( + cmd, process_runner=subprocess, silent=False, use_login_shells=False +): + """Run a command and optionally redirect output to a file. + + Args: + cmd (List[str]): Command to run. + process_runner: Process runner used for executing commands. + silent: If true, the command output will be silenced completely + (redirected to /dev/null), unless verbose logging + is enabled. Use this for running utility commands like + rsync. 
+ """ + if silent and cli_logger.verbosity < 1: + return _run_and_process_output( + cmd, + process_runner=process_runner, + stdout_file=process_runner.DEVNULL, + stderr_file=process_runner.DEVNULL, + use_login_shells=use_login_shells, + ) + + if not is_output_redirected(): + return _run_and_process_output( + cmd, + process_runner=process_runner, + stdout_file=sys.stdout, + stderr_file=sys.stderr, + use_login_shells=use_login_shells, + ) + else: + tmpfile_path = os.path.join( + tempfile.gettempdir(), "ray-up-{}-{}.txt".format(cmd[0], time.time()) + ) + with open( + tmpfile_path, + mode="w", + # line buffering + buffering=1, + ) as tmp: + cli_logger.verbose("Command stdout is redirected to {}", cf.bold(tmp.name)) + + return _run_and_process_output( + cmd, + process_runner=process_runner, + stdout_file=tmp, + stderr_file=tmp, + use_login_shells=use_login_shells, + ) + + +def handle_ssh_fails(e, first_conn_refused_time, retry_interval): + """Handle SSH system failures coming from a subprocess. + + Args: + e: The `ProcessRunnerException` to handle. + first_conn_refused_time: + The time (as reported by this function) or None, + indicating the last time a CONN_REFUSED error was caught. + + After exceeding a patience value, the program will be aborted + since SSH will likely never recover. + retry_interval: The interval after which the command will be retried, + used here just to inform the user. + """ + if e.msg_type != "ssh_command_failed": + return + + if e.special_case == "ssh_conn_refused": + if ( + first_conn_refused_time is not None + and time.time() - first_conn_refused_time > CONN_REFUSED_PATIENCE + ): + cli_logger.error( + "SSH connection was being refused " + "for {} seconds. Head node assumed " + "unreachable.", + cf.bold(str(CONN_REFUSED_PATIENCE)), + ) + cli_logger.abort( + "Check the node's firewall settings " + "and the cloud network configuration." 
+ ) + + cli_logger.warning("SSH connection was refused.") + cli_logger.warning( + "This might mean that the SSH daemon is " + "still setting up, or that " + "the host is inaccessable (e.g. due to " + "a firewall)." + ) + + return time.time() + + if e.special_case in ["ssh_timeout", "ssh_conn_refused"]: + cli_logger.print( + "SSH still not available, retrying in {} seconds.", + cf.bold(str(retry_interval)), + ) + else: + raise e + + return first_conn_refused_time diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/updater.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/updater.py new file mode 100644 index 0000000000000000000000000000000000000000..3843a14aa633d4386ef58ae1ac1468d4beb51898 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/updater.py @@ -0,0 +1,570 @@ +import logging +import os +import subprocess +import time +import traceback +from threading import Thread + +import click + +from ray._private.usage import usage_constants, usage_lib +from ray.autoscaler._private import subprocess_output_util as cmd_output_util +from ray.autoscaler._private.cli_logger import cf, cli_logger +from ray.autoscaler._private.command_runner import ( + AUTOSCALER_NODE_START_WAIT_S, + ProcessRunnerError, +) +from ray.autoscaler._private.constants import ( + LABELS_ENVIRONMENT_VARIABLE, + RESOURCES_ENVIRONMENT_VARIABLE, +) +from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system +from ray.autoscaler._private.log_timer import LogTimer +from ray.autoscaler.tags import ( + STATUS_SETTING_UP, + STATUS_SYNCING_FILES, + STATUS_UP_TO_DATE, + STATUS_UPDATE_FAILED, + STATUS_WAITING_FOR_SSH, + TAG_RAY_FILE_MOUNTS_CONTENTS, + TAG_RAY_NODE_STATUS, + TAG_RAY_RUNTIME_CONFIG, +) + +logger = logging.getLogger(__name__) + +NUM_SETUP_STEPS = 7 +READY_CHECK_INTERVAL = 5 + + +class NodeUpdater: + """A process for syncing files and running init commands on a node. 
+ + Arguments: + node_id: the Node ID + provider_config: Provider section of autoscaler yaml + provider: NodeProvider Class + auth_config: Auth section of autoscaler yaml + cluster_name: the name of the cluster. + file_mounts: Map of remote to local paths + initialization_commands: Commands run before container launch + setup_commands: Commands run before ray starts + ray_start_commands: Commands to start ray + runtime_hash: Used to check for config changes + file_mounts_contents_hash: Used to check for changes to file mounts + is_head_node: Whether to use head start/setup commands + rsync_options: Extra options related to the rsync command. + process_runner: the module to use to run the commands + in the CommandRunner. E.g., subprocess. + use_internal_ip: Wwhether the node_id belongs to an internal ip + or external ip. + docker_config: Docker section of autoscaler yaml + restart_only: Whether to skip setup commands & just restart ray + for_recovery: True if updater is for a recovering node. Only used for + metric tracking. 
+ """ + + def __init__( + self, + node_id, + provider_config, + provider, + auth_config, + cluster_name, + file_mounts, + initialization_commands, + setup_commands, + ray_start_commands, + runtime_hash, + file_mounts_contents_hash, + is_head_node, + node_resources=None, + node_labels=None, + cluster_synced_files=None, + rsync_options=None, + process_runner=subprocess, + use_internal_ip=False, + docker_config=None, + restart_only=False, + for_recovery=False, + ): + self.log_prefix = "NodeUpdater: {}: ".format(node_id) + # Three cases: + # 1) use_internal_ip arg is True -> use_internal_ip is True + # 2) worker node -> use value of provider_config["use_internal_ips"] + # 3) head node -> use value of provider_config["use_internal_ips"] unless + # overriden by provider_config["use_external_head_ip"] + use_internal_ip = use_internal_ip or ( + provider_config.get("use_internal_ips", False) + and not ( + is_head_node and provider_config.get("use_external_head_ip", False) + ) + ) + self.cmd_runner = provider.get_command_runner( + self.log_prefix, + node_id, + auth_config, + cluster_name, + process_runner, + use_internal_ip, + docker_config, + ) + + self.daemon = True + self.node_id = node_id + self.provider_type = provider_config.get("type") + self.provider = provider + # Some node providers don't specify empty structures as + # defaults. Better to be defensive. + file_mounts = file_mounts or {} + self.file_mounts = { + remote: os.path.expanduser(local) for remote, local in file_mounts.items() + } + + self.initialization_commands = initialization_commands + self.setup_commands = setup_commands + self.ray_start_commands = ray_start_commands + self.node_resources = node_resources + self.node_labels = node_labels + self.runtime_hash = runtime_hash + self.file_mounts_contents_hash = file_mounts_contents_hash + # TODO (Alex): This makes the assumption that $HOME on the head and + # worker nodes is the same. 
Also note that `cluster_synced_files` is + # set on the head -> worker updaters only (so `expanduser` is only run + # on the head node). + cluster_synced_files = cluster_synced_files or [] + self.cluster_synced_files = [ + os.path.expanduser(path) for path in cluster_synced_files + ] + self.rsync_options = rsync_options or {} + self.auth_config = auth_config + self.is_head_node = is_head_node + self.docker_config = docker_config + self.restart_only = restart_only + self.update_time = None + self.for_recovery = for_recovery + + def run(self): + update_start_time = time.time() + if ( + cmd_output_util.does_allow_interactive() + and cmd_output_util.is_output_redirected() + ): + # this is most probably a bug since the user has no control + # over these settings + msg = ( + "Output was redirected for an interactive command. " + "Either do not pass `--redirect-command-output` " + "or also pass in `--use-normal-shells`." + ) + cli_logger.abort(msg) + + try: + with LogTimer( + self.log_prefix + "Applied config {}".format(self.runtime_hash) + ): + self.do_update() + except Exception as e: + self.provider.set_node_tags( + self.node_id, {TAG_RAY_NODE_STATUS: STATUS_UPDATE_FAILED} + ) + cli_logger.error("New status: {}", cf.bold(STATUS_UPDATE_FAILED)) + + cli_logger.error("!!!") + if hasattr(e, "cmd"): + stderr_output = getattr(e, "stderr", "No stderr available") + cli_logger.error( + "Setup command `{}` failed with exit code {}. stderr: {}", + cf.bold(e.cmd), + e.returncode, + stderr_output, + ) + else: + cli_logger.verbose_error("Exception details: {}", str(vars(e))) + full_traceback = traceback.format_exc() + cli_logger.error("Full traceback: {}", full_traceback) + # todo: handle this better somehow? 
+ cli_logger.error("Error message: {}", str(e)) + cli_logger.error("!!!") + cli_logger.newline() + + if isinstance(e, click.ClickException): + # todo: why do we ignore this here + return + raise + + tags_to_set = { + TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE, + TAG_RAY_RUNTIME_CONFIG: self.runtime_hash, + } + if self.file_mounts_contents_hash is not None: + tags_to_set[TAG_RAY_FILE_MOUNTS_CONTENTS] = self.file_mounts_contents_hash + + self.provider.set_node_tags(self.node_id, tags_to_set) + cli_logger.labeled_value("New status", STATUS_UP_TO_DATE) + + self.update_time = time.time() - update_start_time + self.exitcode = 0 + + def sync_file_mounts(self, sync_cmd, step_numbers=(0, 2)): + # step_numbers is (# of previous steps, total steps) + previous_steps, total_steps = step_numbers + + nolog_paths = [] + if cli_logger.verbosity == 0: + nolog_paths = ["~/ray_bootstrap_key.pem", "~/ray_bootstrap_config.yaml"] + + def do_sync(remote_path, local_path, allow_non_existing_paths=False): + if allow_non_existing_paths and not os.path.exists(local_path): + cli_logger.print("sync: {} does not exist. Skipping.", local_path) + # Ignore missing source files. In the future we should support + # the --delete-missing-args command to delete files that have + # been removed + return + + assert os.path.exists(local_path), local_path + + if os.path.isdir(local_path): + if not local_path.endswith("/"): + local_path += "/" + if not remote_path.endswith("/"): + remote_path += "/" + + with LogTimer( + self.log_prefix + "Synced {} to {}".format(local_path, remote_path) + ): + is_docker = ( + self.docker_config and self.docker_config["container_name"] != "" + ) + if not is_docker: + # The DockerCommandRunner handles this internally. + self.cmd_runner.run( + "mkdir -p {}".format(os.path.dirname(remote_path)), + run_env="host", + ) + sync_cmd(local_path, remote_path, docker_mount_if_possible=True) + + if remote_path not in nolog_paths: + # todo: timed here? 
+ cli_logger.print( + "{} from {}", cf.bold(remote_path), cf.bold(local_path) + ) + + # Rsync file mounts + with cli_logger.group( + "Processing file mounts", _numbered=("[]", previous_steps + 1, total_steps) + ): + for remote_path, local_path in self.file_mounts.items(): + do_sync(remote_path, local_path) + previous_steps += 1 + + if self.cluster_synced_files: + with cli_logger.group( + "Processing worker file mounts", + _numbered=("[]", previous_steps + 1, total_steps), + ): + cli_logger.print("synced files: {}", str(self.cluster_synced_files)) + for path in self.cluster_synced_files: + do_sync(path, path, allow_non_existing_paths=True) + previous_steps += 1 + else: + cli_logger.print( + "No worker file mounts to sync", + _numbered=("[]", previous_steps + 1, total_steps), + ) + + def wait_ready(self, deadline): + with cli_logger.group( + "Waiting for SSH to become available", _numbered=("[]", 1, NUM_SETUP_STEPS) + ): + with LogTimer(self.log_prefix + "Got remote shell"): + cli_logger.print("Running `{}` as a test.", cf.bold("uptime")) + first_conn_refused_time = None + while True: + if time.time() > deadline: + raise Exception("wait_ready timeout exceeded.") + if self.provider.is_terminated(self.node_id): + raise Exception( + "wait_ready aborting because node " + "detected as terminated." 
+ ) + + try: + # Run outside of the container + self.cmd_runner.run("uptime", timeout=10, run_env="host") + cli_logger.success("Success.") + return True + except ProcessRunnerError as e: + first_conn_refused_time = cmd_output_util.handle_ssh_fails( + e, + first_conn_refused_time, + retry_interval=READY_CHECK_INTERVAL, + ) + time.sleep(READY_CHECK_INTERVAL) + except Exception as e: + # TODO(maximsmol): we should not be ignoring + # exceptions if they get filtered properly + # (new style log + non-interactive shells) + # + # however threading this configuration state + # is a pain and I'm leaving it for later + + retry_str = "(" + str(e) + ")" + if hasattr(e, "cmd"): + if isinstance(e.cmd, str): + cmd_ = e.cmd + elif isinstance(e.cmd, list): + cmd_ = " ".join(e.cmd) + else: + logger.debug( + f"e.cmd type ({type(e.cmd)}) not list or str." + ) + cmd_ = str(e.cmd) + retry_str = "(Exit Status {}): {}".format( + e.returncode, cmd_ + ) + + cli_logger.print( + "SSH still not available {}, retrying in {} seconds.", + cf.dimmed(retry_str), + cf.bold(str(READY_CHECK_INTERVAL)), + ) + + time.sleep(READY_CHECK_INTERVAL) + + def do_update(self): + self.provider.set_node_tags( + self.node_id, {TAG_RAY_NODE_STATUS: STATUS_WAITING_FOR_SSH} + ) + cli_logger.labeled_value("New status", STATUS_WAITING_FOR_SSH) + + deadline = time.time() + AUTOSCALER_NODE_START_WAIT_S + self.wait_ready(deadline) + global_event_system.execute_callback(CreateClusterEvent.ssh_control_acquired) + + node_tags = self.provider.node_tags(self.node_id) + logger.debug("Node tags: {}".format(str(node_tags))) + + if self.provider_type == "aws" and self.provider.provider_config: + from ray.autoscaler._private.aws.cloudwatch.cloudwatch_helper import ( + CloudwatchHelper, + ) + + CloudwatchHelper( + self.provider.provider_config, self.node_id, self.provider.cluster_name + ).update_from_config(self.is_head_node) + + if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash: + # When resuming from a stopped instance 
the runtime_hash may be the + # same, but the container will not be started. + init_required = self.cmd_runner.run_init( + as_head=self.is_head_node, + file_mounts=self.file_mounts, + sync_run_yet=False, + ) + if init_required: + node_tags[TAG_RAY_RUNTIME_CONFIG] += "-invalidate" + # This ensures that `setup_commands` are not removed + self.restart_only = False + + if self.restart_only: + self.setup_commands = [] + + # runtime_hash will only change whenever the user restarts + # or updates their cluster with `get_or_create_head_node` + if node_tags.get(TAG_RAY_RUNTIME_CONFIG) == self.runtime_hash and ( + not self.file_mounts_contents_hash + or node_tags.get(TAG_RAY_FILE_MOUNTS_CONTENTS) + == self.file_mounts_contents_hash + ): + # todo: we lie in the confirmation message since + # full setup might be cancelled here + cli_logger.print( + "Configuration already up to date, " + "skipping file mounts, initalization and setup commands.", + _numbered=("[]", "2-6", NUM_SETUP_STEPS), + ) + + else: + cli_logger.print( + "Updating cluster configuration.", _tags=dict(hash=self.runtime_hash) + ) + + self.provider.set_node_tags( + self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SYNCING_FILES} + ) + cli_logger.labeled_value("New status", STATUS_SYNCING_FILES) + self.sync_file_mounts(self.rsync_up, step_numbers=(1, NUM_SETUP_STEPS)) + + # Only run setup commands if runtime_hash has changed because + # we don't want to run setup_commands every time the head node + # file_mounts folders have changed. 
+ if node_tags.get(TAG_RAY_RUNTIME_CONFIG) != self.runtime_hash: + # Run init commands + self.provider.set_node_tags( + self.node_id, {TAG_RAY_NODE_STATUS: STATUS_SETTING_UP} + ) + cli_logger.labeled_value("New status", STATUS_SETTING_UP) + + if self.initialization_commands: + with cli_logger.group( + "Running initialization commands", + _numbered=("[]", 4, NUM_SETUP_STEPS), + ): + global_event_system.execute_callback( + CreateClusterEvent.run_initialization_cmd + ) + with LogTimer( + self.log_prefix + "Initialization commands", + show_status=True, + ): + for cmd in self.initialization_commands: + global_event_system.execute_callback( + CreateClusterEvent.run_initialization_cmd, + {"command": cmd}, + ) + try: + # Overriding the existing SSHOptions class + # with a new SSHOptions class that uses + # this ssh_private_key as its only __init__ + # argument. + # Run outside docker. + self.cmd_runner.run( + cmd, + ssh_options_override_ssh_key=self.auth_config.get( # noqa: E501 + "ssh_private_key" + ), + run_env="host", + ) + except ProcessRunnerError as e: + if e.msg_type == "ssh_command_failed": + cli_logger.error("Failed.") + cli_logger.error("See above for stderr.") + + raise click.ClickException( + "Initialization command failed." 
+ ) from None + else: + cli_logger.print( + "No initialization commands to run.", + _numbered=("[]", 4, NUM_SETUP_STEPS), + ) + with cli_logger.group( + "Initializing command runner", + # todo: fix command numbering + _numbered=("[]", 5, NUM_SETUP_STEPS), + ): + self.cmd_runner.run_init( + as_head=self.is_head_node, + file_mounts=self.file_mounts, + sync_run_yet=True, + ) + if self.setup_commands: + with cli_logger.group( + "Running setup commands", + # todo: fix command numbering + _numbered=("[]", 6, NUM_SETUP_STEPS), + ): + global_event_system.execute_callback( + CreateClusterEvent.run_setup_cmd + ) + with LogTimer( + self.log_prefix + "Setup commands", show_status=True + ): + total = len(self.setup_commands) + for i, cmd in enumerate(self.setup_commands): + global_event_system.execute_callback( + CreateClusterEvent.run_setup_cmd, {"command": cmd} + ) + if cli_logger.verbosity == 0 and len(cmd) > 30: + cmd_to_print = cf.bold(cmd[:30]) + "..." + else: + cmd_to_print = cf.bold(cmd) + + cli_logger.print( + "{}", cmd_to_print, _numbered=("()", i, total) + ) + + try: + # Runs in the container if docker is in use + self.cmd_runner.run(cmd, run_env="auto") + except ProcessRunnerError as e: + if e.msg_type == "ssh_command_failed": + cli_logger.error("Failed.") + cli_logger.error("See above for stderr.") + + raise click.ClickException("Setup command failed.") + else: + cli_logger.print( + "No setup commands to run.", + _numbered=("[]", 6, NUM_SETUP_STEPS), + ) + + with cli_logger.group( + "Starting the Ray runtime", _numbered=("[]", 7, NUM_SETUP_STEPS) + ): + global_event_system.execute_callback(CreateClusterEvent.start_ray_runtime) + with LogTimer(self.log_prefix + "Ray start commands", show_status=True): + for cmd in self.ray_start_commands: + env_vars = {} + if self.is_head_node: + if usage_lib.usage_stats_enabled(): + env_vars[usage_constants.USAGE_STATS_ENABLED_ENV_VAR] = 1 + else: + # Disable usage stats collection in the cluster. 
+ env_vars[usage_constants.USAGE_STATS_ENABLED_ENV_VAR] = 0 + + # Add a resource override env variable if needed. + # Local NodeProvider doesn't need resource and label override. + if self.provider_type != "local": + if self.node_resources: + env_vars[ + RESOURCES_ENVIRONMENT_VARIABLE + ] = self.node_resources + if self.node_labels: + env_vars[LABELS_ENVIRONMENT_VARIABLE] = self.node_labels + + try: + old_redirected = cmd_output_util.is_output_redirected() + cmd_output_util.set_output_redirected(False) + # Runs in the container if docker is in use + self.cmd_runner.run( + cmd, environment_variables=env_vars, run_env="auto" + ) + cmd_output_util.set_output_redirected(old_redirected) + except ProcessRunnerError as e: + if e.msg_type == "ssh_command_failed": + cli_logger.error("Failed.") + cli_logger.error("See above for stderr.") + + raise click.ClickException("Start command failed.") + global_event_system.execute_callback( + CreateClusterEvent.start_ray_runtime_completed + ) + + def rsync_up(self, source, target, docker_mount_if_possible=False): + options = {} + options["docker_mount_if_possible"] = docker_mount_if_possible + options["rsync_exclude"] = self.rsync_options.get("rsync_exclude") + options["rsync_filter"] = self.rsync_options.get("rsync_filter") + self.cmd_runner.run_rsync_up(source, target, options=options) + cli_logger.verbose( + "`rsync`ed {} (local) to {} (remote)", cf.bold(source), cf.bold(target) + ) + + def rsync_down(self, source, target, docker_mount_if_possible=False): + options = {} + options["docker_mount_if_possible"] = docker_mount_if_possible + options["rsync_exclude"] = self.rsync_options.get("rsync_exclude") + options["rsync_filter"] = self.rsync_options.get("rsync_filter") + self.cmd_runner.run_rsync_down(source, target, options=options) + cli_logger.verbose( + "`rsync`ed {} (remote) to {} (local)", cf.bold(source), cf.bold(target) + ) + + +class NodeUpdaterThread(NodeUpdater, Thread): + def __init__(self, *args, **kwargs): + 
Thread.__init__(self) + NodeUpdater.__init__(self, *args, **kwargs) + self.exitcode = -1 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/util.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/util.py new file mode 100644 index 0000000000000000000000000000000000000000..fde2e2cd7fac19f9fba802e6591ca2e9b79f8695 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/_private/util.py @@ -0,0 +1,963 @@ +import collections +import copy +import hashlib +import json +import logging +import os +import threading +from dataclasses import dataclass +from datetime import datetime +from io import StringIO +from numbers import Number, Real +from typing import Any, Dict, List, Optional, Tuple, Union + +import ray +import ray._private.ray_constants as ray_constants +import ray._private.services as services +from ray._private.utils import ( + PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN, + PLACEMENT_GROUP_WILDCARD_RESOURCE_PATTERN, +) +from ray.autoscaler._private import constants +from ray.autoscaler._private.cli_logger import cli_logger +from ray.autoscaler._private.docker import validate_docker_config +from ray.autoscaler._private.local.config import prepare_local +from ray.autoscaler._private.providers import _get_default_config +from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER + +REQUIRED, OPTIONAL = True, False + + +HEAD_TYPE_MAX_WORKERS_WARN_TEMPLATE = ( + "Setting `max_workers` for node type" + " `{node_type}` to the global `max_workers` value of {max_workers}. To" + " avoid spawning worker nodes of type `{node_type}`, explicitly set" + " `max_workers: 0` for `{node_type}`.\n" + "Note that `max_workers: 0` was the default value prior to Ray 1.3.0." 
+ " Your current version is Ray {version}.\n" + "See the docs for more information:\n" + "https://docs.ray.io/en/master/cluster/config.html" + "#cluster-configuration-node-max-workers\n" + "https://docs.ray.io/en/master/cluster/config.html#full-configuration" +) + +ResourceBundle = Dict[str, Union[int, float]] + +# A Dict and the count of how many times it occurred. +# Refer to freq_of_dicts() below. +DictCount = Tuple[Dict, Number] + +# e.g., cpu_4_ondemand. +NodeType = str + +# e.g., head, worker, unmanaged +NodeKind = str + +# e.g., {"resources": ..., "max_workers": ...}. +NodeTypeConfigDict = Dict[str, Any] + +# e.g., {"GPU": 1}. +ResourceDict = Dict[str, Real] + +# e.g., "node-1". +NodeID = str + +# e.g., "127.0.0.1". +NodeIP = str + +# Number of nodes to launch +NodeCount = int + +# e.g. "up-to-date", "update-failed" +# See autoscaler/tags.py for other status +# values used by the autoscaler. +NodeStatus = str + +Usage = Dict[str, Tuple[Number, Number]] + +logger = logging.getLogger(__name__) + + +def is_placement_group_resource(resource_name: str) -> bool: + """ + Check if a resource name is structured like a placement group. + """ + return bool( + PLACEMENT_GROUP_WILDCARD_RESOURCE_PATTERN.match(resource_name) + or PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN.match(resource_name) + ) + + +@dataclass +class LoadMetricsSummary: + # Map of resource name (e.g. "memory") to pair of (Used, Available) numbers + usage: Usage + # Counts of demand bundles from task/actor demand. + # e.g. [({"CPU": 1}, 5), ({"GPU":1}, 2)] + resource_demand: List[DictCount] + # Counts of pending placement groups + pg_demand: List[DictCount] + # Counts of demand bundles requested by autoscaler.sdk.request_resources + request_demand: List[DictCount] + node_types: List[DictCount] + # Optionally included for backwards compatibility: IP of the head node. See + # https://github.com/ray-project/ray/pull/20623 for details. 
class ConcurrentCounter:
    """Thread-safe counter keyed by arbitrary hashable keys.

    Every operation takes a reentrant lock, so the aggregate ``value`` and
    the per-key ``breakdown`` are always consistent snapshots.
    """

    def __init__(self):
        self._lock = threading.RLock()
        self._counter = collections.defaultdict(int)

    def inc(self, key, count):
        """Increase the count for ``key`` by ``count``; return the new total."""
        with self._lock:
            self._counter[key] += count
            return self.value

    def dec(self, key, count):
        """Decrease the count for ``key`` by ``count``; return the new total.

        Asserts that no per-key count drops below zero.
        """
        with self._lock:
            self._counter[key] -= count
            assert self._counter[key] >= 0, "counter cannot go negative"
            return self.value

    def breakdown(self):
        """Return a plain-dict snapshot of the per-key counts."""
        with self._lock:
            return dict(self._counter)

    @property
    def value(self):
        """Sum of all per-key counts."""
        with self._lock:
            return sum(self._counter.values())
+ if "cluster_synced_files" not in config: + raise RuntimeError( + "Missing 'cluster_synced_files' field in the cluster " + "configuration. This is likely due to the Ray version running " + "in the cluster {ray_version} is greater than the Ray version " + "running on your laptop. Please try updating Ray on your local " + "machine and make sure the versions match.".format( + ray_version=ray.__version__ + ) + ) + + if "available_node_types" in config: + if "head_node_type" not in config: + raise ValueError( + "You must specify `head_node_type` if `available_node_types is set." + ) + if config["head_node_type"] not in config["available_node_types"]: + raise ValueError("`head_node_type` must be one of `available_node_types`.") + + sum_min_workers = sum( + config["available_node_types"][node_type].get("min_workers", 0) + for node_type in config["available_node_types"] + ) + if sum_min_workers > config["max_workers"]: + raise ValueError( + "The specified global `max_workers` is smaller than the " + "sum of `min_workers` of all the available node types." + ) + + +def check_legacy_fields(config: Dict[str, Any]) -> None: + """For use in providers that have completed the migration to + available_node_types. + + Warns user that head_node and worker_nodes fields are being ignored. + Throws an error if available_node_types and head_node_type aren't + specified. + """ + # log warning if non-empty head_node field + if "head_node" in config and config["head_node"]: + cli_logger.warning( + "The `head_node` field is deprecated and will be ignored. " + "Use `head_node_type` and `available_node_types` instead." + ) + # log warning if non-empty worker_nodes field + if "worker_nodes" in config and config["worker_nodes"]: + cli_logger.warning( + "The `worker_nodes` field is deprecated and will be ignored. " + "Use `available_node_types` instead." 
def translate_trivial_legacy_config(config: Dict[str, Any]):
    """Drop empty deprecated fields ("head_node" and "worker_nodes").

    Mutates ``config`` in place. Non-empty legacy fields are intentionally
    left untouched; they are handled by the legacy-config merging path.
    """

    REMOVABLE_FIELDS = ["head_node", "worker_nodes"]

    for field in REMOVABLE_FIELDS:
        if field in config and not config[field]:
            # Fixed message spacing: the original concatenation produced
            # "{field}is not supported" and "remove{field}".
            logger.warning(
                f"Dropping the empty legacy field {field}. {field} "
                "is not supported for ray>=2.0.0. It is recommended to "
                f"remove {field} from the cluster config."
            )
            del config[field]
+ # This field is accessed when calling NodeUpdater but is not relevant to + # certain node providers and is thus left out of some cluster launching + # configs. + merged_config["auth"] = merged_config.get("auth", {}) + + # A legacy config is one which doesn't have available_node_types, + # but has at least one of head_node or worker_nodes. + is_legacy_config = ("available_node_types" not in config) and ( + "head_node" in config or "worker_nodes" in config + ) + # Do merging logic for legacy configs. + if is_legacy_config: + merged_config = merge_legacy_yaml_with_defaults(merged_config) + # Take care of this here, in case a config does not specify any of head, + # workers, node types, but does specify min workers: + merged_config.pop("min_workers", None) + + translate_trivial_legacy_config(merged_config) + + return merged_config + + +def merge_legacy_yaml_with_defaults(merged_config: Dict[str, Any]) -> Dict[str, Any]: + """Rewrite legacy config's available node types after it has been merged + with defaults yaml. + """ + cli_logger.warning( + "Converting legacy cluster config to a multi node type cluster " + "config. Multi-node-type cluster configs are the recommended " + "format for configuring Ray clusters. " + "See the docs for more information:\n" + "https://docs.ray.io/en/master/cluster/config.html#full-configuration" + ) + + # Get default head and worker types. + default_head_type = merged_config["head_node_type"] + # Default configs are assumed to have two node types -- one for the head + # and one for the workers. + assert len(merged_config["available_node_types"].keys()) == 2 + default_worker_type = ( + merged_config["available_node_types"].keys() - {default_head_type} + ).pop() + + if merged_config["head_node"]: + # User specified a head node in legacy config. + # Convert it into data for the head's node type. 
def merge_setup_commands(config):
    """Prepend the shared ``setup_commands`` to the head- and worker-specific
    setup command lists.

    Mutates ``config`` in place and returns it for convenience; the shared
    ``setup_commands`` entry itself is left unchanged.
    """
    shared = config["setup_commands"]
    for role_key in ("head_setup_commands", "worker_setup_commands"):
        config[role_key] = shared + config[role_key]
    return config
def with_envs(cmds: List[str], kv: Dict[str, str]) -> List[str]:
    """Return commands with the given environment variables exported first.

    Fix: the return annotation was ``-> str`` although the function builds
    and returns a list of commands (as its own docstring stated).

    Args:
        cmds: List of commands to set environment variables for.
        kv: Dictionary of environment variables to set.

    Returns:
        List[str]: A new list of commands with the given environment
        variables exported before each command. The input list is not
        modified.

    Example:
        with_envs(["echo $FOO"], {"FOO": "BAR"})
        -> ["export FOO=BAR; echo $FOO"]
    """
    # Use `export` (not plain assignment) so the variable is visible if the
    # command spawns subshells. The prefix is identical for every command,
    # so build it once.
    prefix = "".join(f"export {k}={v}; " for k, v in kv.items())
    return [f"{prefix}{cmd}" for cmd in cmds]
+ full_auth = auth.copy() + for key_type in ["ssh_private_key", "ssh_public_key"]: + if key_type in auth: + with open(os.path.expanduser(auth[key_type])) as key: + full_auth[key_type] = key.read() + hasher.update(json.dumps([node_conf, full_auth], sort_keys=True).encode("utf-8")) + return hasher.hexdigest() + + +# Cache the file hashes to avoid rescanning it each time. Also, this avoids +# inadvertently restarting workers if the file mount content is mutated on the +# head node. +_hash_cache = {} + + +def hash_runtime_conf( + file_mounts, + cluster_synced_files, + extra_objs, + generate_file_mounts_contents_hash=False, +): + """Returns two hashes, a runtime hash and file_mounts_content hash. + + The runtime hash is used to determine if the configuration or file_mounts + contents have changed. It is used at launch time (ray up) to determine if + a restart is needed. + + The file_mounts_content hash is used to determine if the file_mounts or + cluster_synced_files contents have changed. It is used at monitor time to + determine if additional file syncing is needed. 
+ """ + runtime_hasher = hashlib.sha1() + contents_hasher = hashlib.sha1() + + def add_content_hashes(path, allow_non_existing_paths: bool = False): + def add_hash_of_file(fpath): + with open(fpath, "rb") as f: + for chunk in iter(lambda: f.read(2**20), b""): + contents_hasher.update(chunk) + + path = os.path.expanduser(path) + if allow_non_existing_paths and not os.path.exists(path): + return + if os.path.isdir(path): + dirs = [] + for dirpath, _, filenames in os.walk(path): + dirs.append((dirpath, sorted(filenames))) + for dirpath, filenames in sorted(dirs): + contents_hasher.update(dirpath.encode("utf-8")) + for name in filenames: + contents_hasher.update(name.encode("utf-8")) + fpath = os.path.join(dirpath, name) + add_hash_of_file(fpath) + else: + add_hash_of_file(path) + + conf_str = json.dumps(file_mounts, sort_keys=True).encode("utf-8") + json.dumps( + extra_objs, sort_keys=True + ).encode("utf-8") + + # Only generate a contents hash if generate_contents_hash is true or + # if we need to generate the runtime_hash + if conf_str not in _hash_cache or generate_file_mounts_contents_hash: + for local_path in sorted(file_mounts.values()): + add_content_hashes(local_path) + head_node_contents_hash = contents_hasher.hexdigest() + + # Generate a new runtime_hash if its not cached + # The runtime hash does not depend on the cluster_synced_files hash + # because we do not want to restart nodes only if cluster_synced_files + # contents have changed. + if conf_str not in _hash_cache: + runtime_hasher.update(conf_str) + runtime_hasher.update(head_node_contents_hash.encode("utf-8")) + _hash_cache[conf_str] = runtime_hasher.hexdigest() + + # Add cluster_synced_files to the file_mounts_content hash + if cluster_synced_files is not None: + for local_path in sorted(cluster_synced_files): + # For cluster_synced_files, we let the path be non-existant + # because its possible that the source directory gets set up + # anytime over the life of the head node. 
def add_prefix(info_string, prefix):
    """Prefixes each line of info_string, except the first, by prefix.

    Each prefixed line has the form ``<prefix>:<original line>``.
    """
    first, *rest = info_string.split("\n")
    return "\n".join([first] + [f"{prefix}:{line}" for line in rest])
+ """ + result = PLACEMENT_GROUP_INDEXED_BUNDLED_RESOURCE_PATTERN.match( + placement_group_resource_str + ) + if result: + return (result.group(1), result.group(3), False) + result = PLACEMENT_GROUP_WILDCARD_RESOURCE_PATTERN.match( + placement_group_resource_str + ) + if result: + return (result.group(1), result.group(2), True) + return (placement_group_resource_str, None, True) + + +MEMORY_SUFFIXES = [ + ("TiB", 2**40), + ("GiB", 2**30), + ("MiB", 2**20), + ("KiB", 2**10), +] + + +def format_memory(mem_bytes: Number) -> str: + """Formats memory in bytes in friendly unit. E.g. (2**30 + 1) bytes should + be displayed as 1GiB but 1 byte should be displayed as 1B, (as opposed to + rounding it to 0GiB). + """ + for suffix, bytes_per_unit in MEMORY_SUFFIXES: + if mem_bytes >= bytes_per_unit: + mem_in_unit = mem_bytes / bytes_per_unit + return f"{mem_in_unit:.2f}{suffix}" + + return f"{int(mem_bytes)}B" + + +def parse_usage(usage: Usage, verbose: bool) -> List[str]: + # first collect resources used in placement groups + placement_group_resource_usage = {} + placement_group_resource_total = collections.defaultdict(float) + for resource, (used, total) in usage.items(): + (pg_resource_name, pg_name, is_countable) = parse_placement_group_resource_str( + resource + ) + if pg_name: + if pg_resource_name not in placement_group_resource_usage: + placement_group_resource_usage[pg_resource_name] = 0 + if is_countable: + placement_group_resource_usage[pg_resource_name] += used + placement_group_resource_total[pg_resource_name] += total + continue + usage_lines = [] + for resource, (used, total) in sorted(usage.items()): + if "node:" in resource: + continue # Skip the auto-added per-node "node:" resource. 
def get_usage_report(lm_summary: LoadMetricsSummary, verbose: bool) -> str:
    """Return the cluster-wide usage summary, one indented line per resource.

    Each line produced by ``parse_usage`` is indented by one space and
    terminated with a newline, matching the `ray status` report layout.
    """
    usage_lines = parse_usage(lm_summary.usage, verbose)
    return "".join(f" {line}\n" for line in usage_lines)
+ + Example: {"CPU_group_groupid": 1} returns {"CPU": 1}, True + {"memory": 1} return {"memory": 1}, False + """ + using_placement_group = False + result_bundle = dict() + for pg_resource_str, resource_count in bundle.items(): + (resource_name, pg_name, _) = parse_placement_group_resource_str( + pg_resource_str + ) + result_bundle[resource_name] = resource_count + if pg_name: + using_placement_group = True + return (result_bundle, using_placement_group) + + bundle_demand = collections.defaultdict(int) + pg_bundle_demand = collections.defaultdict(int) + + for bundle, count in resource_demand: + ( + pg_filtered_bundle, + using_placement_group, + ) = filter_placement_group_from_bundle(bundle) + + # bundle is a special keyword for placement group scheduling + # but it doesn't need to be exposed to users. Remove it from + # the demand report. + if ( + using_placement_group + and ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME + in pg_filtered_bundle.keys() + ): + del pg_filtered_bundle[ray_constants.PLACEMENT_GROUP_BUNDLE_RESOURCE_NAME] + + # No need to report empty request to demand (e.g., + # placement group ready task). 
def get_per_node_breakdown_as_dict(
    lm_summary: LoadMetricsSummary,
) -> dict:
    """Map each node id to its usage report as a newline-joined string.

    The per-node report uses verbose parsing; the result carries no
    trailing newline.
    """
    return {
        node_id: "\n".join(parse_usage(usage, verbose=True))
        for node_id, usage in lm_summary.usage_by_node.items()
    }
+ node_string = f"Node: {node_id}" + if node_id in node_type_mapping: + node_type = node_type_mapping[node_id] + node_string += f" ({node_type})" + print(node_string, file=sio) + if ( + lm_summary.idle_time_map + and node_id in lm_summary.idle_time_map + and lm_summary.idle_time_map[node_id] > 0 + ): + print(f" Idle: {lm_summary.idle_time_map[node_id]} ms", file=sio) + + print(" Usage:", file=sio) + for line in parse_usage(usage, verbose): + print(f" {line}", file=sio) + # Don't print anything if not provided. + if not node_activities: + continue + print(" Activity:", file=sio) + if node_id not in node_activities: + print(" (no activity)", file=sio) + else: + # Note: We have node IP here. + _, reasons = node_activities[node_id] + for reason in reasons: + print(f" {reason}", file=sio) + + return sio.getvalue() + + +def format_info_string( + lm_summary, + autoscaler_summary, + time=None, + gcs_request_time: Optional[float] = None, + non_terminated_nodes_time: Optional[float] = None, + autoscaler_update_time: Optional[float] = None, + verbose: bool = False, +): + if time is None: + time = datetime.now() + header = "=" * 8 + f" Autoscaler status: {time} " + "=" * 8 + separator = "-" * len(header) + if verbose: + header += "\n" + if gcs_request_time: + header += f"GCS request time: {gcs_request_time:3f}s\n" + if non_terminated_nodes_time: + header += ( + "Node Provider non_terminated_nodes time: " + f"{non_terminated_nodes_time:3f}s\n" + ) + if autoscaler_update_time: + header += "Autoscaler iteration time: " f"{autoscaler_update_time:3f}s\n" + + available_node_report_lines = [] + if not autoscaler_summary.active_nodes: + available_node_report = " (no active nodes)" + else: + for node_type, count in autoscaler_summary.active_nodes.items(): + line = f" {count} {node_type}" + available_node_report_lines.append(line) + available_node_report = "\n".join(available_node_report_lines) + + if not autoscaler_summary.idle_nodes: + idle_node_report = " (no idle nodes)" + else: + 
idle_node_report_lines = [] + for node_type, count in autoscaler_summary.idle_nodes.items(): + line = f" {count} {node_type}" + idle_node_report_lines.append(line) + idle_node_report = "\n".join(idle_node_report_lines) + + pending_lines = [] + for node_type, count in autoscaler_summary.pending_launches.items(): + line = f" {node_type}, {count} launching" + pending_lines.append(line) + for ip, node_type, status in autoscaler_summary.pending_nodes: + line = f" {ip}: {node_type}, {status.lower()}" + pending_lines.append(line) + if pending_lines: + pending_report = "\n".join(pending_lines) + else: + pending_report = " (no pending nodes)" + + failure_lines = [] + for ip, node_type in autoscaler_summary.failed_nodes: + line = f" {node_type}: NodeTerminated (ip: {ip})" + failure_lines.append(line) + if autoscaler_summary.node_availability_summary: + records = sorted( + autoscaler_summary.node_availability_summary.node_availabilities.values(), + key=lambda record: record.last_checked_timestamp, + ) + for record in records: + if record.is_available: + continue + assert record.unavailable_node_information is not None + node_type = record.node_type + category = record.unavailable_node_information.category + description = record.unavailable_node_information.description + attempted_time = datetime.fromtimestamp(record.last_checked_timestamp) + formatted_time = ( + # This `:02d` funny business is python syntax for printing a 2 + # digit number with a leading zero as padding if needed. 
+ f"{attempted_time.hour:02d}:" + f"{attempted_time.minute:02d}:" + f"{attempted_time.second:02d}" + ) + line = f" {node_type}: {category} (latest_attempt: {formatted_time})" + if verbose: + line += f" - {description}" + failure_lines.append(line) + + failure_lines = failure_lines[: -constants.AUTOSCALER_MAX_FAILURES_DISPLAYED : -1] + failure_report = "Recent failures:\n" + if failure_lines: + failure_report += "\n".join(failure_lines) + else: + failure_report += " (no failures)" + + usage_report = get_usage_report(lm_summary, verbose) + demand_report = get_demand_report(lm_summary) + formatted_output = f"""{header} +Node status +{separator} +Active: +{available_node_report}""" + + if not autoscaler_summary.legacy: + formatted_output += f""" +Idle: +{idle_node_report}""" + + formatted_output += f""" +Pending: +{pending_report} +{failure_report} + +Resources +{separator} +{"Total " if verbose else ""}Usage: +{usage_report} +{"Total " if verbose else ""}Demands: +{demand_report}""" + + if verbose: + if lm_summary.usage_by_node: + formatted_output += get_per_node_breakdown( + lm_summary, + autoscaler_summary.node_type_mapping, + autoscaler_summary.node_activities, + verbose, + ) + else: + formatted_output += "\n" + + return formatted_output.strip() + + +def format_readonly_node_type(node_id: str): + """The anonymous node type for readonly node provider nodes.""" + return "node_{}".format(node_id) + + +def format_no_node_type_string(node_type: dict): + placement_group_resource_usage = {} + regular_resource_usage = collections.defaultdict(float) + for resource, total in node_type.items(): + (pg_resource_name, pg_name, is_countable) = parse_placement_group_resource_str( + resource + ) + if pg_name: + if not is_countable: + continue + if pg_resource_name not in placement_group_resource_usage: + placement_group_resource_usage[pg_resource_name] = 0 + placement_group_resource_usage[pg_resource_name] += total + else: + regular_resource_usage[resource] += total + + 
output_lines = [""] + for resource, total in regular_resource_usage.items(): + output_line = f"{resource}: {total}" + if resource in placement_group_resource_usage: + pg_resource = placement_group_resource_usage[resource] + output_line += f" ({pg_resource} reserved in placement groups)" + output_lines.append(output_line) + + return "\n ".join(output_lines) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/batching_node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/batching_node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..6a9118585fa54c0e548ed0a5b5e69268107aa018 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/batching_node_provider.py @@ -0,0 +1,255 @@ +import logging +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Set + +from ray.autoscaler._private.constants import ( + DISABLE_LAUNCH_CONFIG_CHECK_KEY, + DISABLE_NODE_UPDATERS_KEY, + FOREGROUND_NODE_LAUNCH_KEY, +) +from ray.autoscaler._private.util import NodeID, NodeIP, NodeKind, NodeStatus, NodeType +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + NODE_KIND_HEAD, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_STATUS, + TAG_RAY_REPLICA_INDEX, + TAG_RAY_USER_NODE_TYPE, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class ScaleRequest: + """Stores desired scale computed by the autoscaler. + + Attributes: + desired_num_workers: Map of worker NodeType to desired number of workers of + that type. + workers_to_delete: List of ids of nodes that should be removed. + """ + + desired_num_workers: Dict[NodeType, int] = field(default_factory=dict) + workers_to_delete: Set[NodeID] = field(default_factory=set) + + +@dataclass +class NodeData: + """Stores all data about a Ray node needed by the autoscaler. + + Attributes: + kind: Whether the node is the head or a worker. + type: The user-defined type of the node. 
+ replica_index: An identifier for nodes in a replica of a TPU worker group. + This value is set as a Pod label by a GKE webhook when TPUs are requested + ip: Cluster-internal ip of the node. ip can be None if the ip + has not yet been assigned. + status: The status of the node. You must adhere to the following semantics + for status: + * The status must be "up-to-date" if and only if the node is running. + * The status must be "update-failed" if and only if the node is in an + unknown or failed state. + * If the node is in a pending (starting-up) state, the status should be + a brief user-facing description of why the node is pending. + """ + + kind: NodeKind + type: NodeType + ip: Optional[NodeIP] + status: NodeStatus + replica_index: Optional[str] = None + + +class BatchingNodeProvider(NodeProvider): + """Abstract subclass of NodeProvider meant for use with external cluster managers. + + Batches reads of cluster state into a single method, get_node_data, called at the + start of an autoscaling update. + + Batches modifications to cluster state into a single method, submit_scale_request, + called at the end of an autoscaling update. + + Implementing a concrete subclass of BatchingNodeProvider only requires overriding + get_node_data() and submit_scale_request(). + + See the method docstrings for more information. + + Note that an autoscaling update may be conditionally + cancelled using the optional method safe_to_scale() + of the root NodeProvider. + """ + + def __init__( + self, + provider_config: Dict[str, Any], + cluster_name: str, + ) -> None: + NodeProvider.__init__(self, provider_config, cluster_name) + self.node_data_dict: Dict[NodeID, NodeData] = {} + + # These flags enforce correct behavior for single-threaded node providers + # which interact with external cluster managers: + assert ( + provider_config.get(DISABLE_NODE_UPDATERS_KEY, False) is True + ), f"To use BatchingNodeProvider, must set `{DISABLE_NODE_UPDATERS_KEY}:True`." 
+ assert provider_config.get(DISABLE_LAUNCH_CONFIG_CHECK_KEY, False) is True, ( + "To use BatchingNodeProvider, must set " + f"`{DISABLE_LAUNCH_CONFIG_CHECK_KEY}:True`." + ) + assert ( + provider_config.get(FOREGROUND_NODE_LAUNCH_KEY, False) is True + ), f"To use BatchingNodeProvider, must set `{FOREGROUND_NODE_LAUNCH_KEY}:True`." + + # self.scale_change_needed tracks whether we need to update scale. + # set to True in create_node and terminate_nodes calls + # reset to False in non_terminated_nodes, which occurs at the start of the + # autoscaling update. For good measure, also set to false in post_process. + self.scale_change_needed = False + + self.scale_request = ScaleRequest() + + # Initialize map of replica indices to nodes in that replica + self.replica_index_to_nodes = defaultdict(list[str]) + + def get_node_data(self) -> Dict[NodeID, NodeData]: + """Queries cluster manager for node info. Returns a mapping from node id to + NodeData. + + Each NodeData value must adhere to the semantics of the NodeData docstring. + (Note in particular the requirements for NodeData.status.) + + Consistency requirement: + If a node id was present in ScaleRequest.workers_to_delete of a previously + submitted scale request, it should no longer be present as a key in + get_node_data. + (Node termination must be registered immediately when submit_scale_request + returns.) + """ + raise NotImplementedError + + def submit_scale_request(self, scale_request: ScaleRequest) -> None: + """Tells the cluster manager which nodes to delete and how many nodes of + each node type to maintain. + + Consistency requirement: + If a node id was present in ScaleRequest.workers_to_delete of a previously + submitted scale request, it should no longer be present as key in get_node_data. + (Node termination must be registered immediately when submit_scale_request + returns.) 
+ """ + raise NotImplementedError + + def post_process(self) -> None: + """Submit a scale request if it is necessary to do so.""" + if self.scale_change_needed: + self.submit_scale_request(self.scale_request) + self.scale_change_needed = False + + def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]: + self.scale_change_needed = False + self.node_data_dict = self.get_node_data() + + # Initialize ScaleRequest + self.scale_request = ScaleRequest( + desired_num_workers=self.cur_num_workers(), # Current scale + workers_to_delete=set(), # No workers to delete yet + ) + all_nodes = list(self.node_data_dict.keys()) + self.replica_index_to_nodes.clear() + for node_id in all_nodes: + replica_index = self.node_data_dict[node_id].replica_index + # Only add node to map if it belongs to a multi-host podslice + if replica_index is not None: + self.replica_index_to_nodes[replica_index].append(node_id) + # Support filtering by TAG_RAY_NODE_KIND, TAG_RAY_NODE_STATUS, and + # TAG_RAY_USER_NODE_TYPE. + # The autoscaler only uses tag_filters={}, + # but filtering by the these keys is useful for testing. + filtered_nodes = [ + node + for node in all_nodes + if tag_filters.items() <= self.node_tags(node).items() + ] + return filtered_nodes + + def cur_num_workers(self): + """Returns dict mapping node type to the number of nodes of that type.""" + # Factor like this for convenient re-use. + return self._cur_num_workers(self.node_data_dict) + + def _cur_num_workers(self, node_data_dict: Dict[str, Any]): + num_workers_dict = defaultdict(int) + for node_data in node_data_dict.values(): + if node_data.kind == NODE_KIND_HEAD: + # Only track workers. 
+ continue + num_workers_dict[node_data.type] += 1 + return num_workers_dict + + def node_tags(self, node_id: str) -> Dict[str, str]: + node_data = self.node_data_dict[node_id] + tags = { + TAG_RAY_NODE_KIND: node_data.kind, + TAG_RAY_NODE_STATUS: node_data.status, + TAG_RAY_USER_NODE_TYPE: node_data.type, + } + if node_data.replica_index is not None: + tags[TAG_RAY_REPLICA_INDEX] = node_data.replica_index + return tags + + def internal_ip(self, node_id: str) -> str: + return self.node_data_dict[node_id].ip + + def create_node( + self, node_config: Dict[str, Any], tags: Dict[str, str], count: int + ) -> Optional[Dict[str, Any]]: + node_type = tags[TAG_RAY_USER_NODE_TYPE] + self.scale_request.desired_num_workers[node_type] += count + self.scale_change_needed = True + + def terminate_node(self, node_id: str) -> Optional[Dict[str, Any]]: + # Sanity check: We should never try to delete the same node twice. + if node_id in self.scale_request.workers_to_delete: + logger.warning( + f"Autoscaler tried to terminate node {node_id} twice in the same update" + ". Skipping termination request." + ) + return + + # Sanity check: We should never try to delete a node we haven't seen. + if node_id not in self.node_data_dict: + logger.warning( + f"Autoscaler tried to terminate unkown node {node_id}" + ". Skipping termination request." + ) + return + + node_type = self.node_data_dict[node_id].type + + # Sanity check: Don't request less than 0 nodes. + if self.scale_request.desired_num_workers[node_type] <= 0: + # This is logically impossible. + raise AssertionError( + "NodeProvider attempted to request less than 0 workers of type " + f"{node_type}. Skipping termination request." 
+ ) + + # Terminate node + self.scale_request.desired_num_workers[node_type] -= 1 + self.scale_request.workers_to_delete.add(node_id) + + # Scale down all nodes in replica if node_id is part of a multi-host podslice + tags = self.node_tags(node_id) + if TAG_RAY_REPLICA_INDEX in tags: + node_replica_index = tags[TAG_RAY_REPLICA_INDEX] + for worker_id in self.replica_index_to_nodes[node_replica_index]: + # Check if worker has already been scheduled to delete + if worker_id not in self.scale_request.workers_to_delete: + self.scale_request.workers_to_delete.add(worker_id) + logger.info( + f"Autoscaler terminating node {worker_id} " + f"in multi-host replica {node_replica_index}." + ) + self.scale_change_needed = True diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/command_runner.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/command_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..23ce60a3b0eb6e20f269dd3a59888df876af1789 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/command_runner.py @@ -0,0 +1,92 @@ +from typing import Any, Dict, List, Optional, Tuple + +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class CommandRunnerInterface: + """Interface to run commands on a remote cluster node. + + **Important**: This is an INTERNAL API that is only exposed for the purpose + of implementing custom node providers. It is not allowed to call into + CommandRunner methods from any Ray package outside the autoscaler, only to + define new implementations for use with the "external" node provider + option. 
+ + Command runner instances are returned by provider.get_command_runner().""" + + def run( + self, + cmd: Optional[str] = None, + timeout: int = 120, + exit_on_fail: bool = False, + port_forward: List[Tuple[int, int]] = None, + with_output: bool = False, + environment_variables: Optional[Dict[str, object]] = None, + run_env: str = "auto", + ssh_options_override_ssh_key: str = "", + shutdown_after_run: bool = False, + ) -> str: + """Run the given command on the cluster node and optionally get output. + + WARNING: the cloudgateway needs arguments of "run" function to be json + dumpable to send them over HTTP requests. + + Args: + cmd: The command to run. + timeout: The command timeout in seconds. + exit_on_fail: Whether to sys exit on failure. + port_forward: List of (local, remote) ports to forward, or + a single tuple. + with_output: Whether to return output. + environment_variables (Dict[str, str | int | Dict[str, str]): + Environment variables that `cmd` should be run with. + run_env: Options: docker/host/auto. Used in + DockerCommandRunner to determine the run environment. + ssh_options_override_ssh_key: if provided, overwrites + SSHOptions class with SSHOptions(ssh_options_override_ssh_key). + shutdown_after_run: if provided, shutdowns down the machine + after executing the command with `sudo shutdown -h now`. + """ + raise NotImplementedError + + def run_rsync_up( + self, source: str, target: str, options: Optional[Dict[str, Any]] = None + ) -> None: + """Rsync files up to the cluster node. + + Args: + source: The (local) source directory or file. + target: The (remote) destination path. + """ + raise NotImplementedError + + def run_rsync_down( + self, source: str, target: str, options: Optional[Dict[str, Any]] = None + ) -> None: + """Rsync files down from the cluster node. + + Args: + source: The (remote) source directory or file. + target: The (local) destination path. 
+ """ + raise NotImplementedError + + def remote_shell_command_str(self) -> str: + """Return the command the user can use to open a shell.""" + raise NotImplementedError + + def run_init( + self, *, as_head: bool, file_mounts: Dict[str, str], sync_run_yet: bool + ) -> Optional[bool]: + """Used to run extra initialization commands. + + Args: + as_head: Run as head image or worker. + file_mounts: Files to copy to the head and worker nodes. + sync_run_yet: Whether sync has been run yet. + + Returns: + optional: Whether initialization is necessary. + """ + pass diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..430078c7f0cd1f3185084dc21f7a51964e3ec3c7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79da71efe142dff528fd80f64bac2d80987ac57a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/gcp/defaults.yaml @@ -0,0 +1,171 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: default + +# The maximum number of workers nodes to launch in addition to the head +# node. +max_workers: 2 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. 
+# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: {} + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: gcp + region: us-west1 + availability_zone: us-west1-a + project_id: null # Globally unique project id + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. This requires that you have added the key into the +# project wide meta-data. +# ssh_private_key: /path/to/your/key.pem + +# Tell the autoscaler the allowed node types and the resources they provide. +# The key is the name of the node type, which is just for debugging purposes. +# The node config specifies the launch config and physical instance type. +available_node_types: + ray_head_default: + # The resources provided by this node type. + resources: {"CPU": 2} + # Provider-specific config for this node type, e.g. instance type. By default + # Ray will auto-configure unspecified fields such as subnets and ssh-keys. 
+ # For more documentation on available fields, see: + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + + # Additional options can be found in in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + + # If the network interface is specified as below in both head and worker + # nodes, the manual network config is used. Otherwise an existing subnet is + # used. To use a shared subnet, ask the subnet owner to grant permission + # for 'compute.subnetworks.use' to the ray autoscaler account... + # networkInterfaces: + # - kind: compute#networkInterface + # subnetwork: path/to/subnet + # aliasIpRanges: [] + ray_worker_small: + # The minimum number of nodes of this type to launch. + # This number should be >= 0. + min_workers: 0 + # The resources provided by this node type. + resources: {"CPU": 2} + # Provider-specific config for this node type, e.g. instance type. By default + # Ray will auto-configure unspecified fields such as subnets and ssh-keys. + # For more documentation on available fields, see: + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + node_config: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/common-cpu + # Run workers on preemtible instance by default. + # Comment this out to use on-demand. 
+ scheduling: + - preemptible: true + + # Additional options can be found in in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + +# Specify the node type of the head node (as configured above). +head_node_type: ray_head_default + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# Files or directories to copy from the head node to the worker nodes. The format is a +# list of paths. The same path on the head node will be copied to the worker node. +# This behavior is a subset of the file_mounts behavior. In the vast majority of cases +# you should just use file_mounts. Only use this if you know what you're doing! +cluster_synced_files: [] + +# Whether changes to directories in file_mounts or cluster_synced_files in the head node +# should sync to the worker node continuously +file_mounts_sync_continuously: False + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: [] + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: [] + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create an AMI that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). 
+ # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + # Install ray if not present + - >- + (stat /opt/conda/bin/ &> /dev/null && + echo 'export PATH="/opt/conda/bin:$PATH"' >> ~/.bashrc) || true + - which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" + + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install google-api-python-client==1.7.8 + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --head + --port=6379 + --object-manager-port=8076 + --autoscaling-config=~/ray_bootstrap_config.yaml + --dashboard-host=0.0.0.0 + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --address=$RAY_HEAD_IP:6379 + --object-manager-port=8076 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/launch_and_verify_cluster.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/launch_and_verify_cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..2dee563eac9a5c3d98abbb83a914c53a5121bb13 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/launch_and_verify_cluster.py @@ -0,0 +1,439 @@ +""" +This script automates the process of launching and verifying a Ray cluster using a given +cluster configuration file. It also handles cluster cleanup before and after the +verification process. The script requires one command-line argument: the path to the +cluster configuration file. 
+ +Usage: + python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES] + [--num-expected-nodes NUM_NODES] [--docker-override DOCKER_OVERRIDE] + [--wheel-override WHEEL_OVERRIDE] + +""" +import argparse +import os +import re +import subprocess +import sys +import tempfile +import time +import traceback +from pathlib import Path + +import boto3 +import botocore +import yaml +from google.cloud import storage + +import ray +from ray.autoscaler._private.aws.config import RAY + + +def check_arguments(): + """ + Check command line arguments and return the cluster configuration file path, the + number of retries, the number of expected nodes, and the value of the + --no-config-cache flag. + """ + parser = argparse.ArgumentParser(description="Launch and verify a Ray cluster") + parser.add_argument( + "--no-config-cache", + action="store_true", + help="Pass the --no-config-cache flag to Ray CLI commands", + ) + parser.add_argument( + "--retries", + type=int, + default=3, + help="Number of retries for verifying Ray is running (default: 3)", + ) + parser.add_argument( + "--num-expected-nodes", + type=int, + default=1, + help="Number of nodes for verifying Ray is running (default: 1)", + ) + parser.add_argument( + "--docker-override", + choices=["disable", "latest", "nightly", "commit"], + default="disable", + help="Override the docker image used for the head node and worker nodes", + ) + parser.add_argument( + "--wheel-override", + type=str, + default="", + help="Override the wheel used for the head node and worker nodes", + ) + parser.add_argument( + "cluster_config", type=str, help="Path to the cluster configuration file" + ) + args = parser.parse_args() + + assert not ( + args.docker_override != "disable" and args.wheel_override != "" + ), "Cannot override both docker and wheel" + + return ( + args.cluster_config, + args.retries, + args.no_config_cache, + args.num_expected_nodes, + args.docker_override, + args.wheel_override, + ) + + +def 
get_docker_image(docker_override): + """ + Get the docker image to use for the head node and worker nodes. + + Args: + docker_override: The value of the --docker-override flag. + + Returns: + The docker image to use for the head node and worker nodes, or None if not + applicable. + """ + if docker_override == "latest": + return "rayproject/ray:latest-py39" + elif docker_override == "nightly": + return "rayproject/ray:nightly-py39" + elif docker_override == "commit": + if re.match("^[0-9]+.[0-9]+.[0-9]+$", ray.__version__): + return f"rayproject/ray:{ray.__version__}.{ray.__commit__[:6]}-py39" + else: + print( + "Error: docker image is only available for " + f"release version, but we get: {ray.__version__}" + ) + sys.exit(1) + return None + + +def check_file(file_path): + """ + Check if the provided file path is valid and readable. + + Args: + file_path: The path of the file to check. + + Raises: + SystemExit: If the file is not readable or does not exist. + """ + if not file_path.is_file() or not os.access(file_path, os.R_OK): + print(f"Error: Cannot read cluster configuration file: {file_path}") + sys.exit(1) + + +def override_wheels_url(config_yaml, wheel_url): + setup_commands = config_yaml.get("setup_commands", []) + setup_commands.append( + f'pip3 uninstall -y ray && pip3 install -U "ray[default] @ {wheel_url}"' + ) + config_yaml["setup_commands"] = setup_commands + + +def override_docker_image(config_yaml, docker_image): + docker_config = config_yaml.get("docker", {}) + docker_config["image"] = docker_image + docker_config["container_name"] = "ray_container" + assert docker_config.get("head_image") is None, "Cannot override head_image" + assert docker_config.get("worker_image") is None, "Cannot override worker_image" + config_yaml["docker"] = docker_config + + +def download_ssh_key_aws(): + """Download the ssh key from the S3 bucket to the local machine.""" + print("======================================") + print("Downloading ssh key...") + # Create a Boto3 
client to interact with S3 + s3_client = boto3.client("s3", region_name="us-west-2") + + # Set the name of the S3 bucket and the key to download + bucket_name = "aws-cluster-launcher-test" + key_name = "ray-autoscaler_59_us-west-2.pem" + + # Download the key from the S3 bucket to a local file + local_key_path = os.path.expanduser(f"~/.ssh/{key_name}") + if not os.path.exists(os.path.dirname(local_key_path)): + os.makedirs(os.path.dirname(local_key_path)) + s3_client.download_file(bucket_name, key_name, local_key_path) + + # Set permissions on the key file + os.chmod(local_key_path, 0o400) + + +def download_ssh_key_gcp(): + """Download the ssh key from the google cloud bucket to the local machine.""" + print("======================================") + print("Downloading ssh key from GCP...") + + # Initialize the GCP storage client + client = storage.Client() + + # Set the name of the GCS bucket and the blob (key) to download + bucket_name = "gcp-cluster-launcher-release-test-ssh-keys" + key_name = "ray-autoscaler_gcp_us-west1_anyscale-bridge-cd812d38_ubuntu_0.pem" + + # Get the bucket and blob + bucket = client.get_bucket(bucket_name) + blob = bucket.get_blob(key_name) + + # Download the blob to a local file + local_key_path = os.path.expanduser(f"~/.ssh/{key_name}") + if not os.path.exists(os.path.dirname(local_key_path)): + os.makedirs(os.path.dirname(local_key_path)) + blob.download_to_filename(local_key_path) + + # Set permissions on the key file + os.chmod(local_key_path, 0o400) + + +def cleanup_cluster(config_yaml, cluster_config): + """ + Clean up the cluster using the given cluster configuration file. + + Args: + cluster_config: The path of the cluster configuration file. + """ + print("======================================") + print("Cleaning up cluster...") + + # We do multiple retries here because sometimes the cluster + # fails to clean up properly, resulting in a non-zero exit code (e.g. + # when processes have to be killed forcefully). 
+ + last_error = None + num_tries = 3 + for i in range(num_tries): + try: + subprocess.run( + ["ray", "down", "-v", "-y", str(cluster_config)], + check=True, + capture_output=True, + ) + cleanup_security_groups(config_yaml) + # Final success + return + except subprocess.CalledProcessError as e: + print(f"ray down fails[{i+1}/{num_tries}]: ") + print(e.output.decode("utf-8")) + + # Print full traceback + traceback.print_exc() + + # Print stdout and stderr from ray down + print(f"stdout:\n{e.stdout.decode('utf-8')}") + print(f"stderr:\n{e.stderr.decode('utf-8')}") + + last_error = e + + raise last_error + + +def cleanup_security_group(ec2_client, id): + retry = 0 + while retry < 10: + try: + ec2_client.delete_security_group(GroupId=id) + return + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "DependencyViolation": + sleep_time = 2**retry + print( + f"Waiting {sleep_time}s for the instance to be terminated before deleting the security group {id}" # noqa E501 + ) + time.sleep(sleep_time) + retry += 1 + else: + print(f"Error deleting security group: {e}") + return + + +def cleanup_security_groups(config): + provider_type = config.get("provider", {}).get("type") + if provider_type != "aws": + return + + try: + ec2_client = boto3.client("ec2", region_name="us-west-2") + response = ec2_client.describe_security_groups( + Filters=[ + { + "Name": "tag-key", + "Values": [RAY], + }, + { + "Name": "tag:ray-cluster-name", + "Values": [config["cluster_name"]], + }, + ] + ) + for security_group in response["SecurityGroups"]: + cleanup_security_group(ec2_client, security_group["GroupId"]) + except Exception as e: + print(f"Error cleaning up security groups: {e}") + + +def run_ray_commands( + config_yaml, cluster_config, retries, no_config_cache, num_expected_nodes=1 +): + """ + Run the necessary Ray commands to start a cluster, verify Ray is running, and clean + up the cluster. + + Args: + cluster_config: The path of the cluster configuration file. 
+ retries: The number of retries for the verification step. + no_config_cache: Whether to pass the --no-config-cache flag to the ray CLI + commands. + """ + + print("======================================") + print("Starting new cluster...") + cmd = ["ray", "up", "-v", "-y"] + if no_config_cache: + cmd.append("--no-config-cache") + cmd.append(str(cluster_config)) + + print(" ".join(cmd)) + + try: + subprocess.run(cmd, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + print(e.output) + # print stdout and stderr + print(f"stdout:\n{e.stdout.decode('utf-8')}") + print(f"stderr:\n{e.stderr.decode('utf-8')}") + raise e + + print("======================================") + print("Verifying Ray is running...") + + success = False + count = 0 + while count < retries: + try: + cmd = [ + "ray", + "exec", + "-v", + str(cluster_config), + ( + 'python -c \'import ray; ray.init("localhost:6379");' + + f" assert len(ray.nodes()) >= {num_expected_nodes}'" + ), + ] + if no_config_cache: + cmd.append("--no-config-cache") + subprocess.run(cmd, check=True) + success = True + break + except subprocess.CalledProcessError: + count += 1 + print(f"Verification failed. Retry attempt {count} of {retries}...") + time.sleep(60) + + if not success: + print("======================================") + print( + f"Error: Verification failed after {retries} attempts. Cleaning up cluster " + "before exiting..." 
+ ) + cleanup_cluster(config_yaml, cluster_config) + print("======================================") + print("Exiting script.") + sys.exit(1) + + print("======================================") + print("Ray verification successful.") + + cleanup_cluster(config_yaml, cluster_config) + + print("======================================") + print("Finished executing script successfully.") + + +if __name__ == "__main__": + ( + cluster_config, + retries, + no_config_cache, + num_expected_nodes, + docker_override, + wheel_override, + ) = check_arguments() + cluster_config = Path(cluster_config) + check_file(cluster_config) + + print(f"Using cluster configuration file: {cluster_config}") + print(f"Number of retries for 'verify ray is running' step: {retries}") + print(f"Using --no-config-cache flag: {no_config_cache}") + print(f"Number of expected nodes for 'verify ray is running': {num_expected_nodes}") + + config_yaml = yaml.safe_load(cluster_config.read_text()) + # Make the cluster name unique + config_yaml["cluster_name"] = ( + config_yaml["cluster_name"] + "-" + str(int(time.time())) + ) + + print("======================================") + print(f"Overriding ray wheel...: {wheel_override}") + if wheel_override: + override_wheels_url(config_yaml, wheel_override) + + print("======================================") + print(f"Overriding docker image...: {docker_override}") + docker_override_image = get_docker_image(docker_override) + print(f"Using docker image: {docker_override_image}") + if docker_override_image: + override_docker_image(config_yaml, docker_override_image) + + provider_type = config_yaml.get("provider", {}).get("type") + config_yaml["provider"]["cache_stopped_nodes"] = False + if provider_type == "aws": + download_ssh_key_aws() + elif provider_type == "gcp": + download_ssh_key_gcp() + # Get the active account email + account_email = ( + subprocess.run( + ["gcloud", "config", "get-value", "account"], + stdout=subprocess.PIPE, + check=True, + ) + 
.stdout.decode("utf-8") + .strip() + ) + print("Active account email:", account_email) + # Get the current project ID + project_id = ( + subprocess.run( + ["gcloud", "config", "get-value", "project"], + stdout=subprocess.PIPE, + check=True, + ) + .stdout.decode("utf-8") + .strip() + ) + print( + f"Injecting GCP project '{project_id}' into cluster configuration file..." + ) + config_yaml["provider"]["project_id"] = project_id + elif provider_type == "vsphere": + print("======================================") + print("VSPHERE provider detected.") + else: + print("======================================") + print("Provider type not recognized. Exiting script.") + sys.exit(1) + + # Create a new temporary file and dump the updated configuration into it + with tempfile.NamedTemporaryFile(suffix=".yaml") as temp: + temp.write(yaml.dump(config_yaml).encode("utf-8")) + temp.flush() + cluster_config = Path(temp.name) + run_ray_commands( + config_yaml, cluster_config, retries, no_config_cache, num_expected_nodes + ) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e8320b3c4c53a89a73ca25dffc8c4fe06852d68f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/coordinator_server.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/coordinator_server.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..4a6226646a83362f9e18a93b90b3396cdae24e48 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/__pycache__/coordinator_server.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/local/coordinator_server.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/coordinator_server.py new file mode 100644 index 0000000000000000000000000000000000000000..75c85b3966e54ea63980ea87a266cbd1e3d6fbb9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/coordinator_server.py @@ -0,0 +1,124 @@ +"""Web server that runs on local/private clusters to coordinate and manage +different clusters for multiple users. It receives node provider function calls +through HTTP requests from remote CoordinatorSenderNodeProvider and runs them +locally in LocalNodeProvider. To start the webserver the user runs: +`python coordinator_server.py --ips --port `.""" +import argparse +import json +import logging +import socket +import threading +from http.server import HTTPServer, SimpleHTTPRequestHandler + +from ray.autoscaler._private.local.node_provider import LocalNodeProvider + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def runner_handler(node_provider): + class Handler(SimpleHTTPRequestHandler): + """A custom handler for OnPremCoordinatorServer. + + Handles all requests and responses coming into and from the + remote CoordinatorSenderNodeProvider. + """ + + def _do_header(self, response_code=200, headers=None): + """Sends the header portion of the HTTP response. 
+ + Args: + response_code: Standard HTTP response code + headers (list[tuples]): Standard HTTP response headers + """ + if headers is None: + headers = [("Content-type", "application/json")] + + self.send_response(response_code) + for key, value in headers: + self.send_header(key, value) + self.end_headers() + + def do_HEAD(self): + """HTTP HEAD handler method.""" + self._do_header() + + def do_GET(self): + """Processes requests from remote CoordinatorSenderNodeProvider.""" + if self.headers["content-length"]: + raw_data = ( + self.rfile.read(int(self.headers["content-length"])) + ).decode("utf-8") + logger.info( + "OnPremCoordinatorServer received request: " + str(raw_data) + ) + request = json.loads(raw_data) + response = getattr(node_provider, request["type"])(*request["args"]) + logger.info( + "OnPremCoordinatorServer response content: " + str(raw_data) + ) + response_code = 200 + message = json.dumps(response) + self._do_header(response_code=response_code) + self.wfile.write(message.encode()) + + return Handler + + +class OnPremCoordinatorServer(threading.Thread): + """Initializes HTTPServer and serves CoordinatorSenderNodeProvider forever. + + It handles requests from the remote CoordinatorSenderNodeProvider. The + requests are forwarded to LocalNodeProvider function calls. 
+ """ + + def __init__(self, list_of_node_ips, host, port): + """Initialize HTTPServer and serve forever by invoking self.run().""" + + logger.info( + "Running on prem coordinator server on address " + host + ":" + str(port) + ) + threading.Thread.__init__(self) + self._port = port + self._list_of_node_ips = list_of_node_ips + address = (host, self._port) + config = {"list_of_node_ips": list_of_node_ips} + self._server = HTTPServer( + address, + runner_handler(LocalNodeProvider(config, cluster_name=None)), + ) + self.start() + + def run(self): + self._server.serve_forever() + + def shutdown(self): + """Shutdown the underlying server.""" + self._server.shutdown() + self._server.server_close() + + +def main(): + parser = argparse.ArgumentParser( + description="Please provide a list of node ips and port." + ) + parser.add_argument( + "--ips", required=True, help="Comma separated list of node ips." + ) + parser.add_argument( + "--port", + type=int, + required=True, + help="The port on which the coordinator listens.", + ) + args = parser.parse_args() + list_of_node_ips = args.ips.split(",") + OnPremCoordinatorServer( + list_of_node_ips=list_of_node_ips, + host=socket.gethostbyname(socket.gethostname()), + port=args.port, + ) + + +if __name__ == "__main__": + main() diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/local/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a3f8de6fb1fb1be8769256dbfc973512d33a75f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/local/defaults.yaml @@ -0,0 +1,33 @@ +# This configuration file is used internally +# to fill default settings for on-prem Ray clusters +# bootstrapped by the Ray autoscaler. +# For annotated examples, see the example yamls in this directory. 
+ +cluster_name: default + +auth: {} + +upscaling_speed: 1.0 +idle_timeout_minutes: 5 + +docker: {} + +# Defaults are empty to avoid any surprise changes to on-prem cluster's state. +# Refer to example yamls for examples of ray installation in setup commands. +initialization_commands: [] +setup_commands: [] +head_setup_commands: [] +worker_setup_commands: [] + +head_start_ray_commands: + - ray stop + - ulimit -c unlimited; ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml +worker_start_ray_commands: + - ray stop + - ray start --address=$RAY_HEAD_IP:6379 + +file_mounts: {} +cluster_synced_files: [] +file_mounts_sync_continuously: false +rsync_exclude: [] +rsync_filter: [] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/node_launch_exception.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/node_launch_exception.py new file mode 100644 index 0000000000000000000000000000000000000000..eb6bd25f2c612162b97bca2f721b4ebb9fad0242 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/node_launch_exception.py @@ -0,0 +1,37 @@ +from typing import Any, Optional, Tuple + +from ray.util.annotations import DeveloperAPI + + +@DeveloperAPI +class NodeLaunchException(Exception): + """A structured exception that can be thrown by a node provider during a + `create_node` call to pass additional information for observability. + """ + + def __init__( + self, + category: str, + description: str, + src_exc_info: Optional[Tuple[Any, Any, Any]], # The + ): + """Args: + category: A short (<20 chars) label for the error. + description: A longer, human readable description of the error. + src_exc_info: The source exception info if applicable. 
This is a + tuple of (type, exception, traceback) as returned by + sys.exc_info() + + """ + super().__init__(f"Node Launch Exception ({category}): {description}") + self.category = category + self.description = description + self.src_exc_info = src_exc_info + + def __reduce__(self): + # NOTE: Since tracebacks can't be pickled, we'll drop the optional + # traceback if we have to serialize this object. + return ( + self.__class__, + (self.category, self.description, None), + ) diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/node_provider.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/node_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..454584ddebb241bbc136c7f5417cd932ee068a38 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/node_provider.py @@ -0,0 +1,263 @@ +import logging +from types import ModuleType +from typing import Any, Dict, List, Optional + +from ray.autoscaler._private.command_runner import DockerCommandRunner, SSHCommandRunner +from ray.autoscaler.command_runner import CommandRunnerInterface +from ray.util.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class NodeProvider: + """Interface for getting and returning nodes from a Cloud. + + **Important**: This is an INTERNAL API that is only exposed for the purpose + of implementing custom node providers. It is not allowed to call into + NodeProvider methods from any Ray package outside the autoscaler, only to + define new implementations of NodeProvider for use with the "external" node + provider option. + + NodeProviders are namespaced by the `cluster_name` parameter; they only + operate on nodes within that namespace. + + Nodes may be in one of three states: {pending, running, terminated}. Nodes + appear immediately once started by `create_node`, and transition + immediately to terminated when `terminate_node` is called. 
+ """ + + def __init__(self, provider_config: Dict[str, Any], cluster_name: str) -> None: + self.provider_config = provider_config + self.cluster_name = cluster_name + self._internal_ip_cache: Dict[str, str] = {} + self._external_ip_cache: Dict[str, str] = {} + + def is_readonly(self) -> bool: + """Returns whether this provider is readonly. + + Readonly node providers do not allow nodes to be created or terminated. + """ + return False + + def non_terminated_nodes(self, tag_filters: Dict[str, str]) -> List[str]: + """Return a list of node ids filtered by the specified tags dict. + + This list must not include terminated nodes. For performance reasons, + providers are allowed to cache the result of a call to + non_terminated_nodes() to serve single-node queries + (e.g. is_running(node_id)). This means that non_terminate_nodes() must + be called again to refresh results. + + Examples: + >>> from ray.autoscaler.node_provider import NodeProvider + >>> from ray.autoscaler.tags import TAG_RAY_NODE_KIND + >>> provider = NodeProvider(...) # doctest: +SKIP + >>> provider.non_terminated_nodes( # doctest: +SKIP + ... 
{TAG_RAY_NODE_KIND: "worker"}) + ["node-1", "node-2"] + + """ + raise NotImplementedError + + def is_running(self, node_id: str) -> bool: + """Return whether the specified node is running.""" + raise NotImplementedError + + def is_terminated(self, node_id: str) -> bool: + """Return whether the specified node is terminated.""" + raise NotImplementedError + + def node_tags(self, node_id: str) -> Dict[str, str]: + """Returns the tags of the given node (string dict).""" + raise NotImplementedError + + def external_ip(self, node_id: str) -> str: + """Returns the external ip of the given node.""" + raise NotImplementedError + + def internal_ip(self, node_id: str) -> str: + """Returns the internal ip (Ray ip) of the given node.""" + raise NotImplementedError + + def get_node_id(self, ip_address: str, use_internal_ip: bool = False) -> str: + """Returns the node_id given an IP address. + + Assumes ip-address is unique per node. + + Args: + ip_address: Address of node. + use_internal_ip: Whether the ip address is + public or private. + + Raises: + ValueError if not found. + """ + + def find_node_id(): + if use_internal_ip: + return self._internal_ip_cache.get(ip_address) + else: + return self._external_ip_cache.get(ip_address) + + if not find_node_id(): + all_nodes = self.non_terminated_nodes({}) + ip_func = self.internal_ip if use_internal_ip else self.external_ip + ip_cache = ( + self._internal_ip_cache if use_internal_ip else self._external_ip_cache + ) + for node_id in all_nodes: + ip_cache[ip_func(node_id)] = node_id + + if not find_node_id(): + if use_internal_ip: + known_msg = f"Worker internal IPs: {list(self._internal_ip_cache)}" + else: + known_msg = f"Worker external IP: {list(self._external_ip_cache)}" + raise ValueError(f"ip {ip_address} not found. " + known_msg) + + return find_node_id() + + def create_node( + self, node_config: Dict[str, Any], tags: Dict[str, str], count: int + ) -> Optional[Dict[str, Any]]: + """Creates a number of nodes within the namespace. 
+ + Optionally returns a mapping from created node ids to node metadata. + + Optionally may throw a + ray.autoscaler.node_launch_exception.NodeLaunchException which the + autoscaler may use to provide additional functionality such as + observability. + + """ + raise NotImplementedError + + def create_node_with_resources_and_labels( + self, + node_config: Dict[str, Any], + tags: Dict[str, str], + count: int, + resources: Dict[str, float], + labels: Dict[str, str], + ) -> Optional[Dict[str, Any]]: + """Create nodes with a given resource and label config. + + This is the method actually called by the autoscaler. Prefer to + implement this when possible directly, otherwise it delegates to the + create_node() implementation. + + Optionally may throw a ray.autoscaler.node_launch_exception.NodeLaunchException. + """ + return self.create_node(node_config, tags, count) + + def set_node_tags(self, node_id: str, tags: Dict[str, str]) -> None: + """Sets the tag values (string dict) for the specified node.""" + raise NotImplementedError + + def terminate_node(self, node_id: str) -> Optional[Dict[str, Any]]: + """Terminates the specified node. + + Optionally return a mapping from deleted node ids to node + metadata. + """ + raise NotImplementedError + + def terminate_nodes(self, node_ids: List[str]) -> Optional[Dict[str, Any]]: + """Terminates a set of nodes. + + May be overridden with a batch method, which optionally may return a + mapping from deleted node ids to node metadata. + """ + for node_id in node_ids: + logger.info("NodeProvider: {}: Terminating node".format(node_id)) + self.terminate_node(node_id) + return None + + @property + def max_terminate_nodes(self) -> Optional[int]: + """The maximum number of nodes which can be terminated in one single + API request. By default, this is "None", which means that the node + provider's underlying API allows infinite requests to be terminated + with one request. 
+ + For example, AWS only allows 1000 nodes to be terminated + at once; to terminate more, we must issue multiple separate API + requests. If the limit is infinity, then simply set this to None. + + This may be overridden. The value may be useful when overriding the + "terminate_nodes" method. + """ + return None + + @staticmethod + def bootstrap_config(cluster_config: Dict[str, Any]) -> Dict[str, Any]: + """Bootstraps the cluster config by adding env defaults if needed.""" + return cluster_config + + def get_command_runner( + self, + log_prefix: str, + node_id: str, + auth_config: Dict[str, Any], + cluster_name: str, + process_runner: ModuleType, + use_internal_ip: bool, + docker_config: Optional[Dict[str, Any]] = None, + ) -> CommandRunnerInterface: + """Returns the CommandRunner class used to perform SSH commands. + + Args: + log_prefix: stores "NodeUpdater: {}: ".format(). Used + to print progress in the CommandRunner. + node_id: the node ID. + auth_config: the authentication configs from the autoscaler + yaml file. + cluster_name: the name of the cluster. + process_runner: the module to use to run the commands + in the CommandRunner. E.g., subprocess. + use_internal_ip: whether the node_id belongs to an internal ip + or external ip. + docker_config: If set, the docker information of the docker + container that commands should be run on. 
+ """ + common_args = { + "log_prefix": log_prefix, + "node_id": node_id, + "provider": self, + "auth_config": auth_config, + "cluster_name": cluster_name, + "process_runner": process_runner, + "use_internal_ip": use_internal_ip, + } + if docker_config and docker_config["container_name"] != "": + return DockerCommandRunner(docker_config, **common_args) + else: + return SSHCommandRunner(**common_args) + + def prepare_for_head_node(self, cluster_config: Dict[str, Any]) -> Dict[str, Any]: + """Returns a new cluster config with custom configs for head node.""" + return cluster_config + + @staticmethod + def fillout_available_node_types_resources( + cluster_config: Dict[str, Any] + ) -> Dict[str, Any]: + """Fills out missing "resources" field for available_node_types.""" + return cluster_config + + def safe_to_scale(self) -> bool: + """Optional condition to determine if it's safe to proceed with an autoscaling + update. Can be used to wait for convergence of state managed by an external + cluster manager. + + Called by the autoscaler immediately after non_terminated_nodes(). + If False is returned, the autoscaler will abort the update. + """ + return True + + def post_process(self) -> None: + """This optional method is executed at the end of + StandardAutoscaler._update(). 
+ """ + pass diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/ray-schema.json b/.venv/lib/python3.11/site-packages/ray/autoscaler/ray-schema.json new file mode 100644 index 0000000000000000000000000000000000000000..2e07dadac9121da275109e851d7603f51958409d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/ray-schema.json @@ -0,0 +1,400 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://github.com/ray-project/ray/python/ray/autoscaler/ray-schema.json", + "title": "Ray AutoScaler", + "description": "Ray autoscaler schema", + "type": "object", + "definitions": { + "commands": { + "type": "array", + "items": { + "type": "string", + "description": "shell command" + } + } + }, + "required": [ + "cluster_name", + "provider" + ], + "additionalProperties": false, + "properties": { + "cluster_name": { + "description": "A unique identifier for the head node and workers of this cluster.", + "type": "string" + }, + "max_workers": { + "description": "The maximum number of workers nodes to launch in addition to the head node. This should be no larger than the sum of min_workers for all available node types.", + "type": "integer", + "minimum": 0 + }, + "upscaling_speed": { + "description": "The autoscaler will scale up the cluster faster with higher upscaling speed. E.g., if the task requires adding more nodes then autoscaler will gradually scale up the cluster in chunks of upscaling_speed*currently_running_nodes. This number should be > 0.", + "type": "number", + "minimum": 0 + }, + "idle_timeout_minutes": { + "description": "If a node is idle for this many minutes, it will be removed.", + "type": "number", + "minimum": 0 + }, + "provider": { + "type": "object", + "description": "Cloud-provider specific configuration.", + "required": [ "type" ], + "additionalProperties": true, + "properties": { + "type": { + "type": "string", + "description": "e.g. aws, azure, gcp,..." 
+ }, + "region": { + "type": "string", + "description": "e.g. us-east-1" + }, + "module": { + "type": "string", + "description": "module, if using external node provider" + }, + "head_ip": { + "type": "string", + "description": "gcp project id, if using gcp" + }, + "worker_ips": { + "type": "array", + "description": "local cluster head node" + }, + "use_internal_ips": { + "type": "boolean", + "description": "don't require public ips" + }, + "namespace": { + "type": "string", + "description": "k8s namespace, if using k8s" + }, + "location": { + "type": "string", + "description": "Azure location" + }, + "resource_group": { + "type": "string", + "description": "Azure resource group" + }, + "tags": { + "type": "object", + "description": "Azure user-defined tags" + }, + "subscription_id": { + "type": "string", + "description": "Azure subscription id" + }, + "msi_identity_id": { + "type": "string", + "description": "User-defined managed identity (generated by config)" + }, + "msi_identity_principal_id": { + "type": "string", + "description": "User-defined managed identity principal id (generated by config)" + }, + "subnet_id": { + "type": "string", + "description": "Network subnet id" + }, + "autoscaler_service_account": { + "type": "object", + "description": "k8s autoscaler permissions, if using k8s" + }, + "autoscaler_role": { + "type": "object", + "description": "k8s autoscaler permissions, if using k8s" + }, + "autoscaler_role_binding": { + "type": "object", + "description": "k8s autoscaler permissions, if using k8s" + }, + "cache_stopped_nodes": { + "type": "boolean", + "description": " Whether to try to reuse previously stopped nodes instead of launching nodes. This will also cause the autoscaler to stop nodes instead of terminating them. Only implemented for AWS." 
+ }, + "availability_zone": { + "type": "string", + "description": "GCP availability zone" + }, + "project_id": { + "type": ["string", "null"], + "description": "GCP globally unique project id" + }, + "security_group": { + "type": "object", + "description": "AWS security group", + "additionalProperties": false, + "properties": { + "GroupName": { + "type": "string", + "description": "Security group name" + }, + "IpPermissions": { + "type": "array", + "description": "Security group in bound rules" + } + } + }, + "disable_node_updaters": { + "type": "boolean", + "description": "Disables node updaters if set to True. Default is False. (For Kubernetes operator usage.)" + }, + "gcp_credentials": { + "type": "object", + "description": "Credentials for authenticating with the GCP client", + "required": [ "type" ], + "additionalProperties": false, + "properties": { + "type": { + "type": "string", + "enum": ["credentials_token", "service_account"], + "description": "Credentials type: either temporary OAuth 2.0 token or permanent service account credentials blob." + }, + "credentials": { + "type": "string", + "description": "Oauth token or JSON string constituting service account credentials" + } + } + }, + "cloudwatch": { + "agent": { + "CLOUDWATCH_AGENT_INSTALLED_AMI_TAG": { + "type": ["string"], + "description": "Tag to be added to cloudwatch agent pre-installed AMI name." + }, + "config": { + "type": ["string", "null"], + "description": "Path to Unified CloudWatch Agent config file. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for additional details." + }, + "retryer": { + "max_attempts": { + "type": ["integer", "null"], + "description": "Max allowed Unified CloudWatch Agent installation attempts on any host." + }, + "delay_seconds": { + "type": ["integer", "null"], + "description": "Seconds to wait between each Unified CloudWatch Agent installation attempt." 
+ } + } + }, + "dashboard": { + "name": { + "type": ["string", "null"], + "description": "User defined CloudWatch Dashboard name." + }, + "config": { + "type": ["string", "null"], + "description": "Path to CloudWatch Dashboard config file. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/CloudWatch-Dashboard-Body-Structure.html for additional details." + } + }, + "alarm": { + "config": { + "type": ["string", "null"], + "description": "Path to CloudWatch Alarm config file. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_PutMetricAlarm.html for additional details." + } + } + } + } + }, + "auth": { + "type": "object", + "description": "How Ray will authenticate with newly launched nodes.", + "additionalProperties": false, + "properties": { + "ssh_user": { + "type": "string", + "default": "ubuntu" + }, + "ssh_public_key": { + "type": "string" + }, + "ssh_private_key": { + "type": "string" + }, + "ssh_proxy_command": { + "description": "A value for ProxyCommand ssh option, for connecting through proxies. Example: nc -x proxy.example.com:1234 %h %p", + "type": "string" + } + } + }, + "docker": { + "type": "object", + "description": "Docker configuration. 
If this is specified, all setup and start commands will be executed in the container.", + "additionalProperties": false, + "properties": { + "image": { + "type": "string", + "description": "the docker image name", + "default": "rayproject/ray:latest" + }, + "container_name": { + "type": "string", + "default": "ray_docker" + }, + "pull_before_run": { + "type": "boolean", + "description": "run `docker pull` first" + }, + "run_options": { + "type": "array", + "description": "shared options for starting head/worker docker" + }, + "head_image": { + "type": "string", + "description": "image for head node, takes precedence over 'image' if specified" + }, + "head_run_options": { + "type": "array", + "description": "head specific run options, appended to run_options" + }, + "worker_image": { + "type": "string", + "description": "analogous to head_image" + }, + "worker_run_options": { + "type": "array", + "description": "analogous to head_run_options" + }, + "disable_automatic_runtime_detection" : { + "type": "boolean", + "description": "disable Ray from automatically using the NVIDIA runtime if available", + "default": false + }, + "disable_shm_size_detection" : { + "type": "boolean", + "description": "disable Ray from automatically detecting /dev/shm size for the container", + "default": false + }, + "use_podman" : { + "type": "boolean", + "description": "Use 'podman' command in place of 'docker'", + "default": false + } + } + }, + "head_node_type": { + "type": "string", + "description": "If using multiple node types, specifies the head node type." + }, + "file_mounts": { + "type": "object", + "description": "Map of remote paths to local paths, e.g. {\"/tmp/data\": \"/my/local/data\"}" + }, + "cluster_synced_files": { + "type": "array", + "description": "List of paths on the head node which should sync to the worker nodes, e.g. 
[\"/some/data/somehwere\"]" + }, + "file_mounts_sync_continuously": { + "type": "boolean", + "description": "If enabled, file mounts will sync continously between the head node and the worker nodes. The nodes will not re-run setup commands if only the contents of the file mounts folders change." + }, + "rsync_exclude": { + "type": "array", + "description": "File pattern to not sync up or down when using the rsync command. Matches the format of rsync's --exclude param." + }, + "rsync_filter": { + "type": "array", + "description": "Pattern files to lookup patterns to exclude when using rsync up or rsync down. This file is checked for recursively in all directories. For example, if .gitignore is provided here, the behavior will match git's .gitignore behavior." + }, + "metadata": { + "type": "object", + "description": "Metadata field that can be used to store user-defined data in the cluster config. Ray does not interpret these fields." + }, + "initialization_commands": { + "$ref": "#/definitions/commands", + "description": "List of commands that will be run before `setup_commands`. If docker is enabled, these commands will run outside the container and before docker is setup." + }, + "setup_commands": { + "$ref": "#/definitions/commands", + "description": "List of common shell commands to run to setup nodes." + }, + "head_setup_commands": { + "$ref": "#/definitions/commands", + "description": "Commands that will be run on the head node after common setup." + }, + "worker_setup_commands": { + "$ref": "#/definitions/commands", + "description": "Commands that will be run on worker nodes after common setup." + }, + "head_start_ray_commands": { + "$ref": "#/definitions/commands", + "description": "Command to start ray on the head node. You shouldn't need to modify this." + }, + "worker_start_ray_commands": { + "$ref": "#/definitions/commands", + "description": "Command to start ray on worker nodes. You shouldn't need to modify this." 
+ }, + "no_restart": { + "description": "Whether to avoid restarting the cluster during updates. This field is controlled by the ray --no-restart flag and cannot be set by the user." + }, + "available_node_types": { + "type": "object", + "description": "A list of node types for multi-node-type autoscaling.", + "patternProperties": { + ".*": { + "type": "object", + "required": [ "resources", "node_config" ], + "properties": { + "node_config": { + "type": "object", + "description": "Provider-specific config for the node, e.g. instance type." + }, + "min_workers": {"type": "integer"}, + "max_workers": {"type": "integer"}, + "idle_timeout_s": {"type": "number", "nullable": true}, + "resources": { + "type": "object", + "patternProperties": { + ".*":{ + "type": "integer", + "minimum": 0 + } + } + }, + "labels": { + "type": "object", + "patternProperties": { + ".*":{ + "type": "string" + } + } + }, + "initialization_commands": { + "$ref": "#/definitions/commands", + "description": "List of commands that will be run before `setup_commands`. If docker is enabled, these commands will run outside the container and before docker is setup." + }, + "worker_setup_commands": { + "$ref": "#/definitions/commands", + "description": "List of common shell commands to run to setup nodes. This node specfic list will override the global setup_commands and worker_setup_commands." + }, + "docker": { + "description": "Configuration of Worker nodes.", + "type": "object", + "properties": { + "pull_before_run": { + "type": "boolean", + "description": "run `docker pull` first" + }, + "worker_image": { + "type": "string", + "description": "analogous to head_image" + }, + "worker_run_options": { + "type": "array", + "description": "analogous to head_run_options, merged with the global docker run_options." 
+ } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false + } + } +} diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/__init__.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b041a97381f82e10e235b5972cfc8cb661b11d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/defaults.yaml b/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/defaults.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc69977cfc6f5678caf5a2c77df3072df088c21c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/spark/defaults.yaml @@ -0,0 +1,49 @@ +# Example command to start a cluster with this config: +# +# ray start --autoscaling-config=default.yaml --head --block +# +cluster_name: spark +max_workers: 8 +provider: + type: spark + # This must be true since the nodes share the same ip! + use_node_id_as_ip: True + disable_node_updaters: True + disable_launch_config_check: True +available_node_types: + ray.head.default: + # You must set this manually to your "head" node resources!! The head + # node is launched via `ray start` and hence the autoscaler cannot + # configure its resources. The resources specified for its node type + # must line up with what Ray detects/is configured with on start. 
+ resources: + CPU: 8 # <-- set this to num CPUs used/detected in `ray start` + GPU: 0 # <-- set this to num GPUs used/detected in `ray start` + node_config: {} + max_workers: 0 + ray.worker: + resources: + CPU: 1 + object_store_memory: 1000000000 + node_config: {} + min_workers: 0 + max_workers: 4 +head_node_type: ray.head.default +upscaling_speed: 1.0 +idle_timeout_minutes: 1.0 +# +# !!! Configurations below are not supported in spark cluster mode +# +auth: {} +docker: {} +initialization_commands: [] +setup_commands: [] +head_setup_commands: [] +worker_setup_commands: [] +head_start_ray_commands: [] +worker_start_ray_commands: [] +file_mounts: {} +cluster_synced_files: [] +file_mounts_sync_continuously: false +rsync_exclude: [] +rsync_filter: [] diff --git a/.venv/lib/python3.11/site-packages/ray/autoscaler/tags.py b/.venv/lib/python3.11/site-packages/ray/autoscaler/tags.py new file mode 100644 index 0000000000000000000000000000000000000000..38d03855040fd512e32b2d31f9972e1da8dcedbb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/autoscaler/tags.py @@ -0,0 +1,47 @@ +"""The Ray autoscaler uses tags/labels to associate metadata with instances.""" + +# Tag for the name of the node +TAG_RAY_NODE_NAME = "ray-node-name" + +# Tag for the kind of node (e.g. Head, Worker). For legacy reasons, the tag +# value says 'type' instead of 'kind'. +TAG_RAY_NODE_KIND = "ray-node-type" +NODE_KIND_HEAD = "head" +NODE_KIND_WORKER = "worker" +NODE_KIND_UNMANAGED = "unmanaged" + +# Tag for user defined node types (e.g., m4xl_spot). This is used for multi +# node type clusters. +TAG_RAY_USER_NODE_TYPE = "ray-user-node-type" +# Tag for index of replica node belongs to. Used for multi-host worker groups. +TAG_RAY_REPLICA_INDEX = "ray-replica-index" +# Tag for autofilled node types for legacy cluster yamls without multi +# node type defined in the cluster configs. 
+NODE_TYPE_LEGACY_HEAD = "ray-legacy-head-node-type" +NODE_TYPE_LEGACY_WORKER = "ray-legacy-worker-node-type" + +# Tag that reports the current state of the node (e.g. Updating, Up-to-date) +TAG_RAY_NODE_STATUS = "ray-node-status" +STATUS_UNINITIALIZED = "uninitialized" +STATUS_WAITING_FOR_SSH = "waiting-for-ssh" +STATUS_SYNCING_FILES = "syncing-files" +STATUS_SETTING_UP = "setting-up" +STATUS_UPDATE_FAILED = "update-failed" +STATUS_UP_TO_DATE = "up-to-date" + +# Tag uniquely identifying all nodes of a cluster +TAG_RAY_CLUSTER_NAME = "ray-cluster-name" + +# Hash of the node launch config, used to identify out-of-date nodes +TAG_RAY_LAUNCH_CONFIG = "ray-launch-config" + +# Hash of the node runtime config, used to determine if updates are needed +TAG_RAY_RUNTIME_CONFIG = "ray-runtime-config" +# Hash of the contents of the directories specified by the file_mounts config +# if the node is a worker, this also hashes content of the directories +# specified by the cluster_synced_files config +TAG_RAY_FILE_MOUNTS_CONTENTS = "ray-file-mounts-contents" + +# Tag for the launch request id, used to identify nodes launched by the same +# launch request. 
+TAG_RAY_LAUNCH_REQUEST = "ray-launch-request" diff --git a/.venv/lib/python3.11/site-packages/ray/internal/__init__.py b/.venv/lib/python3.11/site-packages/ray/internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..36740dd1870a1e010cad2f438bb89ab1a0353047 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/internal/__init__.py @@ -0,0 +1,3 @@ +from ray._private.internal_api import free + +__all__ = ["free"] diff --git a/.venv/lib/python3.11/site-packages/ray/internal/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/internal/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98b3dd1d72c37643960e94d5a033f1b246521aa2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/internal/__pycache__/__init__.cpython-311.pyc differ