Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc +3 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/agent.py +465 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/consts.py +91 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/dashboard.py +275 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/dashboard_metrics.py +123 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/datacenter.py +285 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/head.py +351 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/http_server_agent.py +83 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/http_server_head.py +289 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/k8s_utils.py +111 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/memory_utils.py +524 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/dashboard_sdk.py +418 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/data_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/data_head.py +167 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_supervisor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/pydantic_models.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli.py +521 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli_utils.py +56 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/common.py +538 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_agent.py +211 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_head.py +587 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_log_storage_client.py +61 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_manager.py +640 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_supervisor.py +477 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/pydantic_models.py +110 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/sdk.py +492 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/utils.py +304 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_agent.py +404 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_consts.py +8 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_manager.py +481 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_utils.py +9 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/sdk.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_rest_api_impl.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -154,3 +154,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 154 |
.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
|
| 155 |
.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 156 |
.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 154 |
.venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
|
| 155 |
.venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 156 |
.venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 157 |
+
.venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d5a3dfa5e053841216226fbf83943d5cd4c680ae8ea252c2354cd124c900752
|
| 3 |
+
size 143846
|
.venv/lib/python3.11/site-packages/ray/dashboard/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/agent.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import asyncio
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import logging.handlers
|
| 6 |
+
import os
|
| 7 |
+
import pathlib
|
| 8 |
+
import signal
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
import ray
|
| 12 |
+
import ray._private.ray_constants as ray_constants
|
| 13 |
+
import ray._private.services
|
| 14 |
+
import ray._private.utils
|
| 15 |
+
import ray.dashboard.consts as dashboard_consts
|
| 16 |
+
import ray.dashboard.utils as dashboard_utils
|
| 17 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 18 |
+
from ray._private.process_watcher import create_check_raylet_task
|
| 19 |
+
from ray._private.ray_constants import AGENT_GRPC_MAX_MESSAGE_LENGTH
|
| 20 |
+
from ray._private.ray_logging import configure_log_file, setup_component_logger
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class DashboardAgent:
|
| 26 |
+
def __init__(
|
| 27 |
+
self,
|
| 28 |
+
node_ip_address,
|
| 29 |
+
dashboard_agent_port,
|
| 30 |
+
gcs_address,
|
| 31 |
+
cluster_id_hex,
|
| 32 |
+
minimal,
|
| 33 |
+
metrics_export_port=None,
|
| 34 |
+
node_manager_port=None,
|
| 35 |
+
listen_port=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
|
| 36 |
+
disable_metrics_collection: bool = False,
|
| 37 |
+
*, # the following are required kwargs
|
| 38 |
+
object_store_name: str,
|
| 39 |
+
raylet_name: str,
|
| 40 |
+
log_dir: str,
|
| 41 |
+
temp_dir: str,
|
| 42 |
+
session_dir: str,
|
| 43 |
+
logging_params: dict,
|
| 44 |
+
agent_id: int,
|
| 45 |
+
session_name: str,
|
| 46 |
+
):
|
| 47 |
+
"""Initialize the DashboardAgent object."""
|
| 48 |
+
# Public attributes are accessible for all agent modules.
|
| 49 |
+
self.ip = node_ip_address
|
| 50 |
+
self.minimal = minimal
|
| 51 |
+
|
| 52 |
+
assert gcs_address is not None
|
| 53 |
+
self.gcs_address = gcs_address
|
| 54 |
+
self.cluster_id_hex = cluster_id_hex
|
| 55 |
+
|
| 56 |
+
self.temp_dir = temp_dir
|
| 57 |
+
self.session_dir = session_dir
|
| 58 |
+
self.log_dir = log_dir
|
| 59 |
+
self.dashboard_agent_port = dashboard_agent_port
|
| 60 |
+
self.metrics_export_port = metrics_export_port
|
| 61 |
+
self.node_manager_port = node_manager_port
|
| 62 |
+
self.listen_port = listen_port
|
| 63 |
+
self.object_store_name = object_store_name
|
| 64 |
+
self.raylet_name = raylet_name
|
| 65 |
+
self.logging_params = logging_params
|
| 66 |
+
self.node_id = os.environ["RAY_NODE_ID"]
|
| 67 |
+
self.metrics_collection_disabled = disable_metrics_collection
|
| 68 |
+
self.agent_id = agent_id
|
| 69 |
+
self.session_name = session_name
|
| 70 |
+
|
| 71 |
+
# grpc server is None in mininal.
|
| 72 |
+
self.server = None
|
| 73 |
+
# http_server is None in minimal.
|
| 74 |
+
self.http_server = None
|
| 75 |
+
|
| 76 |
+
# Used by the agent and sub-modules.
|
| 77 |
+
self.gcs_aio_client = GcsAioClient(
|
| 78 |
+
address=self.gcs_address,
|
| 79 |
+
nums_reconnect_retry=ray._config.gcs_rpc_server_reconnect_timeout_s(),
|
| 80 |
+
cluster_id=self.cluster_id_hex,
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
if not self.minimal:
|
| 84 |
+
self._init_non_minimal()
|
| 85 |
+
|
| 86 |
+
def _init_non_minimal(self):
|
| 87 |
+
from ray._private.gcs_pubsub import GcsAioPublisher
|
| 88 |
+
from ray.dashboard.http_server_agent import HttpServerAgent
|
| 89 |
+
|
| 90 |
+
self.aio_publisher = GcsAioPublisher(address=self.gcs_address)
|
| 91 |
+
|
| 92 |
+
try:
|
| 93 |
+
from grpc import aio as aiogrpc
|
| 94 |
+
except ImportError:
|
| 95 |
+
from grpc.experimental import aio as aiogrpc
|
| 96 |
+
|
| 97 |
+
# We would want to suppress deprecating warnings from aiogrpc library
|
| 98 |
+
# with the usage of asyncio.get_event_loop() in python version >=3.10
|
| 99 |
+
# This could be removed once https://github.com/grpc/grpc/issues/32526
|
| 100 |
+
# is released, and we used higher versions of grpcio that that.
|
| 101 |
+
if sys.version_info.major >= 3 and sys.version_info.minor >= 10:
|
| 102 |
+
import warnings
|
| 103 |
+
|
| 104 |
+
with warnings.catch_warnings():
|
| 105 |
+
warnings.simplefilter("ignore", category=DeprecationWarning)
|
| 106 |
+
aiogrpc.init_grpc_aio()
|
| 107 |
+
else:
|
| 108 |
+
aiogrpc.init_grpc_aio()
|
| 109 |
+
|
| 110 |
+
self.server = aiogrpc.server(
|
| 111 |
+
options=(
|
| 112 |
+
("grpc.so_reuseport", 0),
|
| 113 |
+
(
|
| 114 |
+
"grpc.max_send_message_length",
|
| 115 |
+
AGENT_GRPC_MAX_MESSAGE_LENGTH,
|
| 116 |
+
), # noqa
|
| 117 |
+
(
|
| 118 |
+
"grpc.max_receive_message_length",
|
| 119 |
+
AGENT_GRPC_MAX_MESSAGE_LENGTH,
|
| 120 |
+
),
|
| 121 |
+
) # noqa
|
| 122 |
+
)
|
| 123 |
+
grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
|
| 124 |
+
try:
|
| 125 |
+
self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
|
| 126 |
+
self.server, f"{grpc_ip}:{self.dashboard_agent_port}"
|
| 127 |
+
)
|
| 128 |
+
except Exception:
|
| 129 |
+
# TODO(SongGuyang): Catch the exception here because there is
|
| 130 |
+
# port conflict issue which brought from static port. We should
|
| 131 |
+
# remove this after we find better port resolution.
|
| 132 |
+
logger.exception(
|
| 133 |
+
"Failed to add port to grpc server. Agent will stay alive but "
|
| 134 |
+
"disable the grpc service."
|
| 135 |
+
)
|
| 136 |
+
self.server = None
|
| 137 |
+
self.grpc_port = None
|
| 138 |
+
else:
|
| 139 |
+
logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port)
|
| 140 |
+
|
| 141 |
+
# If the agent is not minimal it should start the http server
|
| 142 |
+
# to communicate with the dashboard in a head node.
|
| 143 |
+
# Http server is not started in the minimal version because
|
| 144 |
+
# it requires additional dependencies that are not
|
| 145 |
+
# included in the minimal ray package.
|
| 146 |
+
self.http_server = HttpServerAgent(self.ip, self.listen_port)
|
| 147 |
+
|
| 148 |
+
def _load_modules(self):
|
| 149 |
+
"""Load dashboard agent modules."""
|
| 150 |
+
modules = []
|
| 151 |
+
agent_cls_list = dashboard_utils.get_all_modules(
|
| 152 |
+
dashboard_utils.DashboardAgentModule
|
| 153 |
+
)
|
| 154 |
+
for cls in agent_cls_list:
|
| 155 |
+
logger.info(
|
| 156 |
+
"Loading %s: %s", dashboard_utils.DashboardAgentModule.__name__, cls
|
| 157 |
+
)
|
| 158 |
+
c = cls(self)
|
| 159 |
+
modules.append(c)
|
| 160 |
+
logger.info("Loaded %d modules.", len(modules))
|
| 161 |
+
return modules
|
| 162 |
+
|
| 163 |
+
@property
|
| 164 |
+
def http_session(self):
|
| 165 |
+
assert (
|
| 166 |
+
self.http_server
|
| 167 |
+
), "Accessing unsupported API (HttpServerAgent) in a minimal ray."
|
| 168 |
+
return self.http_server.http_session
|
| 169 |
+
|
| 170 |
+
@property
|
| 171 |
+
def publisher(self):
|
| 172 |
+
assert (
|
| 173 |
+
self.aio_publisher
|
| 174 |
+
), "Accessing unsupported API (GcsAioPublisher) in a minimal ray."
|
| 175 |
+
return self.aio_publisher
|
| 176 |
+
|
| 177 |
+
def get_node_id(self) -> str:
|
| 178 |
+
return self.node_id
|
| 179 |
+
|
| 180 |
+
async def run(self):
|
| 181 |
+
# Start a grpc asyncio server.
|
| 182 |
+
if self.server:
|
| 183 |
+
await self.server.start()
|
| 184 |
+
|
| 185 |
+
modules = self._load_modules()
|
| 186 |
+
|
| 187 |
+
if self.http_server:
|
| 188 |
+
try:
|
| 189 |
+
await self.http_server.start(modules)
|
| 190 |
+
except Exception:
|
| 191 |
+
# TODO(SongGuyang): Catch the exception here because there is
|
| 192 |
+
# port conflict issue which brought from static port. We should
|
| 193 |
+
# remove this after we find better port resolution.
|
| 194 |
+
logger.exception(
|
| 195 |
+
"Failed to start http server. Agent will stay alive but "
|
| 196 |
+
"disable the http service."
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Writes agent address to kv.
|
| 200 |
+
# DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX: <node_id> -> (ip, http_port, grpc_port)
|
| 201 |
+
# DASHBOARD_AGENT_ADDR_IP_PREFIX: <ip> -> (node_id, http_port, grpc_port)
|
| 202 |
+
# -1 should indicate that http server is not started.
|
| 203 |
+
http_port = -1 if not self.http_server else self.http_server.http_port
|
| 204 |
+
grpc_port = -1 if not self.server else self.grpc_port
|
| 205 |
+
put_by_node_id = self.gcs_aio_client.internal_kv_put(
|
| 206 |
+
f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{self.node_id}".encode(),
|
| 207 |
+
json.dumps([self.ip, http_port, grpc_port]).encode(),
|
| 208 |
+
True,
|
| 209 |
+
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
|
| 210 |
+
)
|
| 211 |
+
put_by_ip = self.gcs_aio_client.internal_kv_put(
|
| 212 |
+
f"{dashboard_consts.DASHBOARD_AGENT_ADDR_IP_PREFIX}{self.ip}".encode(),
|
| 213 |
+
json.dumps([self.node_id, http_port, grpc_port]).encode(),
|
| 214 |
+
True,
|
| 215 |
+
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
await asyncio.gather(put_by_node_id, put_by_ip)
|
| 219 |
+
|
| 220 |
+
tasks = [m.run(self.server) for m in modules]
|
| 221 |
+
|
| 222 |
+
if sys.platform not in ["win32", "cygwin"]:
|
| 223 |
+
|
| 224 |
+
def callback(msg):
|
| 225 |
+
logger.info(
|
| 226 |
+
f"Terminated Raylet: ip={self.ip}, node_id={self.node_id}. {msg}"
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
check_parent_task = create_check_raylet_task(
|
| 230 |
+
self.log_dir, self.gcs_address, callback, loop
|
| 231 |
+
)
|
| 232 |
+
tasks.append(check_parent_task)
|
| 233 |
+
|
| 234 |
+
if self.server:
|
| 235 |
+
tasks.append(self.server.wait_for_termination())
|
| 236 |
+
else:
|
| 237 |
+
|
| 238 |
+
async def wait_forever():
|
| 239 |
+
while True:
|
| 240 |
+
await asyncio.sleep(3600)
|
| 241 |
+
|
| 242 |
+
tasks.append(wait_forever())
|
| 243 |
+
|
| 244 |
+
await asyncio.gather(*tasks)
|
| 245 |
+
|
| 246 |
+
if self.http_server:
|
| 247 |
+
await self.http_server.cleanup()
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def open_capture_files(log_dir):
|
| 251 |
+
filename = f"agent-{args.agent_id}"
|
| 252 |
+
return (
|
| 253 |
+
ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.out"),
|
| 254 |
+
ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.err"),
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
if __name__ == "__main__":
|
| 259 |
+
parser = argparse.ArgumentParser(description="Dashboard agent.")
|
| 260 |
+
parser.add_argument(
|
| 261 |
+
"--node-ip-address",
|
| 262 |
+
required=True,
|
| 263 |
+
type=str,
|
| 264 |
+
help="the IP address of this node.",
|
| 265 |
+
)
|
| 266 |
+
parser.add_argument(
|
| 267 |
+
"--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
|
| 268 |
+
)
|
| 269 |
+
parser.add_argument(
|
| 270 |
+
"--cluster-id-hex",
|
| 271 |
+
required=True,
|
| 272 |
+
type=str,
|
| 273 |
+
help="The cluster id in hex.",
|
| 274 |
+
)
|
| 275 |
+
parser.add_argument(
|
| 276 |
+
"--metrics-export-port",
|
| 277 |
+
required=True,
|
| 278 |
+
type=int,
|
| 279 |
+
help="The port to expose metrics through Prometheus.",
|
| 280 |
+
)
|
| 281 |
+
parser.add_argument(
|
| 282 |
+
"--dashboard-agent-port",
|
| 283 |
+
required=True,
|
| 284 |
+
type=int,
|
| 285 |
+
help="The port on which the dashboard agent will receive GRPCs.",
|
| 286 |
+
)
|
| 287 |
+
parser.add_argument(
|
| 288 |
+
"--node-manager-port",
|
| 289 |
+
required=True,
|
| 290 |
+
type=int,
|
| 291 |
+
help="The port to use for starting the node manager",
|
| 292 |
+
)
|
| 293 |
+
parser.add_argument(
|
| 294 |
+
"--object-store-name",
|
| 295 |
+
required=True,
|
| 296 |
+
type=str,
|
| 297 |
+
default=None,
|
| 298 |
+
help="The socket name of the plasma store",
|
| 299 |
+
)
|
| 300 |
+
parser.add_argument(
|
| 301 |
+
"--listen-port",
|
| 302 |
+
required=False,
|
| 303 |
+
type=int,
|
| 304 |
+
default=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
|
| 305 |
+
help="Port for HTTP server to listen on",
|
| 306 |
+
)
|
| 307 |
+
parser.add_argument(
|
| 308 |
+
"--raylet-name",
|
| 309 |
+
required=True,
|
| 310 |
+
type=str,
|
| 311 |
+
default=None,
|
| 312 |
+
help="The socket path of the raylet process",
|
| 313 |
+
)
|
| 314 |
+
parser.add_argument(
|
| 315 |
+
"--logging-level",
|
| 316 |
+
required=False,
|
| 317 |
+
type=lambda s: logging.getLevelName(s.upper()),
|
| 318 |
+
default=ray_constants.LOGGER_LEVEL,
|
| 319 |
+
choices=ray_constants.LOGGER_LEVEL_CHOICES,
|
| 320 |
+
help=ray_constants.LOGGER_LEVEL_HELP,
|
| 321 |
+
)
|
| 322 |
+
parser.add_argument(
|
| 323 |
+
"--logging-format",
|
| 324 |
+
required=False,
|
| 325 |
+
type=str,
|
| 326 |
+
default=ray_constants.LOGGER_FORMAT,
|
| 327 |
+
help=ray_constants.LOGGER_FORMAT_HELP,
|
| 328 |
+
)
|
| 329 |
+
parser.add_argument(
|
| 330 |
+
"--logging-filename",
|
| 331 |
+
required=False,
|
| 332 |
+
type=str,
|
| 333 |
+
default=dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME,
|
| 334 |
+
help="Specify the name of log file, "
|
| 335 |
+
'log to stdout if set empty, default is "{}".'.format(
|
| 336 |
+
dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME
|
| 337 |
+
),
|
| 338 |
+
)
|
| 339 |
+
parser.add_argument(
|
| 340 |
+
"--logging-rotate-bytes",
|
| 341 |
+
required=False,
|
| 342 |
+
type=int,
|
| 343 |
+
default=ray_constants.LOGGING_ROTATE_BYTES,
|
| 344 |
+
help="Specify the max bytes for rotating "
|
| 345 |
+
"log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
|
| 346 |
+
)
|
| 347 |
+
parser.add_argument(
|
| 348 |
+
"--logging-rotate-backup-count",
|
| 349 |
+
required=False,
|
| 350 |
+
type=int,
|
| 351 |
+
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
| 352 |
+
help="Specify the backup count of rotated log file, default is {}.".format(
|
| 353 |
+
ray_constants.LOGGING_ROTATE_BACKUP_COUNT
|
| 354 |
+
),
|
| 355 |
+
)
|
| 356 |
+
parser.add_argument(
|
| 357 |
+
"--log-dir",
|
| 358 |
+
required=True,
|
| 359 |
+
type=str,
|
| 360 |
+
default=None,
|
| 361 |
+
help="Specify the path of log directory.",
|
| 362 |
+
)
|
| 363 |
+
parser.add_argument(
|
| 364 |
+
"--temp-dir",
|
| 365 |
+
required=True,
|
| 366 |
+
type=str,
|
| 367 |
+
default=None,
|
| 368 |
+
help="Specify the path of the temporary directory use by Ray process.",
|
| 369 |
+
)
|
| 370 |
+
parser.add_argument(
|
| 371 |
+
"--session-dir",
|
| 372 |
+
required=True,
|
| 373 |
+
type=str,
|
| 374 |
+
default=None,
|
| 375 |
+
help="Specify the path of this session.",
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
parser.add_argument(
|
| 379 |
+
"--minimal",
|
| 380 |
+
action="store_true",
|
| 381 |
+
help=(
|
| 382 |
+
"Minimal agent only contains a subset of features that don't "
|
| 383 |
+
"require additional dependencies installed when ray is installed "
|
| 384 |
+
"by `pip install 'ray[default]'`."
|
| 385 |
+
),
|
| 386 |
+
)
|
| 387 |
+
parser.add_argument(
|
| 388 |
+
"--disable-metrics-collection",
|
| 389 |
+
action="store_true",
|
| 390 |
+
help=("If this arg is set, metrics report won't be enabled from the agent."),
|
| 391 |
+
)
|
| 392 |
+
parser.add_argument(
|
| 393 |
+
"--agent-id",
|
| 394 |
+
required=True,
|
| 395 |
+
type=int,
|
| 396 |
+
help="ID to report when registering with raylet",
|
| 397 |
+
default=os.getpid(),
|
| 398 |
+
)
|
| 399 |
+
parser.add_argument(
|
| 400 |
+
"--session-name",
|
| 401 |
+
required=False,
|
| 402 |
+
type=str,
|
| 403 |
+
default=None,
|
| 404 |
+
help="The session name (cluster id) of this cluster.",
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
args = parser.parse_args()
|
| 408 |
+
|
| 409 |
+
try:
|
| 410 |
+
logging_params = dict(
|
| 411 |
+
logging_level=args.logging_level,
|
| 412 |
+
logging_format=args.logging_format,
|
| 413 |
+
log_dir=args.log_dir,
|
| 414 |
+
filename=args.logging_filename,
|
| 415 |
+
max_bytes=args.logging_rotate_bytes,
|
| 416 |
+
backup_count=args.logging_rotate_backup_count,
|
| 417 |
+
)
|
| 418 |
+
logger = setup_component_logger(**logging_params)
|
| 419 |
+
|
| 420 |
+
# Initialize event loop, see Dashboard init code for caveat
|
| 421 |
+
# w.r.t grpc server init in the DashboardAgent initializer.
|
| 422 |
+
loop = ray._private.utils.get_or_create_event_loop()
|
| 423 |
+
|
| 424 |
+
# Setup stdout/stderr redirect files
|
| 425 |
+
out_file, err_file = open_capture_files(args.log_dir)
|
| 426 |
+
configure_log_file(out_file, err_file)
|
| 427 |
+
|
| 428 |
+
agent = DashboardAgent(
|
| 429 |
+
args.node_ip_address,
|
| 430 |
+
args.dashboard_agent_port,
|
| 431 |
+
args.gcs_address,
|
| 432 |
+
args.cluster_id_hex,
|
| 433 |
+
args.minimal,
|
| 434 |
+
temp_dir=args.temp_dir,
|
| 435 |
+
session_dir=args.session_dir,
|
| 436 |
+
log_dir=args.log_dir,
|
| 437 |
+
metrics_export_port=args.metrics_export_port,
|
| 438 |
+
node_manager_port=args.node_manager_port,
|
| 439 |
+
listen_port=args.listen_port,
|
| 440 |
+
object_store_name=args.object_store_name,
|
| 441 |
+
raylet_name=args.raylet_name,
|
| 442 |
+
logging_params=logging_params,
|
| 443 |
+
disable_metrics_collection=args.disable_metrics_collection,
|
| 444 |
+
agent_id=args.agent_id,
|
| 445 |
+
session_name=args.session_name,
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
def sigterm_handler():
|
| 449 |
+
logger.warning("Exiting with SIGTERM immediately...")
|
| 450 |
+
# Exit code 0 will be considered as an expected shutdown
|
| 451 |
+
os._exit(signal.SIGTERM)
|
| 452 |
+
|
| 453 |
+
if sys.platform != "win32":
|
| 454 |
+
# TODO(rickyyx): we currently do not have any logic for actual
|
| 455 |
+
# graceful termination in the agent. Most of the underlying
|
| 456 |
+
# async tasks run by the agent head doesn't handle CancelledError.
|
| 457 |
+
# So a truly graceful shutdown is not trivial w/o much refactoring.
|
| 458 |
+
# Re-open the issue: https://github.com/ray-project/ray/issues/25518
|
| 459 |
+
# if a truly graceful shutdown is required.
|
| 460 |
+
loop.add_signal_handler(signal.SIGTERM, sigterm_handler)
|
| 461 |
+
|
| 462 |
+
loop.run_until_complete(agent.run())
|
| 463 |
+
except Exception:
|
| 464 |
+
logger.exception("Agent is working abnormally. It will exit immediately.")
|
| 465 |
+
exit(1)
|
.venv/lib/python3.11/site-packages/ray/dashboard/consts.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from ray._private.ray_constants import env_bool, env_integer
|
| 4 |
+
|
| 5 |
+
DASHBOARD_LOG_FILENAME = "dashboard.log"
|
| 6 |
+
DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX = "DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX:"
|
| 7 |
+
DASHBOARD_AGENT_ADDR_IP_PREFIX = "DASHBOARD_AGENT_ADDR_IP_PREFIX:"
|
| 8 |
+
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
|
| 9 |
+
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME = (
|
| 10 |
+
"RAY_DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S" # noqa
|
| 11 |
+
)
|
| 12 |
+
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S = env_integer(
|
| 13 |
+
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME, 0.4
|
| 14 |
+
)
|
| 15 |
+
# The maximum time that parent can be considered
|
| 16 |
+
# as dead before agent kills itself.
|
| 17 |
+
_PARENT_DEATH_THREASHOLD = 5
|
| 18 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME = "RAY_STATE_SERVER_MAX_HTTP_REQUEST"
|
| 19 |
+
# Default number of in-progress requests to the state api server.
|
| 20 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST = env_integer(
|
| 21 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, 100
|
| 22 |
+
)
|
| 23 |
+
# Max allowed number of in-progress requests could be configured.
|
| 24 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000
|
| 25 |
+
|
| 26 |
+
RAY_DASHBOARD_STATS_PURGING_INTERVAL = env_integer(
|
| 27 |
+
"RAY_DASHBOARD_STATS_PURGING_INTERVAL", 60 * 10
|
| 28 |
+
)
|
| 29 |
+
RAY_DASHBOARD_STATS_UPDATING_INTERVAL = env_integer(
|
| 30 |
+
"RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 15
|
| 31 |
+
)
|
| 32 |
+
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
|
| 33 |
+
DASHBOARD_RPC_PORT = env_integer("RAY_DASHBOARD_RPC_PORT", 0)
|
| 34 |
+
GCS_SERVER_ADDRESS = "GcsServerAddress"
|
| 35 |
+
# GCS check alive
|
| 36 |
+
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
|
| 37 |
+
GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)
|
| 38 |
+
# aiohttp_cache
|
| 39 |
+
AIOHTTP_CACHE_TTL_SECONDS = 2
|
| 40 |
+
AIOHTTP_CACHE_MAX_SIZE = 128
|
| 41 |
+
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
|
| 42 |
+
# Default value for datacenter (the default value in protobuf)
|
| 43 |
+
DEFAULT_LANGUAGE = "PYTHON"
|
| 44 |
+
DEFAULT_JOB_ID = "ffff"
|
| 45 |
+
# Hook that is invoked on the dashboard `/api/component_activities` endpoint.
|
| 46 |
+
# Environment variable stored here should be a callable that does not
|
| 47 |
+
# take any arguments and should return a dictionary mapping
|
| 48 |
+
# activity component type (str) to
|
| 49 |
+
# ray.dashboard.modules.snapshot.snapshot_head.RayActivityResponse.
|
| 50 |
+
# Example: "your.module.ray_cluster_activity_hook".
|
| 51 |
+
RAY_CLUSTER_ACTIVITY_HOOK = "RAY_CLUSTER_ACTIVITY_HOOK"
|
| 52 |
+
|
| 53 |
+
# The number of candidate agents
|
| 54 |
+
CANDIDATE_AGENT_NUMBER = max(env_integer("CANDIDATE_AGENT_NUMBER", 1), 1)
|
| 55 |
+
# when head receive JobSubmitRequest, maybe not any agent is available,
|
| 56 |
+
# we need to wait for agents in other node start
|
| 57 |
+
WAIT_AVAILABLE_AGENT_TIMEOUT = 10
|
| 58 |
+
TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS = 0.5
|
| 59 |
+
RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR = "RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES"
|
| 60 |
+
RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR = (
|
| 61 |
+
"RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG"
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# The max time to wait for the JobSupervisor to start before failing the job.
|
| 65 |
+
DEFAULT_JOB_START_TIMEOUT_SECONDS = 60 * 15
|
| 66 |
+
RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR = "RAY_JOB_START_TIMEOUT_SECONDS"
|
| 67 |
+
# Port that dashboard prometheus metrics will be exported to
|
| 68 |
+
DASHBOARD_METRIC_PORT = env_integer("DASHBOARD_METRIC_PORT", 44227)
|
| 69 |
+
|
| 70 |
+
NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode"]
|
| 71 |
+
GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"]
|
| 72 |
+
CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
|
| 73 |
+
COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]
|
| 74 |
+
|
| 75 |
+
# Dashboard metrics are tracked separately at the dashboard. TODO(sang): Support GCS.
|
| 76 |
+
AVAILABLE_COMPONENT_NAMES_FOR_METRICS = {
|
| 77 |
+
"workers",
|
| 78 |
+
"raylet",
|
| 79 |
+
"agent",
|
| 80 |
+
"dashboard",
|
| 81 |
+
"gcs",
|
| 82 |
+
}
|
| 83 |
+
METRICS_INPUT_ROOT = os.path.join(
|
| 84 |
+
os.path.dirname(__file__), "modules", "metrics", "export"
|
| 85 |
+
)
|
| 86 |
+
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
|
| 87 |
+
METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
|
| 88 |
+
)
|
| 89 |
+
PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
|
| 90 |
+
"RAY_enable_pipe_based_agent_to_parent_health_check", False
|
| 91 |
+
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/dashboard.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import logging
|
| 3 |
+
import logging.handlers
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import signal
|
| 7 |
+
import sys
|
| 8 |
+
import traceback
|
| 9 |
+
from typing import Optional, Set
|
| 10 |
+
|
| 11 |
+
import ray._private.ray_constants as ray_constants
|
| 12 |
+
import ray._private.services
|
| 13 |
+
import ray._private.utils
|
| 14 |
+
import ray.dashboard.consts as dashboard_consts
|
| 15 |
+
import ray.dashboard.head as dashboard_head
|
| 16 |
+
import ray.dashboard.utils as dashboard_utils
|
| 17 |
+
from ray._private.ray_logging import setup_component_logger
|
| 18 |
+
|
| 19 |
+
# Logger for this module. It should be configured at the entry point
|
| 20 |
+
# into the program using Ray. Ray provides a default configuration at
|
| 21 |
+
# entry/init points.
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Dashboard:
|
| 26 |
+
"""A dashboard process for monitoring Ray nodes.
|
| 27 |
+
|
| 28 |
+
This dashboard is made up of a REST API which collates data published by
|
| 29 |
+
Reporter processes on nodes into a json structure, and a webserver
|
| 30 |
+
which polls said API for display purposes.
|
| 31 |
+
|
| 32 |
+
Args:
|
| 33 |
+
host: Host address of dashboard aiohttp server.
|
| 34 |
+
port: Port number of dashboard aiohttp server.
|
| 35 |
+
port_retries: The retry times to select a valid port.
|
| 36 |
+
gcs_address: GCS address of the cluster.
|
| 37 |
+
cluster_id_hex: Cluster ID hex string.
|
| 38 |
+
grpc_port: Port used to listen for gRPC on.
|
| 39 |
+
node_ip_address: The IP address of the dashboard.
|
| 40 |
+
serve_frontend: If True, frontend HTML is served from the dashboard;
|
| 41 |
+
if False (e.g. --disable-frontend was passed), it is not served.
|
| 42 |
+
log_dir: Log directory of dashboard.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
host: str,
|
| 48 |
+
port: int,
|
| 49 |
+
port_retries: int,
|
| 50 |
+
gcs_address: str,
|
| 51 |
+
cluster_id_hex: str,
|
| 52 |
+
grpc_port: int,
|
| 53 |
+
node_ip_address: str,
|
| 54 |
+
log_dir: str = None,
|
| 55 |
+
temp_dir: str = None,
|
| 56 |
+
session_dir: str = None,
|
| 57 |
+
minimal: bool = False,
|
| 58 |
+
serve_frontend: bool = True,
|
| 59 |
+
modules_to_load: Optional[Set[str]] = None,
|
| 60 |
+
):
|
| 61 |
+
self.dashboard_head = dashboard_head.DashboardHead(
|
| 62 |
+
http_host=host,
|
| 63 |
+
http_port=port,
|
| 64 |
+
http_port_retries=port_retries,
|
| 65 |
+
gcs_address=gcs_address,
|
| 66 |
+
cluster_id_hex=cluster_id_hex,
|
| 67 |
+
node_ip_address=node_ip_address,
|
| 68 |
+
grpc_port=grpc_port,
|
| 69 |
+
log_dir=log_dir,
|
| 70 |
+
temp_dir=temp_dir,
|
| 71 |
+
session_dir=session_dir,
|
| 72 |
+
minimal=minimal,
|
| 73 |
+
serve_frontend=serve_frontend,
|
| 74 |
+
modules_to_load=modules_to_load,
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
async def run(self):
|
| 78 |
+
await self.dashboard_head.run()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
|
| 82 |
+
parser = argparse.ArgumentParser(description="Ray dashboard.")
|
| 83 |
+
parser.add_argument(
|
| 84 |
+
"--host", required=True, type=str, help="The host to use for the HTTP server."
|
| 85 |
+
)
|
| 86 |
+
parser.add_argument(
|
| 87 |
+
"--port", required=True, type=int, help="The port to use for the HTTP server."
|
| 88 |
+
)
|
| 89 |
+
parser.add_argument(
|
| 90 |
+
"--port-retries",
|
| 91 |
+
required=False,
|
| 92 |
+
type=int,
|
| 93 |
+
default=0,
|
| 94 |
+
help="The retry times to select a valid port.",
|
| 95 |
+
)
|
| 96 |
+
parser.add_argument(
|
| 97 |
+
"--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
|
| 98 |
+
)
|
| 99 |
+
parser.add_argument(
|
| 100 |
+
"--cluster-id-hex", required=True, type=str, help="The cluster ID in hex."
|
| 101 |
+
)
|
| 102 |
+
parser.add_argument(
|
| 103 |
+
"--grpc-port",
|
| 104 |
+
required=False,
|
| 105 |
+
type=int,
|
| 106 |
+
default=dashboard_consts.DASHBOARD_RPC_PORT,
|
| 107 |
+
help="The port for the dashboard to listen for gRPC on.",
|
| 108 |
+
)
|
| 109 |
+
parser.add_argument(
|
| 110 |
+
"--node-ip-address",
|
| 111 |
+
required=True,
|
| 112 |
+
type=str,
|
| 113 |
+
help="The IP address of the node where this is running.",
|
| 114 |
+
)
|
| 115 |
+
parser.add_argument(
|
| 116 |
+
"--logging-level",
|
| 117 |
+
required=False,
|
| 118 |
+
type=lambda s: logging.getLevelName(s.upper()),
|
| 119 |
+
default=ray_constants.LOGGER_LEVEL,
|
| 120 |
+
choices=ray_constants.LOGGER_LEVEL_CHOICES,
|
| 121 |
+
help=ray_constants.LOGGER_LEVEL_HELP,
|
| 122 |
+
)
|
| 123 |
+
parser.add_argument(
|
| 124 |
+
"--logging-format",
|
| 125 |
+
required=False,
|
| 126 |
+
type=str,
|
| 127 |
+
default=ray_constants.LOGGER_FORMAT,
|
| 128 |
+
help=ray_constants.LOGGER_FORMAT_HELP,
|
| 129 |
+
)
|
| 130 |
+
parser.add_argument(
|
| 131 |
+
"--logging-filename",
|
| 132 |
+
required=False,
|
| 133 |
+
type=str,
|
| 134 |
+
default=dashboard_consts.DASHBOARD_LOG_FILENAME,
|
| 135 |
+
help="Specify the name of log file, "
|
| 136 |
+
'log to stdout if set empty, default is "{}"'.format(
|
| 137 |
+
dashboard_consts.DASHBOARD_LOG_FILENAME
|
| 138 |
+
),
|
| 139 |
+
)
|
| 140 |
+
parser.add_argument(
|
| 141 |
+
"--logging-rotate-bytes",
|
| 142 |
+
required=False,
|
| 143 |
+
type=int,
|
| 144 |
+
default=ray_constants.LOGGING_ROTATE_BYTES,
|
| 145 |
+
help="Specify the max bytes for rotating "
|
| 146 |
+
"log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
|
| 147 |
+
)
|
| 148 |
+
parser.add_argument(
|
| 149 |
+
"--logging-rotate-backup-count",
|
| 150 |
+
required=False,
|
| 151 |
+
type=int,
|
| 152 |
+
default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
|
| 153 |
+
help="Specify the backup count of rotated log file, default is {}.".format(
|
| 154 |
+
ray_constants.LOGGING_ROTATE_BACKUP_COUNT
|
| 155 |
+
),
|
| 156 |
+
)
|
| 157 |
+
parser.add_argument(
|
| 158 |
+
"--log-dir",
|
| 159 |
+
required=True,
|
| 160 |
+
type=str,
|
| 161 |
+
default=None,
|
| 162 |
+
help="Specify the path of log directory.",
|
| 163 |
+
)
|
| 164 |
+
parser.add_argument(
|
| 165 |
+
"--temp-dir",
|
| 166 |
+
required=True,
|
| 167 |
+
type=str,
|
| 168 |
+
default=None,
|
| 169 |
+
help="Specify the path of the temporary directory use by Ray process.",
|
| 170 |
+
)
|
| 171 |
+
parser.add_argument(
|
| 172 |
+
"--session-dir",
|
| 173 |
+
required=True,
|
| 174 |
+
type=str,
|
| 175 |
+
default=None,
|
| 176 |
+
help="Specify the path of the session directory of the cluster.",
|
| 177 |
+
)
|
| 178 |
+
parser.add_argument(
|
| 179 |
+
"--minimal",
|
| 180 |
+
action="store_true",
|
| 181 |
+
help=(
|
| 182 |
+
"Minimal dashboard only contains a subset of features that don't "
|
| 183 |
+
"require additional dependencies installed when ray is installed "
|
| 184 |
+
"by `pip install ray[default]`."
|
| 185 |
+
),
|
| 186 |
+
)
|
| 187 |
+
parser.add_argument(
|
| 188 |
+
"--modules-to-load",
|
| 189 |
+
required=False,
|
| 190 |
+
default=None,
|
| 191 |
+
help=(
|
| 192 |
+
"Specify the list of module names in [module_1],[module_2] format."
|
| 193 |
+
"E.g., JobHead,StateHead... "
|
| 194 |
+
"If nothing is specified, all modules are loaded."
|
| 195 |
+
),
|
| 196 |
+
)
|
| 197 |
+
parser.add_argument(
|
| 198 |
+
"--disable-frontend",
|
| 199 |
+
action="store_true",
|
| 200 |
+
help=("If configured, frontend html is not served from the server."),
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
args = parser.parse_args()
|
| 204 |
+
|
| 205 |
+
try:
|
| 206 |
+
setup_component_logger(
|
| 207 |
+
logging_level=args.logging_level,
|
| 208 |
+
logging_format=args.logging_format,
|
| 209 |
+
log_dir=args.log_dir,
|
| 210 |
+
filename=args.logging_filename,
|
| 211 |
+
max_bytes=args.logging_rotate_bytes,
|
| 212 |
+
backup_count=args.logging_rotate_backup_count,
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
if args.modules_to_load:
|
| 216 |
+
modules_to_load = set(args.modules_to_load.strip(" ,").split(","))
|
| 217 |
+
else:
|
| 218 |
+
# None == default.
|
| 219 |
+
modules_to_load = None
|
| 220 |
+
|
| 221 |
+
# NOTE: Creating and attaching the event loop to the main OS thread must happen
|
| 222 |
+
# before initializing Dashboard, which will initialize the grpc aio server,
|
| 223 |
+
# which assumes a working event loop. Ref:
|
| 224 |
+
# https://github.com/grpc/grpc/blob/master/src/python/grpcio/grpc/_cython/_cygrpc/aio/common.pyx.pxi#L174-L188
|
| 225 |
+
loop = ray._private.utils.get_or_create_event_loop()
|
| 226 |
+
dashboard = Dashboard(
|
| 227 |
+
host=args.host,
|
| 228 |
+
port=args.port,
|
| 229 |
+
port_retries=args.port_retries,
|
| 230 |
+
gcs_address=args.gcs_address,
|
| 231 |
+
cluster_id_hex=args.cluster_id_hex,
|
| 232 |
+
grpc_port=args.grpc_port,
|
| 233 |
+
node_ip_address=args.node_ip_address,
|
| 234 |
+
log_dir=args.log_dir,
|
| 235 |
+
temp_dir=args.temp_dir,
|
| 236 |
+
session_dir=args.session_dir,
|
| 237 |
+
minimal=args.minimal,
|
| 238 |
+
serve_frontend=(not args.disable_frontend),
|
| 239 |
+
modules_to_load=modules_to_load,
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
def sigterm_handler():
|
| 243 |
+
logger.warning("Exiting with SIGTERM immediately...")
|
| 244 |
+
os._exit(signal.SIGTERM)
|
| 245 |
+
|
| 246 |
+
if sys.platform != "win32":
|
| 247 |
+
# TODO(rickyyx): we currently do not have any logic for actual
|
| 248 |
+
# graceful termination in the dashboard. Most of the underlying
|
| 249 |
+
# async tasks run by the dashboard head don't handle CancelledError.
|
| 250 |
+
# So a truly graceful shutdown is not trivial w/o much refactoring.
|
| 251 |
+
# Re-open the issue: https://github.com/ray-project/ray/issues/25518
|
| 252 |
+
# if a truly graceful shutdown is required.
|
| 253 |
+
loop.add_signal_handler(signal.SIGTERM, sigterm_handler)
|
| 254 |
+
|
| 255 |
+
loop.run_until_complete(dashboard.run())
|
| 256 |
+
except Exception as e:
|
| 257 |
+
traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
|
| 258 |
+
message = (
|
| 259 |
+
f"The dashboard on node {platform.uname()[1]} "
|
| 260 |
+
f"failed with the following "
|
| 261 |
+
f"error:\n{traceback_str}"
|
| 262 |
+
)
|
| 263 |
+
if isinstance(e, dashboard_utils.FrontendNotFoundError):
|
| 264 |
+
logger.warning(message)
|
| 265 |
+
else:
|
| 266 |
+
logger.error(message)
|
| 267 |
+
raise e
|
| 268 |
+
|
| 269 |
+
# Something went wrong, so push an error to all drivers.
|
| 270 |
+
gcs_publisher = ray._raylet.GcsPublisher(address=args.gcs_address)
|
| 271 |
+
ray._private.utils.publish_error_to_driver(
|
| 272 |
+
ray_constants.DASHBOARD_DIED_ERROR,
|
| 273 |
+
message,
|
| 274 |
+
gcs_publisher=gcs_publisher,
|
| 275 |
+
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/dashboard_metrics.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
|
| 3 |
+
from ray.dashboard.consts import COMPONENT_METRICS_TAG_KEYS
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class NullMetric:
|
| 7 |
+
"""Mock metric class to be used in case of prometheus_client import error."""
|
| 8 |
+
|
| 9 |
+
def set(self, *args, **kwargs):
|
| 10 |
+
pass
|
| 11 |
+
|
| 12 |
+
def observe(self, *args, **kwargs):
|
| 13 |
+
pass
|
| 14 |
+
|
| 15 |
+
def inc(self, *args, **kwargs):
|
| 16 |
+
pass
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
|
| 21 |
+
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram
|
| 22 |
+
|
| 23 |
+
# The metrics in this class should be kept in sync with
|
| 24 |
+
# python/ray/tests/test_metrics_agent.py
|
| 25 |
+
class DashboardPrometheusMetrics:
|
| 26 |
+
def __init__(self, registry: Optional[CollectorRegistry] = None):
|
| 27 |
+
self.registry: CollectorRegistry = registry or CollectorRegistry(
|
| 28 |
+
auto_describe=True
|
| 29 |
+
)
|
| 30 |
+
# Buckets: 5ms, 10ms, 25ms, 50ms, 75ms
|
| 31 |
+
# 100ms, 250ms, 500ms, 750ms
|
| 32 |
+
# 1s, 2.5s, 5s, 7.5s, 10s
|
| 33 |
+
# 20s, 40s, 60s
|
| 34 |
+
# used for API duration
|
| 35 |
+
histogram_buckets_s = [
|
| 36 |
+
0.005,
|
| 37 |
+
0.01,
|
| 38 |
+
0.025,
|
| 39 |
+
0.05,
|
| 40 |
+
0.075,
|
| 41 |
+
0.1,
|
| 42 |
+
0.25,
|
| 43 |
+
0.5,
|
| 44 |
+
0.75,
|
| 45 |
+
1,
|
| 46 |
+
2.5,
|
| 47 |
+
5,
|
| 48 |
+
7.5,
|
| 49 |
+
10,
|
| 50 |
+
20,
|
| 51 |
+
40,
|
| 52 |
+
60,
|
| 53 |
+
]
|
| 54 |
+
self.metrics_request_duration = Histogram(
|
| 55 |
+
"dashboard_api_requests_duration_seconds",
|
| 56 |
+
"Total duration in seconds per endpoint",
|
| 57 |
+
("endpoint", "http_status", "Version", "SessionName", "Component"),
|
| 58 |
+
unit="seconds",
|
| 59 |
+
namespace="ray",
|
| 60 |
+
registry=self.registry,
|
| 61 |
+
buckets=histogram_buckets_s,
|
| 62 |
+
)
|
| 63 |
+
self.metrics_request_count = Counter(
|
| 64 |
+
"dashboard_api_requests_count",
|
| 65 |
+
"Total requests count per endpoint",
|
| 66 |
+
(
|
| 67 |
+
"method",
|
| 68 |
+
"endpoint",
|
| 69 |
+
"http_status",
|
| 70 |
+
"Version",
|
| 71 |
+
"SessionName",
|
| 72 |
+
"Component",
|
| 73 |
+
),
|
| 74 |
+
unit="requests",
|
| 75 |
+
namespace="ray",
|
| 76 |
+
registry=self.registry,
|
| 77 |
+
)
|
| 78 |
+
self.metrics_event_loop_tasks = Gauge(
|
| 79 |
+
"dashboard_event_loop_tasks",
|
| 80 |
+
"Number of tasks currently pending in the event loop's queue.",
|
| 81 |
+
tuple(COMPONENT_METRICS_TAG_KEYS),
|
| 82 |
+
unit="tasks",
|
| 83 |
+
namespace="ray",
|
| 84 |
+
registry=self.registry,
|
| 85 |
+
)
|
| 86 |
+
self.metrics_event_loop_lag = Gauge(
|
| 87 |
+
"dashboard_event_loop_lag",
|
| 88 |
+
"Event loop lag in seconds.",
|
| 89 |
+
tuple(COMPONENT_METRICS_TAG_KEYS),
|
| 90 |
+
unit="seconds",
|
| 91 |
+
namespace="ray",
|
| 92 |
+
registry=self.registry,
|
| 93 |
+
)
|
| 94 |
+
self.metrics_dashboard_cpu = Gauge(
|
| 95 |
+
"component_cpu",
|
| 96 |
+
"Dashboard CPU percentage usage.",
|
| 97 |
+
tuple(COMPONENT_METRICS_TAG_KEYS),
|
| 98 |
+
unit="percentage",
|
| 99 |
+
namespace="ray",
|
| 100 |
+
registry=self.registry,
|
| 101 |
+
)
|
| 102 |
+
self.metrics_dashboard_mem_uss = Gauge(
|
| 103 |
+
"component_uss",
|
| 104 |
+
"USS usage of all components on the node.",
|
| 105 |
+
tuple(COMPONENT_METRICS_TAG_KEYS),
|
| 106 |
+
unit="mb",
|
| 107 |
+
namespace="ray",
|
| 108 |
+
registry=self.registry,
|
| 109 |
+
)
|
| 110 |
+
self.metrics_dashboard_mem_rss = Gauge(
|
| 111 |
+
"component_rss",
|
| 112 |
+
"RSS usage of all components on the node.",
|
| 113 |
+
tuple(COMPONENT_METRICS_TAG_KEYS),
|
| 114 |
+
unit="mb",
|
| 115 |
+
namespace="ray",
|
| 116 |
+
registry=self.registry,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
except ImportError:
|
| 120 |
+
|
| 121 |
+
class DashboardPrometheusMetrics(object):
|
| 122 |
+
def __getattr__(self, attr):
|
| 123 |
+
return NullMetric()
|
.venv/lib/python3.11/site-packages/ray/dashboard/datacenter.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, List, Optional
|
| 3 |
+
|
| 4 |
+
import ray.dashboard.consts as dashboard_consts
|
| 5 |
+
from ray._private.utils import (
|
| 6 |
+
get_or_create_event_loop,
|
| 7 |
+
parse_pg_formatted_resources_to_original,
|
| 8 |
+
)
|
| 9 |
+
from ray.dashboard.utils import (
|
| 10 |
+
Dict,
|
| 11 |
+
MutableNotificationDict,
|
| 12 |
+
async_loop_forever,
|
| 13 |
+
compose_state_message,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# NOT thread safe. Every assignment must be on the main event loop thread.
|
| 20 |
+
class DataSource:
|
| 21 |
+
# {node id hex(str): node stats(dict of GetNodeStatsReply
|
| 22 |
+
# in node_manager.proto)}
|
| 23 |
+
node_stats = Dict()
|
| 24 |
+
# {node id hex(str): node physical stats(dict from reporter_agent.py)}
|
| 25 |
+
node_physical_stats = Dict()
|
| 26 |
+
# {actor id hex(str): actor table data(dict of ActorTableData
|
| 27 |
+
# in gcs.proto)}
|
| 28 |
+
actors = MutableNotificationDict()
|
| 29 |
+
# {job id hex(str): job table data(dict of JobTableData in gcs.proto)}
|
| 30 |
+
# {node id hex(str): dashboard agent (node ip(str), http port(int), grpc port(int))}
|
| 31 |
+
agents = Dict()
|
| 32 |
+
# {node id hex(str): gcs node info(dict of GcsNodeInfo in gcs.proto)}
|
| 33 |
+
nodes = Dict()
|
| 34 |
+
# {node id hex(str): worker list}
|
| 35 |
+
node_workers = Dict()
|
| 36 |
+
# {node id hex(str): {actor id hex(str): actor table data}}
|
| 37 |
+
node_actors = MutableNotificationDict()
|
| 38 |
+
# {worker id(str): core worker stats}
|
| 39 |
+
core_worker_stats = Dict()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class DataOrganizer:
|
| 43 |
+
head_node_ip = None
|
| 44 |
+
|
| 45 |
+
@staticmethod
|
| 46 |
+
@async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_PURGING_INTERVAL)
|
| 47 |
+
async def purge():
|
| 48 |
+
# Purge data that is out of date.
|
| 49 |
+
# These data sources are maintained by DashboardHead,
|
| 50 |
+
# we do not need to purge them:
|
| 51 |
+
# * agents
|
| 52 |
+
# * nodes
|
| 53 |
+
alive_nodes = {
|
| 54 |
+
node_id
|
| 55 |
+
for node_id, node_info in DataSource.nodes.items()
|
| 56 |
+
if node_info["state"] == "ALIVE"
|
| 57 |
+
}
|
| 58 |
+
for key in DataSource.node_stats.keys() - alive_nodes:
|
| 59 |
+
DataSource.node_stats.pop(key)
|
| 60 |
+
|
| 61 |
+
for key in DataSource.node_physical_stats.keys() - alive_nodes:
|
| 62 |
+
DataSource.node_physical_stats.pop(key)
|
| 63 |
+
|
| 64 |
+
@classmethod
|
| 65 |
+
@async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_UPDATING_INTERVAL)
|
| 66 |
+
async def organize(cls, thread_pool_executor):
|
| 67 |
+
"""
|
| 68 |
+
Organizes data: reads from (node_physical_stats, node_stats) and updates
|
| 69 |
+
(node_workers, core_worker_stats).
|
| 70 |
+
|
| 71 |
+
This method is not really async, but DataSource is not thread safe so we need
|
| 72 |
+
to make sure it's on the main event loop thread. To avoid blocking the main
|
| 73 |
+
event loop, we yield after each node processed.
|
| 74 |
+
"""
|
| 75 |
+
loop = get_or_create_event_loop()
|
| 76 |
+
|
| 77 |
+
node_workers = {}
|
| 78 |
+
core_worker_stats = {}
|
| 79 |
+
|
| 80 |
+
# NOTE: We copy keys of the `DataSource.nodes` to make sure
|
| 81 |
+
# it doesn't change during the iteration (since it's being updated
|
| 82 |
+
# from another async task)
|
| 83 |
+
for node_id in list(DataSource.nodes.keys()):
|
| 84 |
+
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
|
| 85 |
+
node_stats = DataSource.node_stats.get(node_id, {})
|
| 86 |
+
# Offloads the blocking operation to a thread pool executor. This also
|
| 87 |
+
# yields to the event loop.
|
| 88 |
+
workers = await loop.run_in_executor(
|
| 89 |
+
thread_pool_executor,
|
| 90 |
+
cls._extract_workers_for_node,
|
| 91 |
+
node_physical_stats,
|
| 92 |
+
node_stats,
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
for worker in workers:
|
| 96 |
+
for stats in worker.get("coreWorkerStats", []):
|
| 97 |
+
worker_id = stats["workerId"]
|
| 98 |
+
core_worker_stats[worker_id] = stats
|
| 99 |
+
|
| 100 |
+
node_workers[node_id] = workers
|
| 101 |
+
|
| 102 |
+
DataSource.node_workers.reset(node_workers)
|
| 103 |
+
DataSource.core_worker_stats.reset(core_worker_stats)
|
| 104 |
+
|
| 105 |
+
@classmethod
|
| 106 |
+
def _extract_workers_for_node(cls, node_physical_stats, node_stats):
|
| 107 |
+
workers = []
|
| 108 |
+
# Merge coreWorkerStats (node stats) to workers (node physical stats)
|
| 109 |
+
pid_to_worker_stats = {}
|
| 110 |
+
pid_to_language = {}
|
| 111 |
+
pid_to_job_id = {}
|
| 112 |
+
|
| 113 |
+
for core_worker_stats in node_stats.get("coreWorkersStats", []):
|
| 114 |
+
pid = core_worker_stats["pid"]
|
| 115 |
+
|
| 116 |
+
pid_to_worker_stats[pid] = core_worker_stats
|
| 117 |
+
pid_to_language[pid] = core_worker_stats["language"]
|
| 118 |
+
pid_to_job_id[pid] = core_worker_stats["jobId"]
|
| 119 |
+
|
| 120 |
+
for worker in node_physical_stats.get("workers", []):
|
| 121 |
+
worker = dict(worker)
|
| 122 |
+
pid = worker["pid"]
|
| 123 |
+
|
| 124 |
+
core_worker_stats = pid_to_worker_stats.get(pid)
|
| 125 |
+
# Empty list means core worker stats is not available.
|
| 126 |
+
worker["coreWorkerStats"] = [core_worker_stats] if core_worker_stats else []
|
| 127 |
+
worker["language"] = pid_to_language.get(
|
| 128 |
+
pid, dashboard_consts.DEFAULT_LANGUAGE
|
| 129 |
+
)
|
| 130 |
+
worker["jobId"] = pid_to_job_id.get(pid, dashboard_consts.DEFAULT_JOB_ID)
|
| 131 |
+
|
| 132 |
+
workers.append(worker)
|
| 133 |
+
|
| 134 |
+
return workers
|
| 135 |
+
|
| 136 |
+
@classmethod
|
| 137 |
+
async def get_node_info(cls, node_id, get_summary=False):
|
| 138 |
+
node_physical_stats = dict(DataSource.node_physical_stats.get(node_id, {}))
|
| 139 |
+
node_stats = dict(DataSource.node_stats.get(node_id, {}))
|
| 140 |
+
node = DataSource.nodes.get(node_id, {})
|
| 141 |
+
|
| 142 |
+
if get_summary:
|
| 143 |
+
node_physical_stats.pop("workers", None)
|
| 144 |
+
node_stats.pop("workersStats", None)
|
| 145 |
+
else:
|
| 146 |
+
node_stats.pop("coreWorkersStats", None)
|
| 147 |
+
store_stats = node_stats.get("storeStats", {})
|
| 148 |
+
used = int(store_stats.get("objectStoreBytesUsed", 0))
|
| 149 |
+
# objectStoreBytesAvail == total in the object_manager.cc definition.
|
| 150 |
+
total = int(store_stats.get("objectStoreBytesAvail", 0))
|
| 151 |
+
ray_stats = {
|
| 152 |
+
"object_store_used_memory": used,
|
| 153 |
+
"object_store_available_memory": total - used,
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
node_info = node_physical_stats
|
| 157 |
+
# Merge node stats to node physical stats under raylet
|
| 158 |
+
node_info["raylet"] = node_stats
|
| 159 |
+
node_info["raylet"].update(ray_stats)
|
| 160 |
+
|
| 161 |
+
# Merge GcsNodeInfo to node physical stats
|
| 162 |
+
node_info["raylet"].update(node)
|
| 163 |
+
death_info = node.get("deathInfo", {})
|
| 164 |
+
node_info["raylet"]["stateMessage"] = compose_state_message(
|
| 165 |
+
death_info.get("reason", None), death_info.get("reasonMessage", None)
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
if not get_summary:
|
| 169 |
+
actor_table_entries = DataSource.node_actors.get(node_id, {})
|
| 170 |
+
|
| 171 |
+
# Merge actors to node physical stats
|
| 172 |
+
node_info["actors"] = {
|
| 173 |
+
actor_id: await DataOrganizer._get_actor_info(actor_table_entry)
|
| 174 |
+
for actor_id, actor_table_entry in actor_table_entries.items()
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
# Update workers to node physical stats
|
| 178 |
+
node_info["workers"] = DataSource.node_workers.get(node_id, [])
|
| 179 |
+
|
| 180 |
+
return node_info
|
| 181 |
+
|
| 182 |
+
@classmethod
|
| 183 |
+
async def get_all_node_summary(cls):
|
| 184 |
+
return [
|
| 185 |
+
# NOTE: We're intentionally awaiting in a loop to avoid excessive
|
| 186 |
+
# concurrency spinning up excessive # of tasks for large clusters
|
| 187 |
+
await DataOrganizer.get_node_info(node_id, get_summary=True)
|
| 188 |
+
for node_id in DataSource.nodes.keys()
|
| 189 |
+
]
|
| 190 |
+
|
| 191 |
+
@classmethod
|
| 192 |
+
async def get_agent_infos(
|
| 193 |
+
cls, target_node_ids: Optional[List[str]] = None
|
| 194 |
+
) -> Dict[str, Dict[str, Any]]:
|
| 195 |
+
"""Fetches running Agent (like HTTP/gRPC ports, IP, etc) running on every node
|
| 196 |
+
|
| 197 |
+
:param target_node_ids: Target node ids to fetch agent info for. If omitted will
|
| 198 |
+
fetch the info for all agents
|
| 199 |
+
"""
|
| 200 |
+
|
| 201 |
+
# Return all available agent infos in case no target node-ids were provided
|
| 202 |
+
target_node_ids = target_node_ids or DataSource.agents.keys()
|
| 203 |
+
|
| 204 |
+
missing_node_ids = [
|
| 205 |
+
node_id for node_id in target_node_ids if node_id not in DataSource.agents
|
| 206 |
+
]
|
| 207 |
+
if missing_node_ids:
|
| 208 |
+
logger.warning(
|
| 209 |
+
f"Agent info was not found for {missing_node_ids}"
|
| 210 |
+
f" (having agent infos for {list(DataSource.agents.keys())})"
|
| 211 |
+
)
|
| 212 |
+
return {}
|
| 213 |
+
|
| 214 |
+
def _create_agent_info(node_id: str):
|
| 215 |
+
(node_ip, http_port, grpc_port) = DataSource.agents[node_id]
|
| 216 |
+
|
| 217 |
+
return dict(
|
| 218 |
+
ipAddress=node_ip,
|
| 219 |
+
httpPort=int(http_port or -1),
|
| 220 |
+
grpcPort=int(grpc_port or -1),
|
| 221 |
+
httpAddress=f"{node_ip}:{http_port}",
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
return {node_id: _create_agent_info(node_id) for node_id in target_node_ids}
|
| 225 |
+
|
| 226 |
+
@classmethod
|
| 227 |
+
async def get_actor_infos(cls, actor_ids: Optional[List[str]] = None):
|
| 228 |
+
target_actor_table_entries: dict[str, Optional[dict]]
|
| 229 |
+
if actor_ids is not None:
|
| 230 |
+
target_actor_table_entries = {
|
| 231 |
+
actor_id: DataSource.actors.get(actor_id) for actor_id in actor_ids
|
| 232 |
+
}
|
| 233 |
+
else:
|
| 234 |
+
target_actor_table_entries = DataSource.actors
|
| 235 |
+
|
| 236 |
+
return {
|
| 237 |
+
actor_id: await DataOrganizer._get_actor_info(actor_table_entry)
|
| 238 |
+
for actor_id, actor_table_entry in target_actor_table_entries.items()
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
@staticmethod
|
| 242 |
+
async def _get_actor_info(actor):
|
| 243 |
+
if actor is None:
|
| 244 |
+
return None
|
| 245 |
+
|
| 246 |
+
actor = dict(actor)
|
| 247 |
+
worker_id = actor["address"]["workerId"]
|
| 248 |
+
core_worker_stats = DataSource.core_worker_stats.get(worker_id, {})
|
| 249 |
+
actor_constructor = core_worker_stats.get(
|
| 250 |
+
"actorTitle", "Unknown actor constructor"
|
| 251 |
+
)
|
| 252 |
+
actor["actorConstructor"] = actor_constructor
|
| 253 |
+
actor.update(core_worker_stats)
|
| 254 |
+
|
| 255 |
+
# TODO(fyrestone): remove this, give a link from actor
|
| 256 |
+
# info to worker info in front-end.
|
| 257 |
+
node_id = actor["address"]["rayletId"]
|
| 258 |
+
pid = core_worker_stats.get("pid")
|
| 259 |
+
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
|
| 260 |
+
actor_process_stats = None
|
| 261 |
+
actor_process_gpu_stats = []
|
| 262 |
+
if pid:
|
| 263 |
+
for process_stats in node_physical_stats.get("workers", []):
|
| 264 |
+
if process_stats["pid"] == pid:
|
| 265 |
+
actor_process_stats = process_stats
|
| 266 |
+
break
|
| 267 |
+
|
| 268 |
+
for gpu_stats in node_physical_stats.get("gpus", []):
|
| 269 |
+
# gpu_stats.get("processesPids") can be None, an empty list or a
|
| 270 |
+
# list of dictionaries.
|
| 271 |
+
for process in gpu_stats.get("processesPids") or []:
|
| 272 |
+
if process["pid"] == pid:
|
| 273 |
+
actor_process_gpu_stats.append(gpu_stats)
|
| 274 |
+
break
|
| 275 |
+
|
| 276 |
+
actor["gpus"] = actor_process_gpu_stats
|
| 277 |
+
actor["processStats"] = actor_process_stats
|
| 278 |
+
actor["mem"] = node_physical_stats.get("mem", [])
|
| 279 |
+
|
| 280 |
+
required_resources = parse_pg_formatted_resources_to_original(
|
| 281 |
+
actor["requiredResources"]
|
| 282 |
+
)
|
| 283 |
+
actor["requiredResources"] = required_resources
|
| 284 |
+
|
| 285 |
+
return actor
|
.venv/lib/python3.11/site-packages/ray/dashboard/head.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Optional, Set
|
| 6 |
+
|
| 7 |
+
import ray.dashboard.consts as dashboard_consts
|
| 8 |
+
import ray.dashboard.utils as dashboard_utils
|
| 9 |
+
import ray.experimental.internal_kv as internal_kv
|
| 10 |
+
from ray._private import ray_constants
|
| 11 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 12 |
+
from ray._private.ray_constants import env_integer
|
| 13 |
+
from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
|
| 14 |
+
from ray._raylet import GcsClient
|
| 15 |
+
from ray.dashboard.consts import DASHBOARD_METRIC_PORT
|
| 16 |
+
from ray.dashboard.dashboard_metrics import DashboardPrometheusMetrics
|
| 17 |
+
from ray.dashboard.datacenter import DataOrganizer
|
| 18 |
+
from ray.dashboard.utils import (
|
| 19 |
+
DashboardHeadModule,
|
| 20 |
+
DashboardHeadModuleConfig,
|
| 21 |
+
async_loop_forever,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
import prometheus_client
|
| 26 |
+
except ImportError:
|
| 27 |
+
prometheus_client = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
GRPC_CHANNEL_OPTIONS = (
|
| 33 |
+
*ray_constants.GLOBAL_GRPC_OPTIONS,
|
| 34 |
+
("grpc.max_send_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
|
| 35 |
+
("grpc.max_receive_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# NOTE: Executor in this head is intentionally constrained to just 1 thread by
|
| 39 |
+
# default to limit its concurrency, therefore reducing potential for
|
| 40 |
+
# GIL contention
|
| 41 |
+
RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS = env_integer(
|
| 42 |
+
"RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS", 1
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def initialize_grpc_port_and_server(grpc_ip, grpc_port):
|
| 47 |
+
try:
|
| 48 |
+
from grpc import aio as aiogrpc
|
| 49 |
+
except ImportError:
|
| 50 |
+
from grpc.experimental import aio as aiogrpc
|
| 51 |
+
|
| 52 |
+
import ray._private.tls_utils
|
| 53 |
+
|
| 54 |
+
aiogrpc.init_grpc_aio()
|
| 55 |
+
|
| 56 |
+
server = aiogrpc.server(options=(("grpc.so_reuseport", 0),))
|
| 57 |
+
|
| 58 |
+
grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
|
| 59 |
+
server, f"{grpc_ip}:{grpc_port}"
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
return server, grpc_port
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class DashboardHead:
|
| 66 |
+
def __init__(
|
| 67 |
+
self,
|
| 68 |
+
http_host: str,
|
| 69 |
+
http_port: int,
|
| 70 |
+
http_port_retries: int,
|
| 71 |
+
gcs_address: str,
|
| 72 |
+
cluster_id_hex: str,
|
| 73 |
+
node_ip_address: str,
|
| 74 |
+
grpc_port: int,
|
| 75 |
+
log_dir: str,
|
| 76 |
+
temp_dir: str,
|
| 77 |
+
session_dir: str,
|
| 78 |
+
minimal: bool,
|
| 79 |
+
serve_frontend: bool,
|
| 80 |
+
modules_to_load: Optional[Set[str]] = None,
|
| 81 |
+
):
|
| 82 |
+
"""
|
| 83 |
+
Args:
|
| 84 |
+
http_host: The host address for the Http server.
|
| 85 |
+
http_port: The port for the Http server.
|
| 86 |
+
http_port_retries: The maximum retry to bind ports for the Http server.
|
| 87 |
+
gcs_address: The GCS address in the {address}:{port} format.
|
| 88 |
+
log_dir: The log directory. E.g., /tmp/session_latest/logs.
|
| 89 |
+
temp_dir: The temp directory. E.g., /tmp.
|
| 90 |
+
session_dir: The session directory. E.g., tmp/session_latest.
|
| 91 |
+
minimal: Whether or not it will load the minimal modules.
|
| 92 |
+
serve_frontend: If configured, frontend HTML is
|
| 93 |
+
served from the dashboard.
|
| 94 |
+
grpc_port: The port used to listen for gRPC on.
|
| 95 |
+
modules_to_load: A set of module name in string to load.
|
| 96 |
+
By default (None), it loads all available modules.
|
| 97 |
+
Note that available modules could be changed depending on
|
| 98 |
+
minimal flags.
|
| 99 |
+
"""
|
| 100 |
+
self.minimal = minimal
|
| 101 |
+
self.serve_frontend = serve_frontend
|
| 102 |
+
# If it is the minimal mode, we shouldn't serve frontend.
|
| 103 |
+
if self.minimal:
|
| 104 |
+
self.serve_frontend = False
|
| 105 |
+
# Public attributes are accessible for all head modules.
|
| 106 |
+
# Walkaround for issue: https://github.com/ray-project/ray/issues/7084
|
| 107 |
+
self.http_host = "127.0.0.1" if http_host == "localhost" else http_host
|
| 108 |
+
self.http_port = http_port
|
| 109 |
+
self.http_port_retries = http_port_retries
|
| 110 |
+
self._modules_to_load = modules_to_load
|
| 111 |
+
self._modules_loaded = False
|
| 112 |
+
self.metrics = None
|
| 113 |
+
|
| 114 |
+
self._executor = ThreadPoolExecutor(
|
| 115 |
+
max_workers=RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS,
|
| 116 |
+
thread_name_prefix="dashboard_head_executor",
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
assert gcs_address is not None
|
| 120 |
+
self.gcs_address = gcs_address
|
| 121 |
+
self.cluster_id_hex = cluster_id_hex
|
| 122 |
+
self.log_dir = log_dir
|
| 123 |
+
self.temp_dir = temp_dir
|
| 124 |
+
self.session_dir = session_dir
|
| 125 |
+
self.session_name = Path(session_dir).name
|
| 126 |
+
self.gcs_error_subscriber = None
|
| 127 |
+
self.gcs_log_subscriber = None
|
| 128 |
+
self.ip = node_ip_address
|
| 129 |
+
DataOrganizer.head_node_ip = self.ip
|
| 130 |
+
|
| 131 |
+
if self.minimal:
|
| 132 |
+
self.server, self.grpc_port = None, None
|
| 133 |
+
else:
|
| 134 |
+
grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
|
| 135 |
+
self.server, self.grpc_port = initialize_grpc_port_and_server(
|
| 136 |
+
grpc_ip, grpc_port
|
| 137 |
+
)
|
| 138 |
+
logger.info("Dashboard head grpc address: %s:%s", grpc_ip, self.grpc_port)
|
| 139 |
+
# If the dashboard is started as non-minimal version, http server should
|
| 140 |
+
# be configured to expose APIs.
|
| 141 |
+
self.http_server = None
|
| 142 |
+
|
| 143 |
+
async def _configure_http_server(self, modules):
|
| 144 |
+
from ray.dashboard.http_server_head import HttpServerDashboardHead
|
| 145 |
+
|
| 146 |
+
self.http_server = HttpServerDashboardHead(
|
| 147 |
+
self.ip,
|
| 148 |
+
self.http_host,
|
| 149 |
+
self.http_port,
|
| 150 |
+
self.http_port_retries,
|
| 151 |
+
self.gcs_address,
|
| 152 |
+
self.session_name,
|
| 153 |
+
self.metrics,
|
| 154 |
+
)
|
| 155 |
+
await self.http_server.run(modules)
|
| 156 |
+
|
| 157 |
+
@property
|
| 158 |
+
def http_session(self):
|
| 159 |
+
if not self._modules_loaded and not self.http_server:
|
| 160 |
+
# When the dashboard is still starting up, this property gets
|
| 161 |
+
# called as part of the method_route_table_factory magic. In
|
| 162 |
+
# this case, the property is not actually used but the magic
|
| 163 |
+
# method calls every property to look for a route to add to
|
| 164 |
+
# the global route table. It should be okay for http_server
|
| 165 |
+
# to still be None at this point.
|
| 166 |
+
return None
|
| 167 |
+
assert self.http_server, "Accessing unsupported API in a minimal ray."
|
| 168 |
+
return self.http_server.http_session
|
| 169 |
+
|
| 170 |
+
@async_loop_forever(dashboard_consts.GCS_CHECK_ALIVE_INTERVAL_SECONDS)
|
| 171 |
+
async def _gcs_check_alive(self):
|
| 172 |
+
try:
|
| 173 |
+
# If gcs is permanently dead, gcs client will exit the process
|
| 174 |
+
# (see gcs_rpc_client.h)
|
| 175 |
+
await self.gcs_aio_client.check_alive(node_ips=[], timeout=None)
|
| 176 |
+
except Exception:
|
| 177 |
+
logger.warning("Failed to check gcs aliveness, will retry", exc_info=True)
|
| 178 |
+
|
| 179 |
+
def _load_modules(self, modules_to_load: Optional[Set[str]] = None):
|
| 180 |
+
"""Load dashboard head modules.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
modules: A list of module names to load. By default (None),
|
| 184 |
+
it loads all modules.
|
| 185 |
+
"""
|
| 186 |
+
modules = []
|
| 187 |
+
head_cls_list = dashboard_utils.get_all_modules(DashboardHeadModule)
|
| 188 |
+
|
| 189 |
+
config = DashboardHeadModuleConfig(
|
| 190 |
+
minimal=self.minimal,
|
| 191 |
+
cluster_id_hex=self.cluster_id_hex,
|
| 192 |
+
session_name=self.session_name,
|
| 193 |
+
gcs_address=self.gcs_address,
|
| 194 |
+
log_dir=self.log_dir,
|
| 195 |
+
temp_dir=self.temp_dir,
|
| 196 |
+
session_dir=self.session_dir,
|
| 197 |
+
ip=self.ip,
|
| 198 |
+
http_host=self.http_host,
|
| 199 |
+
http_port=self.http_port,
|
| 200 |
+
metrics=self.metrics,
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
# Select modules to load.
|
| 204 |
+
modules_to_load = modules_to_load or {m.__name__ for m in head_cls_list}
|
| 205 |
+
logger.info("Modules to load: %s", modules_to_load)
|
| 206 |
+
|
| 207 |
+
for cls in head_cls_list:
|
| 208 |
+
logger.info("Loading %s: %s", DashboardHeadModule.__name__, cls)
|
| 209 |
+
if cls.__name__ in modules_to_load:
|
| 210 |
+
c = cls(config)
|
| 211 |
+
modules.append(c)
|
| 212 |
+
|
| 213 |
+
# Verify modules are loaded as expected.
|
| 214 |
+
loaded_modules = {type(m).__name__ for m in modules}
|
| 215 |
+
if loaded_modules != modules_to_load:
|
| 216 |
+
assert False, (
|
| 217 |
+
"Actual loaded modules, {}, doesn't match the requested modules "
|
| 218 |
+
"to load, {}".format(loaded_modules, modules_to_load)
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
self._modules_loaded = True
|
| 222 |
+
logger.info("Loaded %d modules. %s", len(modules), modules)
|
| 223 |
+
return modules
|
| 224 |
+
|
| 225 |
+
async def _setup_metrics(self, gcs_aio_client):
|
| 226 |
+
metrics = DashboardPrometheusMetrics()
|
| 227 |
+
|
| 228 |
+
# Setup prometheus metrics export server
|
| 229 |
+
assert internal_kv._internal_kv_initialized()
|
| 230 |
+
assert gcs_aio_client is not None
|
| 231 |
+
address = f"{self.ip}:{DASHBOARD_METRIC_PORT}"
|
| 232 |
+
await gcs_aio_client.internal_kv_put(
|
| 233 |
+
"DashboardMetricsAddress".encode(), address.encode(), True, namespace=None
|
| 234 |
+
)
|
| 235 |
+
if prometheus_client:
|
| 236 |
+
try:
|
| 237 |
+
logger.info(
|
| 238 |
+
"Starting dashboard metrics server on port {}".format(
|
| 239 |
+
DASHBOARD_METRIC_PORT
|
| 240 |
+
)
|
| 241 |
+
)
|
| 242 |
+
kwargs = {"addr": "127.0.0.1"} if self.ip == "127.0.0.1" else {}
|
| 243 |
+
prometheus_client.start_http_server(
|
| 244 |
+
port=DASHBOARD_METRIC_PORT,
|
| 245 |
+
registry=metrics.registry,
|
| 246 |
+
**kwargs,
|
| 247 |
+
)
|
| 248 |
+
except Exception:
|
| 249 |
+
logger.exception(
|
| 250 |
+
"An exception occurred while starting the metrics server."
|
| 251 |
+
)
|
| 252 |
+
elif not prometheus_client:
|
| 253 |
+
logger.warning(
|
| 254 |
+
"`prometheus_client` not found, so metrics will not be exported."
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
return metrics
|
| 258 |
+
|
| 259 |
+
async def run(self):
|
| 260 |
+
gcs_address = self.gcs_address
|
| 261 |
+
|
| 262 |
+
# Dashboard will handle connection failure automatically
|
| 263 |
+
self.gcs_client = GcsClient(
|
| 264 |
+
address=gcs_address, nums_reconnect_retry=0, cluster_id=self.cluster_id_hex
|
| 265 |
+
)
|
| 266 |
+
self.gcs_aio_client = GcsAioClient(
|
| 267 |
+
address=gcs_address, nums_reconnect_retry=0, cluster_id=self.cluster_id_hex
|
| 268 |
+
)
|
| 269 |
+
internal_kv._initialize_internal_kv(self.gcs_client)
|
| 270 |
+
|
| 271 |
+
if not self.minimal:
|
| 272 |
+
self.metrics = await self._setup_metrics(self.gcs_aio_client)
|
| 273 |
+
|
| 274 |
+
try:
|
| 275 |
+
assert internal_kv._internal_kv_initialized()
|
| 276 |
+
# Note: We always record the usage, but it is not reported
|
| 277 |
+
# if the usage stats is disabled.
|
| 278 |
+
record_extra_usage_tag(TagKey.DASHBOARD_USED, "False")
|
| 279 |
+
except Exception as e:
|
| 280 |
+
logger.warning(
|
| 281 |
+
"Failed to record the dashboard usage. "
|
| 282 |
+
"This error message is harmless and can be ignored. "
|
| 283 |
+
f"Error: {e}"
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
# Start a grpc asyncio server.
|
| 287 |
+
if self.server:
|
| 288 |
+
await self.server.start()
|
| 289 |
+
|
| 290 |
+
async def _async_notify():
|
| 291 |
+
"""Notify signals from queue."""
|
| 292 |
+
while True:
|
| 293 |
+
co = await dashboard_utils.NotifyQueue.get()
|
| 294 |
+
try:
|
| 295 |
+
await co
|
| 296 |
+
except Exception:
|
| 297 |
+
logger.exception(f"Error notifying coroutine {co}")
|
| 298 |
+
|
| 299 |
+
modules = self._load_modules(self._modules_to_load)
|
| 300 |
+
|
| 301 |
+
http_host, http_port = self.http_host, self.http_port
|
| 302 |
+
if self.serve_frontend:
|
| 303 |
+
logger.info("Initialize the http server.")
|
| 304 |
+
await self._configure_http_server(modules)
|
| 305 |
+
http_host, http_port = self.http_server.get_address()
|
| 306 |
+
logger.info(f"http server initialized at {http_host}:{http_port}")
|
| 307 |
+
else:
|
| 308 |
+
logger.info("http server disabled.")
|
| 309 |
+
|
| 310 |
+
# We need to expose dashboard's node's ip for other worker nodes
|
| 311 |
+
# if it's listening to all interfaces.
|
| 312 |
+
dashboard_http_host = (
|
| 313 |
+
self.ip
|
| 314 |
+
if self.http_host != ray_constants.DEFAULT_DASHBOARD_IP
|
| 315 |
+
else http_host
|
| 316 |
+
)
|
| 317 |
+
# This synchronous code inside an async context is not great.
|
| 318 |
+
# It is however acceptable, because this only gets run once
|
| 319 |
+
# during initialization and therefore cannot block the event loop.
|
| 320 |
+
# This could be done better in the future, including
|
| 321 |
+
# removing the polling on the Ray side, by communicating the
|
| 322 |
+
# server address to Ray via stdin / stdout or a pipe.
|
| 323 |
+
self.gcs_client.internal_kv_put(
|
| 324 |
+
ray_constants.DASHBOARD_ADDRESS.encode(),
|
| 325 |
+
f"{dashboard_http_host}:{http_port}".encode(),
|
| 326 |
+
True,
|
| 327 |
+
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
|
| 328 |
+
)
|
| 329 |
+
self.gcs_client.internal_kv_put(
|
| 330 |
+
dashboard_consts.DASHBOARD_RPC_ADDRESS.encode(),
|
| 331 |
+
f"{self.ip}:{self.grpc_port}".encode(),
|
| 332 |
+
True,
|
| 333 |
+
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
# Freeze signal after all modules loaded.
|
| 337 |
+
dashboard_utils.SignalManager.freeze()
|
| 338 |
+
concurrent_tasks = [
|
| 339 |
+
self._gcs_check_alive(),
|
| 340 |
+
_async_notify(),
|
| 341 |
+
DataOrganizer.purge(),
|
| 342 |
+
DataOrganizer.organize(self._executor),
|
| 343 |
+
]
|
| 344 |
+
for m in modules:
|
| 345 |
+
concurrent_tasks.append(m.run(self.server))
|
| 346 |
+
if self.server:
|
| 347 |
+
concurrent_tasks.append(self.server.wait_for_termination())
|
| 348 |
+
await asyncio.gather(*concurrent_tasks)
|
| 349 |
+
|
| 350 |
+
if self.http_server:
|
| 351 |
+
await self.http_server.cleanup()
|
.venv/lib/python3.11/site-packages/ray/dashboard/http_server_agent.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from packaging.version import Version
|
| 4 |
+
|
| 5 |
+
import ray.dashboard.optional_utils as dashboard_optional_utils
|
| 6 |
+
from ray._private.utils import get_or_create_event_loop
|
| 7 |
+
from ray.dashboard.optional_deps import aiohttp, aiohttp_cors, hdrs
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
routes = dashboard_optional_utils.DashboardAgentRouteTable
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HttpServerAgent:
|
| 14 |
+
def __init__(self, ip, listen_port):
|
| 15 |
+
self.ip = ip
|
| 16 |
+
self.listen_port = listen_port
|
| 17 |
+
self.http_host = None
|
| 18 |
+
self.http_port = None
|
| 19 |
+
self.http_session = None
|
| 20 |
+
self.runner = None
|
| 21 |
+
|
| 22 |
+
async def start(self, modules):
|
| 23 |
+
# Create a http session for all modules.
|
| 24 |
+
# aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
|
| 25 |
+
if Version(aiohttp.__version__) < Version("4.0.0"):
|
| 26 |
+
self.http_session = aiohttp.ClientSession(loop=get_or_create_event_loop())
|
| 27 |
+
else:
|
| 28 |
+
self.http_session = aiohttp.ClientSession()
|
| 29 |
+
|
| 30 |
+
# Bind routes for every module so that each module
|
| 31 |
+
# can use decorator-style routes.
|
| 32 |
+
for c in modules:
|
| 33 |
+
dashboard_optional_utils.DashboardAgentRouteTable.bind(c)
|
| 34 |
+
|
| 35 |
+
app = aiohttp.web.Application()
|
| 36 |
+
app.add_routes(routes=routes.bound_routes())
|
| 37 |
+
|
| 38 |
+
# Enable CORS on all routes.
|
| 39 |
+
cors = aiohttp_cors.setup(
|
| 40 |
+
app,
|
| 41 |
+
defaults={
|
| 42 |
+
"*": aiohttp_cors.ResourceOptions(
|
| 43 |
+
allow_credentials=True,
|
| 44 |
+
expose_headers="*",
|
| 45 |
+
allow_methods="*",
|
| 46 |
+
allow_headers=("Content-Type", "X-Header"),
|
| 47 |
+
)
|
| 48 |
+
},
|
| 49 |
+
)
|
| 50 |
+
for route in list(app.router.routes()):
|
| 51 |
+
cors.add(route)
|
| 52 |
+
|
| 53 |
+
self.runner = aiohttp.web.AppRunner(app)
|
| 54 |
+
await self.runner.setup()
|
| 55 |
+
try:
|
| 56 |
+
site = aiohttp.web.TCPSite(
|
| 57 |
+
self.runner,
|
| 58 |
+
"127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
|
| 59 |
+
self.listen_port,
|
| 60 |
+
)
|
| 61 |
+
await site.start()
|
| 62 |
+
except OSError as e:
|
| 63 |
+
logger.error(
|
| 64 |
+
f"Agent port #{self.listen_port} already in use. "
|
| 65 |
+
"Failed to start agent. "
|
| 66 |
+
f"Ensure port #{self.listen_port} is available, and then try again."
|
| 67 |
+
)
|
| 68 |
+
raise e
|
| 69 |
+
self.http_host, self.http_port, *_ = site._server.sockets[0].getsockname()
|
| 70 |
+
logger.info(
|
| 71 |
+
"Dashboard agent http address: %s:%s", self.http_host, self.http_port
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# Dump registered http routes.
|
| 75 |
+
dump_routes = [r for r in app.router.routes() if r.method != hdrs.METH_HEAD]
|
| 76 |
+
for r in dump_routes:
|
| 77 |
+
logger.info(r)
|
| 78 |
+
logger.info("Registered %s routes.", len(dump_routes))
|
| 79 |
+
|
| 80 |
+
async def cleanup(self):
|
| 81 |
+
# Wait for finish signal.
|
| 82 |
+
await self.runner.cleanup()
|
| 83 |
+
await self.http_session.close()
|
.venv/lib/python3.11/site-packages/ray/dashboard/http_server_head.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import errno
|
| 3 |
+
import ipaddress
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import pathlib
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
from math import floor
|
| 10 |
+
|
| 11 |
+
from packaging.version import Version
|
| 12 |
+
|
| 13 |
+
import ray
|
| 14 |
+
import ray.dashboard.optional_utils as dashboard_optional_utils
|
| 15 |
+
import ray.dashboard.timezone_utils as timezone_utils
|
| 16 |
+
import ray.dashboard.utils as dashboard_utils
|
| 17 |
+
from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
|
| 18 |
+
from ray._private.utils import get_or_create_event_loop
|
| 19 |
+
from ray.dashboard.dashboard_metrics import DashboardPrometheusMetrics
|
| 20 |
+
|
| 21 |
+
# All third-party dependencies that are not included in the minimal Ray
|
| 22 |
+
# installation must be included in this file. This allows us to determine if
|
| 23 |
+
# the agent has the necessary dependencies to be started.
|
| 24 |
+
from ray.dashboard.optional_deps import aiohttp, hdrs
|
| 25 |
+
|
| 26 |
+
# Logger for this module. It should be configured at the entry point
|
| 27 |
+
# into the program using Ray. Ray provides a default configuration at
|
| 28 |
+
# entry/init points.
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
routes = dashboard_optional_utils.DashboardHeadRouteTable
|
| 31 |
+
|
| 32 |
+
# Env var that enables follow_symlinks for serving UI static files.
|
| 33 |
+
# This is an advanced setting that should only be used with special Ray installations
|
| 34 |
+
# where the dashboard build files are symlinked to a different directory.
|
| 35 |
+
# This is not recommended for most users and can pose a security risk.
|
| 36 |
+
# Please reference the aiohttp docs here:
|
| 37 |
+
# https://docs.aiohttp.org/en/stable/web_reference.html#aiohttp.web.UrlDispatcher.add_static
|
| 38 |
+
ENV_VAR_FOLLOW_SYMLINKS = "RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS"
|
| 39 |
+
FOLLOW_SYMLINKS_ENABLED = os.environ.get(ENV_VAR_FOLLOW_SYMLINKS) == "1"
|
| 40 |
+
if FOLLOW_SYMLINKS_ENABLED:
|
| 41 |
+
logger.warning(
|
| 42 |
+
"Enabling RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS is not recommended as it "
|
| 43 |
+
"allows symlinks to directories outside the dashboard build folder. "
|
| 44 |
+
"You may accidentally expose files on your system outside of the "
|
| 45 |
+
"build directory."
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def setup_static_dir():
|
| 50 |
+
build_dir = os.path.join(
|
| 51 |
+
os.path.dirname(os.path.abspath(__file__)), "client", "build"
|
| 52 |
+
)
|
| 53 |
+
module_name = os.path.basename(os.path.dirname(__file__))
|
| 54 |
+
if not os.path.isdir(build_dir):
|
| 55 |
+
raise dashboard_utils.FrontendNotFoundError(
|
| 56 |
+
errno.ENOENT,
|
| 57 |
+
"Dashboard build directory not found. If installing "
|
| 58 |
+
"from source, please follow the additional steps "
|
| 59 |
+
"required to build the dashboard"
|
| 60 |
+
f"(cd python/ray/{module_name}/client "
|
| 61 |
+
"&& npm ci "
|
| 62 |
+
"&& npm run build)",
|
| 63 |
+
build_dir,
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
static_dir = os.path.join(build_dir, "static")
|
| 67 |
+
routes.static("/static", static_dir, follow_symlinks=FOLLOW_SYMLINKS_ENABLED)
|
| 68 |
+
return build_dir
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class HttpServerDashboardHead:
|
| 72 |
+
def __init__(
|
| 73 |
+
self,
|
| 74 |
+
ip: str,
|
| 75 |
+
http_host: str,
|
| 76 |
+
http_port: int,
|
| 77 |
+
http_port_retries: int,
|
| 78 |
+
gcs_address: str,
|
| 79 |
+
session_name: str,
|
| 80 |
+
metrics: DashboardPrometheusMetrics,
|
| 81 |
+
):
|
| 82 |
+
self.ip = ip
|
| 83 |
+
self.http_host = http_host
|
| 84 |
+
self.http_port = http_port
|
| 85 |
+
self.http_port_retries = http_port_retries
|
| 86 |
+
self.head_node_ip = gcs_address.split(":")[0]
|
| 87 |
+
self.metrics = metrics
|
| 88 |
+
self._session_name = session_name
|
| 89 |
+
|
| 90 |
+
# Below attirubtes are filled after `run` API is invoked.
|
| 91 |
+
self.runner = None
|
| 92 |
+
|
| 93 |
+
# Setup Dashboard Routes
|
| 94 |
+
try:
|
| 95 |
+
build_dir = setup_static_dir()
|
| 96 |
+
logger.info("Setup static dir for dashboard: %s", build_dir)
|
| 97 |
+
except dashboard_utils.FrontendNotFoundError as ex:
|
| 98 |
+
# Not to raise FrontendNotFoundError due to NPM incompatibilities
|
| 99 |
+
# with Windows.
|
| 100 |
+
# Please refer to ci.sh::build_dashboard_front_end()
|
| 101 |
+
if sys.platform in ["win32", "cygwin"]:
|
| 102 |
+
logger.warning(ex)
|
| 103 |
+
else:
|
| 104 |
+
raise ex
|
| 105 |
+
dashboard_optional_utils.DashboardHeadRouteTable.bind(self)
|
| 106 |
+
|
| 107 |
+
# Create a http session for all modules.
|
| 108 |
+
# aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
|
| 109 |
+
if Version(aiohttp.__version__) < Version("4.0.0"):
|
| 110 |
+
self.http_session = aiohttp.ClientSession(loop=get_or_create_event_loop())
|
| 111 |
+
else:
|
| 112 |
+
self.http_session = aiohttp.ClientSession()
|
| 113 |
+
|
| 114 |
+
@routes.get("/")
|
| 115 |
+
async def get_index(self, req) -> aiohttp.web.FileResponse:
|
| 116 |
+
try:
|
| 117 |
+
# This API will be no-op after the first report.
|
| 118 |
+
# Note: We always record the usage, but it is not reported
|
| 119 |
+
# if the usage stats is disabled.
|
| 120 |
+
record_extra_usage_tag(TagKey.DASHBOARD_USED, "True")
|
| 121 |
+
except Exception as e:
|
| 122 |
+
logger.warning(
|
| 123 |
+
"Failed to record the dashboard usage. "
|
| 124 |
+
"This error message is harmless and can be ignored. "
|
| 125 |
+
f"Error: {e}"
|
| 126 |
+
)
|
| 127 |
+
resp = aiohttp.web.FileResponse(
|
| 128 |
+
os.path.join(
|
| 129 |
+
os.path.dirname(os.path.abspath(__file__)), "client/build/index.html"
|
| 130 |
+
)
|
| 131 |
+
)
|
| 132 |
+
resp.headers["Cache-Control"] = "no-cache"
|
| 133 |
+
return resp
|
| 134 |
+
|
| 135 |
+
@routes.get("/favicon.ico")
|
| 136 |
+
async def get_favicon(self, req) -> aiohttp.web.FileResponse:
|
| 137 |
+
return aiohttp.web.FileResponse(
|
| 138 |
+
os.path.join(
|
| 139 |
+
os.path.dirname(os.path.abspath(__file__)), "client/build/favicon.ico"
|
| 140 |
+
)
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
@routes.get("/timezone")
|
| 144 |
+
async def get_timezone(self, req) -> aiohttp.web.Response:
|
| 145 |
+
try:
|
| 146 |
+
current_timezone = timezone_utils.get_current_timezone_info()
|
| 147 |
+
return aiohttp.web.json_response(current_timezone)
|
| 148 |
+
|
| 149 |
+
except Exception as e:
|
| 150 |
+
logger.error(f"Error getting timezone: {e}")
|
| 151 |
+
return aiohttp.web.Response(
|
| 152 |
+
status=500, text="Internal Server Error:" + str(e)
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
def get_address(self):
|
| 156 |
+
assert self.http_host and self.http_port
|
| 157 |
+
return self.http_host, self.http_port
|
| 158 |
+
|
| 159 |
+
@aiohttp.web.middleware
|
| 160 |
+
async def path_clean_middleware(self, request, handler):
|
| 161 |
+
if request.path.startswith("/static") or request.path.startswith("/logs"):
|
| 162 |
+
parent = pathlib.PurePosixPath(
|
| 163 |
+
"/logs" if request.path.startswith("/logs") else "/static"
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
# If the destination is not relative to the expected directory,
|
| 167 |
+
# then the user is attempting path traversal, so deny the request.
|
| 168 |
+
request_path = pathlib.PurePosixPath(
|
| 169 |
+
pathlib.posixpath.realpath(request.path)
|
| 170 |
+
)
|
| 171 |
+
if request_path != parent and parent not in request_path.parents:
|
| 172 |
+
logger.info(
|
| 173 |
+
f"Rejecting {request_path=} because it is not relative to {parent=}"
|
| 174 |
+
)
|
| 175 |
+
raise aiohttp.web.HTTPForbidden()
|
| 176 |
+
return await handler(request)
|
| 177 |
+
|
| 178 |
+
@aiohttp.web.middleware
|
| 179 |
+
async def browsers_no_post_put_middleware(self, request, handler):
|
| 180 |
+
if (
|
| 181 |
+
# A best effort test for browser traffic. All common browsers
|
| 182 |
+
# start with Mozilla at the time of writing.
|
| 183 |
+
dashboard_optional_utils.is_browser_request(request)
|
| 184 |
+
and request.method in [hdrs.METH_POST, hdrs.METH_PUT]
|
| 185 |
+
):
|
| 186 |
+
return aiohttp.web.Response(
|
| 187 |
+
status=405, text="Method Not Allowed for browser traffic."
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
return await handler(request)
|
| 191 |
+
|
| 192 |
+
@aiohttp.web.middleware
|
| 193 |
+
async def metrics_middleware(self, request, handler):
|
| 194 |
+
start_time = time.monotonic()
|
| 195 |
+
|
| 196 |
+
try:
|
| 197 |
+
response = await handler(request)
|
| 198 |
+
status_tag = f"{floor(response.status / 100)}xx"
|
| 199 |
+
return response
|
| 200 |
+
except (Exception, asyncio.CancelledError):
|
| 201 |
+
status_tag = "5xx"
|
| 202 |
+
raise
|
| 203 |
+
finally:
|
| 204 |
+
resp_time = time.monotonic() - start_time
|
| 205 |
+
try:
|
| 206 |
+
self.metrics.metrics_request_duration.labels(
|
| 207 |
+
endpoint=handler.__name__,
|
| 208 |
+
http_status=status_tag,
|
| 209 |
+
Version=ray.__version__,
|
| 210 |
+
SessionName=self._session_name,
|
| 211 |
+
Component="dashboard",
|
| 212 |
+
).observe(resp_time)
|
| 213 |
+
self.metrics.metrics_request_count.labels(
|
| 214 |
+
method=request.method,
|
| 215 |
+
endpoint=handler.__name__,
|
| 216 |
+
http_status=status_tag,
|
| 217 |
+
Version=ray.__version__,
|
| 218 |
+
SessionName=self._session_name,
|
| 219 |
+
Component="dashboard",
|
| 220 |
+
).inc()
|
| 221 |
+
except Exception as e:
|
| 222 |
+
logger.exception(f"Error emitting api metrics: {e}")
|
| 223 |
+
|
| 224 |
+
@aiohttp.web.middleware
|
| 225 |
+
async def cache_control_static_middleware(self, request, handler):
    """Forward the request and mark ``/static`` responses as cacheable.

    The handler is always awaited exactly once. When the request path
    starts with ``/static`` the response additionally gets a
    ``Cache-Control: max-age=31536000`` header before being returned.
    """
    is_static_path = request.path.startswith("/static")
    response = await handler(request)
    if is_static_path:
        response.headers["Cache-Control"] = "max-age=31536000"
    return response
|
| 231 |
+
|
| 232 |
+
async def run(self, modules):
    """Bind module routes, start the HTTP server and log what was bound.

    Starting at ``self.http_port``, the port number is incremented by one
    after every ``OSError`` from starting the site, for at most
    ``1 + self.http_port_retries`` attempts. On success ``self.http_host``
    and ``self.http_port`` are overwritten with the address that was
    actually bound.

    Raises:
        Exception: if every attempt to start the site raised ``OSError``.
    """
    # Bind http routes of each module.
    for c in modules:
        dashboard_optional_utils.DashboardHeadRouteTable.bind(c)

    # Http server should be initialized after all modules loaded.
    # working_dir uploads for job submission can be up to 100MiB.
    app = aiohttp.web.Application(
        client_max_size=100 * 1024**2,
        middlewares=[
            self.metrics_middleware,
            self.path_clean_middleware,
            self.browsers_no_post_put_middleware,
            self.cache_control_static_middleware,
        ],
    )
    # NOTE(review): `routes` is a module-level name defined outside this
    # method — presumably the same route table used for binding above.
    app.add_routes(routes=routes.bound_routes())

    self.runner = aiohttp.web.AppRunner(
        app,
        access_log_format=(
            "%a %t '%r' %s %b bytes %D us " "'%{Referer}i' '%{User-Agent}i'"
        ),
    )
    await self.runner.setup()
    last_ex = None
    for i in range(1 + self.http_port_retries):
        try:
            site = aiohttp.web.TCPSite(self.runner, self.http_host, self.http_port)
            await site.start()
            break
        except OSError as e:
            last_ex = e
            # Move on to the next port; the warning reports the port that
            # will be tried next, not the one that just failed.
            self.http_port += 1
            logger.warning("Try to use port %s: %s", self.http_port, e)
    else:
        # for/else: reached only when the loop ended without `break`,
        # i.e. no attempt managed to start the site.
        raise Exception(
            f"Failed to find a valid port for dashboard after "
            f"{self.http_port_retries} retries: {last_ex}"
        )
    # Read back the address the listening socket is really bound to. This
    # goes through the private `_server` attribute of the site object.
    self.http_host, self.http_port, *_ = site._server.sockets[0].getsockname()
    # An unspecified bind address is replaced with self.ip for reporting.
    self.http_host = (
        self.ip
        if ipaddress.ip_address(self.http_host).is_unspecified
        else self.http_host
    )
    logger.info(
        "Dashboard head http address: %s:%s", self.http_host, self.http_port
    )
    # Dump registered http routes.
    dump_routes = [r for r in app.router.routes() if r.method != hdrs.METH_HEAD]
    for r in dump_routes:
        logger.info(r)
    logger.info("Registered %s routes.", len(dump_routes))
|
| 286 |
+
|
| 287 |
+
async def cleanup(self):
    """Await ``self.runner.cleanup()`` to shut the HTTP runner down."""
    runner = self.runner
    # Wait for finish signal.
    await runner.cleanup()
|
.venv/lib/python3.11/site-packages/ray/dashboard/k8s_utils.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
from ray._private.utils import get_num_cpus
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
|
| 8 |
+
CPU_USAGE_PATH_V2 = "/sys/fs/cgroup/cpu.stat"
|
| 9 |
+
PROC_STAT_PATH = "/proc/stat"
|
| 10 |
+
|
| 11 |
+
container_num_cpus = None
|
| 12 |
+
host_num_cpus = None
|
| 13 |
+
|
| 14 |
+
last_cpu_usage = None
|
| 15 |
+
last_system_usage = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def cpu_percent():
    """Estimate the CPU usage percentage of a Ray pod run by the Kubernetes
    Operator.

    The estimate is built in two steps:
    (1) Mirror the calculation of the 'docker stats' cli command, see
        https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
        The change in container cpu usage is divided by the change in total
        host cpu usage, the latter averaged over the host's cpus.
    (2) Divide by the number of CPUs available to the container, so that
        e.g. full use of 2 CPUs reads as 100% instead of 200%.

    The first call has no previous sample to diff against and returns 0.0.
    The result is capped at 100.0. Any error is logged and reported as 0.0;
    in that case the stored previous sample is left untouched.
    """  # noqa
    global last_system_usage
    global last_cpu_usage
    try:
        current_cpu = _cpu_usage()
        current_system = _system_usage()

        # Stays 0.0 on the first call, when there is nothing to diff against.
        percent = 0.0
        if last_system_usage is not None:
            container_delta = current_cpu - last_cpu_usage
            # "System time passed." (Typically close to clock time.)
            host_delta = (current_system - last_system_usage) / _host_num_cpus()
            percent = round(container_delta / host_delta * 100 / get_num_cpus(), 1)

        last_system_usage = current_system
        last_cpu_usage = current_cpu
        # Computed percentage might be slightly above 100%.
        return min(percent, 100.0)
    except Exception:
        logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
        return 0.0
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _cpu_usage():
    """Return the container's cumulative CPU usage in nanoseconds.

    ``CPU_USAGE_PATH`` (cgroups v1 cpuacct) is tried first; its content is
    returned as an integer. If that file does not exist, the first line of
    ``CPU_USAGE_PATH_V2`` (cgroups v2 cpu.stat) is parsed instead: its second
    word is a microsecond count, which is converted to nanoseconds.

    Raises:
        FileNotFoundError: if neither file exists.
        ValueError, IndexError: if the file content cannot be parsed.
    """
    try:
        # cgroups v1
        with open(CPU_USAGE_PATH) as usage_file:
            return int(usage_file.read())
    except FileNotFoundError:
        # cgroups v2
        with open(CPU_USAGE_PATH_V2) as stat_file:
            # e.g. "usage_usec 16089294616"
            first_line = stat_file.readline()
        # The second word of the first line is the CPU usage in microseconds.
        cpu_usec = int(first_line.split()[1])
        # Convert to nanoseconds and return.
        return cpu_usec * 1000
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _system_usage():
    """
    Computes total CPU usage of the host in nanoseconds.

    Only the first line of ``PROC_STAT_PATH`` (the aggregate ``cpu`` line) is
    read. Its first seven counters are summed and converted from clock ticks
    to nanoseconds, assuming 100 ticks per second.

    Logic taken from here:
    https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31

    See also the /proc/stat entry here:
    https://man7.org/linux/man-pages/man5/proc.5.html

    Raises:
        ValueError: if the first line is not the aggregate ``cpu`` line or a
            counter is not an integer.
    """  # noqa
    with open(PROC_STAT_PATH) as proc_stat:
        parts = proc_stat.readline().split()
    # Explicit check rather than `assert`, which is stripped under `-O`.
    if not parts or parts[0] != "cpu":
        raise ValueError(
            f"Unexpected first line in {PROC_STAT_PATH}: expected the "
            "aggregate 'cpu' line."
        )
    usage_data = parts[1:8]
    total_clock_ticks = sum(int(entry) for entry in usage_data)
    # 100 clock ticks per second, 10^9 ns per second
    return total_clock_ticks * 10**7
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _host_num_cpus():
    """Return the number of host CPUs listed in ``PROC_STAT_PATH``.

    The file is parsed on the first call only; the result is cached in the
    module-level ``host_num_cpus`` and reused afterwards.
    """
    global host_num_cpus
    if host_num_cpus is None:
        cpu_line_count = 0
        with open(PROC_STAT_PATH) as proc_stat:
            for line in proc_stat:
                fields = line.split()
                if fields and "cpu" in fields[0]:
                    cpu_line_count += 1
        # Number of lines starting with a word including 'cpu', subtracting
        # 1 for the first summary line.
        host_num_cpus = cpu_line_count - 1
    return host_num_cpus
|
.venv/lib/python3.11/site-packages/ray/dashboard/memory_utils.py
ADDED
|
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import logging
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from typing import List
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray._private.internal_api import node_stats
|
| 9 |
+
from ray._raylet import ActorID, JobID, TaskID
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
# These values are used to calculate if objectRefs are actor handles.
|
| 14 |
+
TASKID_BYTES_SIZE = TaskID.size()
|
| 15 |
+
ACTORID_BYTES_SIZE = ActorID.size()
|
| 16 |
+
JOBID_BYTES_SIZE = JobID.size()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def decode_object_ref_if_needed(object_ref: str) -> bytes:
    """Turn a textual objectRef into its binary form.

    An objectRef inside a gRPC reply arrives Base64 encoded, but sometimes
    it has already been decoded into a hex string. The two cases are told
    apart by a trailing ``=``: per the original note, a Base64 encoded
    objectRef always carries ``=`` padding because objectRef is always 20B.
    Anything else is treated as hex and converted with
    ``ray._private.utils.hex_to_binary``.
    """
    looks_base64 = object_ref.endswith("=")
    if not looks_base64:
        return ray._private.utils.hex_to_binary(object_ref)
    return base64.standard_b64decode(object_ref)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SortingType(Enum):
    """Entry attribute by which a memory table can be sorted."""

    # NOTE(review): the value 2 is unused — presumably a removed member.
    PID = 1
    OBJECT_SIZE = 3
    REFERENCE_TYPE = 4
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class GroupByType(Enum):
    """Entry attribute by which memory table entries can be grouped."""

    NODE_ADDRESS = "node"
    STACK_TRACE = "stack_trace"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class ReferenceType(Enum):
    """Classification of why an object reference is being held."""

    # Each member's value is its own name as a plain string. Code in this
    # module passes around `.value` rather than the member itself, because
    # enum members are not json serializable.
    ACTOR_HANDLE = "ACTOR_HANDLE"
    PINNED_IN_MEMORY = "PINNED_IN_MEMORY"
    LOCAL_REFERENCE = "LOCAL_REFERENCE"
    USED_BY_PENDING_TASK = "USED_BY_PENDING_TASK"
    CAPTURED_IN_OBJECT = "CAPTURED_IN_OBJECT"
    UNKNOWN_STATUS = "UNKNOWN_STATUS"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_sorting_type(sort_by: str):
    """Translate string input into SortingType instance.

    The lookup is case-insensitive and matches SortingType member names.

    Args:
        sort_by: "PID", "OBJECT_SIZE" or "REFERENCE_TYPE" in any letter case.

    Returns:
        The SortingType member with that name.

    Raises:
        ValueError: if ``sort_by`` names no SortingType member. ValueError is
            a subclass of the plain ``Exception`` raised previously, so
            existing ``except Exception`` handlers keep working.
    """
    try:
        return SortingType[sort_by.upper()]
    except KeyError:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which would splice the next source line's leading
        # whitespace (or none at all) into the message.
        raise ValueError(
            "The sort-by input provided is not one of "
            "PID, OBJECT_SIZE, or REFERENCE_TYPE."
        ) from None
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_group_by_type(group_by: str):
    """Translate string input into GroupByType instance.

    The lookup is case-insensitive and matches GroupByType member names
    (not their values).

    Args:
        group_by: "NODE_ADDRESS" or "STACK_TRACE" in any letter case.

    Returns:
        The GroupByType member with that name.

    Raises:
        ValueError: if ``group_by`` names no GroupByType member. ValueError
            is a subclass of the plain ``Exception`` raised previously, so
            existing ``except Exception`` handlers keep working.
    """
    try:
        return GroupByType[group_by.upper()]
    except KeyError:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which would splice the next source line's leading
        # whitespace (or none at all) into the message.
        raise ValueError(
            "The group-by input provided is not one of "
            "NODE_ADDRESS or STACK_TRACE."
        ) from None
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class MemoryTableEntry:
    """One object reference held by one worker, as a row of the memory table.

    The constructor copies the worker information it is given and extracts
    the object and reference-count fields from the ``object_ref`` dict,
    substituting defaults for missing keys (only ``"objectId"`` is
    required).
    """

    def __init__(
        self, *, object_ref: dict, node_address: str, is_driver: bool, pid: int
    ):
        # worker info
        self.is_driver = is_driver
        self.pid = pid
        self.node_address = node_address

        # object info
        self.task_status = object_ref.get("taskStatus", "?")
        if self.task_status == "NIL":
            self.task_status = "-"
        # Stored one higher than the reported attempt number.
        self.attempt_number = int(object_ref.get("attemptNumber", 0)) + 1
        # -1 marks a size that was not reported.
        self.object_size = int(object_ref.get("objectSize", -1))
        self.call_site = object_ref.get("callSite", "<Unknown>")
        if len(self.call_site) == 0:
            self.call_site = "disabled"
        self.object_ref = ray.ObjectRef(
            decode_object_ref_if_needed(object_ref["objectId"])
        )

        # reference info
        self.local_ref_count = int(object_ref.get("localRefCount", 0))
        self.pinned_in_memory = bool(object_ref.get("pinnedInMemory", False))
        self.submitted_task_ref_count = int(object_ref.get("submittedTaskRefCount", 0))
        # The loop variable has its own name so it does not shadow the
        # `object_ref` parameter.
        self.contained_in_owned = [
            ray.ObjectRef(decode_object_ref_if_needed(contained_ref))
            for contained_ref in object_ref.get("containedInOwned", [])
        ]
        self.reference_type = self._get_reference_type()

    def is_valid(self) -> bool:
        """Return True if the entry has some reference info and a non-nil ref."""
        # If the entry doesn't have a reference type or some invalid state,
        # (e.g., no object ref presented), it is considered invalid.
        if (
            not self.pinned_in_memory
            and self.local_ref_count == 0
            and self.submitted_task_ref_count == 0
            and len(self.contained_in_owned) == 0
        ):
            return False
        elif self.object_ref.is_nil():
            return False
        else:
            return True

    def group_key(self, group_by_type: GroupByType) -> str:
        """Return the value this entry is grouped under for ``group_by_type``.

        Raises:
            ValueError: if ``group_by_type`` is not a known GroupByType.
        """
        if group_by_type == GroupByType.NODE_ADDRESS:
            return self.node_address
        elif group_by_type == GroupByType.STACK_TRACE:
            return self.call_site
        else:
            raise ValueError(f"group by type {group_by_type} is invalid.")

    def _get_reference_type(self) -> str:
        """Classify the entry; returns the string value of a ReferenceType.

        The checks are ordered: actor handle, pinned, used by a pending
        task, local reference, captured in another object, else unknown.
        """
        if self._is_object_ref_actor_handle():
            return ReferenceType.ACTOR_HANDLE.value
        if self.pinned_in_memory:
            return ReferenceType.PINNED_IN_MEMORY.value
        elif self.submitted_task_ref_count > 0:
            return ReferenceType.USED_BY_PENDING_TASK.value
        elif self.local_ref_count > 0:
            return ReferenceType.LOCAL_REFERENCE.value
        elif len(self.contained_in_owned) > 0:
            return ReferenceType.CAPTURED_IN_OBJECT.value
        else:
            return ReferenceType.UNKNOWN_STATUS.value

    def _is_object_ref_actor_handle(self) -> bool:
        """Return True if the object ref's hex form looks like an actor handle."""
        object_ref_hex = self.object_ref.hex()

        # Widths are in hex characters (two per byte), not bits.
        task_unique_hex_len = (TASKID_BYTES_SIZE - ACTORID_BYTES_SIZE) * 2
        actor_unique_hex_len = (ACTORID_BYTES_SIZE - JOBID_BYTES_SIZE) * 2

        # The hex string starts with the task's random part, followed by the
        # actor's random part. If the task part is all 'f' but the actor
        # part is not, that means it is an actor creation task, which is an
        # actor handle.
        task_unique_hex = object_ref_hex[:task_unique_hex_len]
        actor_unique_hex = object_ref_hex[
            task_unique_hex_len : task_unique_hex_len + actor_unique_hex_len
        ]
        # Compare against the computed widths instead of hard-coded 16/24 so
        # the check stays correct if the ID sizes ever differ.
        return (
            task_unique_hex == "f" * task_unique_hex_len
            and actor_unique_hex != "f" * actor_unique_hex_len
        )

    def as_dict(self):
        """Return the entry as a dict of plain values (refs as hex strings)."""
        return {
            "object_ref": self.object_ref.hex(),
            "pid": self.pid,
            "node_ip_address": self.node_address,
            "object_size": self.object_size,
            "reference_type": self.reference_type,
            "call_site": self.call_site,
            "task_status": self.task_status,
            "attempt_number": self.attempt_number,
            "local_ref_count": self.local_ref_count,
            "pinned_in_memory": self.pinned_in_memory,
            "submitted_task_ref_count": self.submitted_task_ref_count,
            "contained_in_owned": [
                contained_ref.hex() for contained_ref in self.contained_in_owned
            ],
            "type": "Driver" if self.is_driver else "Worker",
        }

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return str(self.as_dict())
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class MemoryTable:
    """A list of memory table entries with optional grouping and a summary.

    ``group`` maps each group key to a nested MemoryTable holding that
    group's entries; ``summary`` holds the totals computed by ``summarize``.
    """

    def __init__(
        self,
        entries: List[MemoryTableEntry],
        group_by_type: GroupByType = GroupByType.NODE_ADDRESS,
        sort_by_type: SortingType = SortingType.PID,
    ):
        self.table = entries
        # Nested memory tables, keyed by group key.
        self.group = {}
        self.summary = defaultdict(int)
        # NOTE YOU MUST SORT TABLE BEFORE GROUPING.
        # self._group_by(..)._sort_by(..) != self._sort_by(..)._group_by(..)
        if group_by_type:
            if sort_by_type:
                self.setup(group_by_type, sort_by_type)
            else:
                self._group_by(group_by_type)
        elif sort_by_type:
            self._sort_by(sort_by_type)

    def setup(self, group_by_type: GroupByType, sort_by_type: SortingType):
        """Sort the entries, group them, then summarize groups and table.

        Sorting happens first so that each group keeps the sort order.
        Returns ``self``.
        """
        self._sort_by(sort_by_type)
        self._group_by(group_by_type)
        for sub_table in self.group.values():
            sub_table.summarize()
        self.summarize()
        return self

    def insert_entry(self, entry: MemoryTableEntry):
        """Append ``entry`` to the table; groups and summary are not updated."""
        self.table.append(entry)

    def summarize(self):
        """Recompute ``self.summary`` from the current entries; returns ``self``."""

        def count_of(reference_type):
            # Number of entries classified as the given reference type.
            return sum(
                1
                for entry in self.table
                if entry.reference_type == reference_type.value
            )

        # Only positive sizes contribute to the total.
        size_total = sum(
            entry.object_size for entry in self.table if entry.object_size > 0
        )

        self.summary = {
            "total_object_size": size_total,
            "total_local_ref_count": count_of(ReferenceType.LOCAL_REFERENCE),
            "total_pinned_in_memory": count_of(ReferenceType.PINNED_IN_MEMORY),
            "total_used_by_pending_task": count_of(
                ReferenceType.USED_BY_PENDING_TASK
            ),
            "total_captured_in_objects": count_of(ReferenceType.CAPTURED_IN_OBJECT),
            "total_actor_handles": count_of(ReferenceType.ACTOR_HANDLE),
        }
        return self

    def _sort_by(self, sorting_type: SortingType):
        """Sort the table in place, ascending, by the given attribute."""
        if sorting_type == SortingType.PID:

            def sort_key(entry):
                return entry.pid

        elif sorting_type == SortingType.OBJECT_SIZE:

            def sort_key(entry):
                return entry.object_size

        elif sorting_type == SortingType.REFERENCE_TYPE:

            def sort_key(entry):
                return entry.reference_type

        else:
            raise ValueError(f"Give sorting type: {sorting_type} is invalid.")
        self.table.sort(key=sort_key)
        return self

    def _group_by(self, group_by_type: GroupByType):
        """Rebuild ``self.group`` from the table and summarize every group.

        NOTE: Each group is another MemoryTable.
        """
        # Drop any previous grouping.
        self.group = {}

        # Collect the entries belonging to each group key.
        buckets = defaultdict(list)
        for entry in self.table:
            buckets[entry.group_key(group_by_type)].append(entry)

        # Wrap every bucket in its own (ungrouped, unsorted) table.
        for key, members in buckets.items():
            self.group[key] = MemoryTable(
                members, group_by_type=None, sort_by_type=None
            )
        for sub_table in self.group.values():
            sub_table.summarize()
        return self

    def as_dict(self):
        """Return the summary plus, per group, its entries and summary."""
        groups = {}
        for key, sub_table in self.group.items():
            groups[key] = {
                "entries": sub_table.get_entries(),
                "summary": sub_table.summary,
            }
        return {"summary": self.summary, "group": groups}

    def get_entries(self) -> List[dict]:
        """Return every entry of the table converted with ``as_dict``."""
        rows = []
        for entry in self.table:
            rows.append(entry.as_dict())
        return rows

    def __repr__(self):
        return str(self.as_dict())

    def __str__(self):
        return self.__repr__()
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def construct_memory_table(
    workers_stats: List,
    group_by: GroupByType = GroupByType.NODE_ADDRESS,
    sort_by=SortingType.OBJECT_SIZE,
) -> MemoryTable:
    """Build a MemoryTable from per-worker stats dicts.

    Each worker dict must provide ``"pid"`` and ``"ipAddress"``;
    ``"workerType"`` and ``"objectRefs"`` are optional. One entry is created
    per object ref and kept only if ``is_valid()`` accepts it.
    """
    valid_entries = []
    for worker in workers_stats:
        pid = worker["pid"]
        is_driver = worker.get("workerType") == "DRIVER"
        node_address = worker["ipAddress"]

        candidates = (
            MemoryTableEntry(
                object_ref=ref,
                node_address=node_address,
                is_driver=is_driver,
                pid=pid,
            )
            for ref in worker.get("objectRefs", [])
        )
        valid_entries.extend(entry for entry in candidates if entry.is_valid())

    return MemoryTable(valid_entries, group_by_type=group_by, sort_by_type=sort_by)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def track_reference_size(group):
    """Return total object size per summary key for one memory table group.

    Args:
        group: dict with an ``"entries"`` list; every entry is a dict with
            ``"object_size"`` and ``"reference_type"``.

    Returns:
        A ``defaultdict(int)`` mapping summary keys such as
        ``"total_local_ref_count"`` to the summed size of the entries of the
        matching reference type. Keys that never occurred read as 0.
    """
    summary_key_by_type = {
        "LOCAL_REFERENCE": "total_local_ref_count",
        "PINNED_IN_MEMORY": "total_pinned_in_memory",
        "USED_BY_PENDING_TASK": "total_used_by_pending_task",
        "CAPTURED_IN_OBJECT": "total_captured_in_objects",
        "ACTOR_HANDLE": "total_actor_handles",
    }
    totals = defaultdict(int)
    for entry in group["entries"]:
        summary_key = summary_key_by_type.get(entry["reference_type"])
        if summary_key is None:
            # A reference type without a summary column (for example
            # "UNKNOWN_STATUS") has nowhere to be counted; skip it instead
            # of failing with KeyError.
            continue
        size = entry["object_size"]
        # -1 means the size was not recorded.
        totals[summary_key] += 0 if size == -1 else size
    return totals
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def memory_summary(
    state,
    group_by="NODE_ADDRESS",
    sort_by="OBJECT_SIZE",
    line_wrap=True,
    unit="B",
    num_entries=None,
) -> str:
    """Render a text report of the object references held across the cluster.

    Args:
        state: object whose ``node_table()`` yields node dicts with the keys
            ``"Alive"``, ``"NodeManagerAddress"`` and ``"NodeManagerPort"``.
        group_by: name accepted by ``get_group_by_type``.
        sort_by: name accepted by ``get_sorting_type``.
        line_wrap: when true and the terminal is wider than 137 columns, a
            wide layout is used in which long call sites and task statuses
            wrap onto extra lines.
        unit: "B", "KB", "MB" or "GB" (decimal multiples).
        num_entries: maximum number of entries printed per group, or None
            for all of them.

    Returns:
        The report as a single string.

    Nodes that are not alive, or whose stats request raises RuntimeError,
    are skipped.
    """
    # Get terminal size
    import shutil

    from ray.dashboard.modules.node.node_head import node_stats_to_dict

    def split_into_chunks(text, width):
        # Consecutive slices of `text`, each at most `width` characters long.
        return [text[i : i + width] for i in range(0, len(text), width)]

    size = shutil.get_terminal_size((80, 20)).columns
    line_wrap_threshold = 137
    use_wide_layout = size > line_wrap_threshold and line_wrap

    # Unit conversions
    units = {"B": 10**0, "KB": 10**3, "MB": 10**6, "GB": 10**9}

    # Fetch core memory worker stats, store as a dictionary
    core_worker_stats = []
    for raylet in state.node_table():
        if not raylet["Alive"]:
            continue
        try:
            stats = node_stats_to_dict(
                node_stats(raylet["NodeManagerAddress"], raylet["NodeManagerPort"])
            )
        except RuntimeError:
            continue
        # Check the shape BEFORE indexing into it; checking afterwards could
        # never fire for a missing key.
        assert type(stats) is dict and "coreWorkersStats" in stats
        core_worker_stats.extend(stats["coreWorkersStats"])

    # Build memory table with "group_by" and "sort_by" parameters
    group_by, sort_by = get_group_by_type(group_by), get_sorting_type(sort_by)
    memory_table = construct_memory_table(
        core_worker_stats, group_by, sort_by
    ).as_dict()
    assert "summary" in memory_table and "group" in memory_table

    # From here on group_by / sort_by are human-readable labels.
    group_by = group_by.name.lower().replace("_", " ")
    sort_by = sort_by.name.lower().replace("_", " ")

    summary_labels = [
        "Mem Used by Objects",
        "Local References",
        "Pinned",
        "Used by task",
        "Captured in Objects",
        "Actor Handles",
    ]
    summary_string = "{:<19} {:<16} {:<12} {:<13} {:<19} {:<13}\n"

    object_ref_labels = [
        "IP Address",
        "PID",
        "Type",
        "Call Site",
        "Status",
        "Attampt",
        "Size",
        "Reference Type",
        "Object Ref",
    ]
    if use_wide_layout:
        object_ref_string = (
            "{:<15} {:<5} {:<6} {:<22} {:<14} {:<8} {:<6} " "{:<18} {:<56}\n"
        )
    else:
        object_ref_string = (
            "{:<13} | {:<8} | {:<7} | {:<9} "
            "| {:<9} | {:<8} | {:<8} | {:<14} | {:<10}\n"
        )

    # Pieces of the report; joined once at the end instead of repeated
    # string concatenation.
    out = []

    # Adjacent literals with explicit spaces: a backslash continuation
    # inside the f-string would make the output depend on how the source
    # lines happen to be indented.
    shown = num_entries if num_entries is not None else "all"
    out.append(
        f"Grouping by {group_by}... "
        f"Sorting by {sort_by}... "
        f"Display {shown} entries per group...\n\n\n"
    )

    for key, group in memory_table["group"].items():
        # Group summary
        summary = group["summary"]
        ref_size = track_reference_size(group)
        unit_size = units[unit]
        # Only values are replaced, so mutating while iterating is safe.
        for k, v in summary.items():
            if k == "total_object_size":
                summary[k] = str(v / unit_size) + f" {unit}"
            else:
                summary[k] = str(v) + f", ({ref_size[k] / unit_size} {unit})"
        out.append(f"--- Summary for {group_by}: {key} ---\n")
        out.append(summary_string.format(*summary_labels))
        out.append(summary_string.format(*summary.values()) + "\n")

        # Memory table per group
        out.append(f"--- Object references for {group_by}: {key} ---\n")
        out.append(object_ref_string.format(*object_ref_labels))
        for n, entry in enumerate(group["entries"], start=1):
            if num_entries is not None and n > num_entries:
                break
            entry["object_size"] = (
                str(entry["object_size"] / unit_size) + f" {unit}"
                if entry["object_size"] > -1
                else "?"
            )
            num_lines = 1
            if use_wide_layout:
                call_site_length = 22
                if len(entry["call_site"]) == 0:
                    entry["call_site"] = ["disabled"]
                else:
                    entry["call_site"] = split_into_chunks(
                        entry["call_site"], call_site_length
                    )

                task_status_length = 12
                entry["task_status"] = split_into_chunks(
                    entry["task_status"], task_status_length
                )
                num_lines = max(len(entry["call_site"]), len(entry["task_status"]))
            else:
                out.append("\n")

            # One list of cells per column, padded with "" so that every
            # column has num_lines cells.
            columns = []
            for value in (
                entry["node_ip_address"],
                entry["pid"],
                entry["type"],
                entry["call_site"],
                entry["task_status"],
                entry["attempt_number"],
                entry["object_size"],
                entry["reference_type"],
                entry["object_ref"],
            ):
                cells = value if isinstance(value, list) else [value]
                cells.extend([""] * (num_lines - len(cells)))
                columns.append(cells)
            for line_no in range(num_lines):
                row = [cells[line_no] for cells in columns]
                out.append(object_ref_string.format(*row))
        out.append("\n")

    out.append(
        "To record callsite information for each ObjectRef created, set "
        "env variable RAY_record_ref_creation_sites=1\n\n"
    )

    return "".join(out)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/dashboard_sdk.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
import importlib
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
import ssl
|
| 7 |
+
import tempfile
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Dict, List, Optional, Union
|
| 10 |
+
|
| 11 |
+
import packaging.version
|
| 12 |
+
import yaml
|
| 13 |
+
|
| 14 |
+
import ray
|
| 15 |
+
from ray._private.runtime_env.packaging import (
|
| 16 |
+
create_package,
|
| 17 |
+
get_uri_for_directory,
|
| 18 |
+
get_uri_for_package,
|
| 19 |
+
)
|
| 20 |
+
from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
|
| 21 |
+
from ray._private.runtime_env.working_dir import upload_working_dir_if_needed
|
| 22 |
+
from ray._private.utils import split_address
|
| 23 |
+
from ray.autoscaler._private.cli_logger import cli_logger
|
| 24 |
+
from ray.dashboard.modules.job.common import uri_to_http_components
|
| 25 |
+
from ray.util.annotations import DeveloperAPI, PublicAPI
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
import requests
|
| 29 |
+
except ImportError:
|
| 30 |
+
requests = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
logger.setLevel(logging.INFO)
|
| 35 |
+
|
| 36 |
+
# By default, connect to local cluster.
|
| 37 |
+
DEFAULT_DASHBOARD_ADDRESS = "http://localhost:8265"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def parse_runtime_env_args(
|
| 41 |
+
runtime_env: Optional[str] = None,
|
| 42 |
+
runtime_env_json: Optional[str] = None,
|
| 43 |
+
working_dir: Optional[str] = None,
|
| 44 |
+
):
|
| 45 |
+
"""
|
| 46 |
+
Generates a runtime_env dictionary using `runtime_env`, `runtime_env_json`,
|
| 47 |
+
and `working_dir` CLI options. Only one of `runtime_env` or
|
| 48 |
+
`runtime_env_json` may be defined. `working_dir` overwrites the
|
| 49 |
+
`working_dir` from any other option.
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
final_runtime_env = {}
|
| 53 |
+
if runtime_env is not None:
|
| 54 |
+
if runtime_env_json is not None:
|
| 55 |
+
raise ValueError(
|
| 56 |
+
"Only one of --runtime_env and --runtime-env-json can be provided."
|
| 57 |
+
)
|
| 58 |
+
with open(runtime_env, "r") as f:
|
| 59 |
+
final_runtime_env = yaml.safe_load(f)
|
| 60 |
+
|
| 61 |
+
elif runtime_env_json is not None:
|
| 62 |
+
final_runtime_env = json.loads(runtime_env_json)
|
| 63 |
+
|
| 64 |
+
if working_dir is not None:
|
| 65 |
+
if "working_dir" in final_runtime_env:
|
| 66 |
+
cli_logger.warning(
|
| 67 |
+
"Overriding runtime_env working_dir with --working-dir option"
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
final_runtime_env["working_dir"] = working_dir
|
| 71 |
+
|
| 72 |
+
return final_runtime_env
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@dataclasses.dataclass
|
| 76 |
+
class ClusterInfo:
|
| 77 |
+
address: str
|
| 78 |
+
cookies: Optional[Dict[str, Any]] = None
|
| 79 |
+
metadata: Optional[Dict[str, Any]] = None
|
| 80 |
+
headers: Optional[Dict[str, Any]] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# TODO (shrekris-anyscale): renaming breaks compatibility, do NOT rename
|
| 84 |
+
def get_job_submission_client_cluster_info(
|
| 85 |
+
address: str,
|
| 86 |
+
# For backwards compatibility
|
| 87 |
+
*,
|
| 88 |
+
# only used in importlib case in parse_cluster_info, but needed
|
| 89 |
+
# in function signature.
|
| 90 |
+
create_cluster_if_needed: Optional[bool] = False,
|
| 91 |
+
cookies: Optional[Dict[str, Any]] = None,
|
| 92 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 93 |
+
headers: Optional[Dict[str, Any]] = None,
|
| 94 |
+
_use_tls: Optional[bool] = False,
|
| 95 |
+
) -> ClusterInfo:
|
| 96 |
+
"""Get address, cookies, and metadata used for SubmissionClient.
|
| 97 |
+
|
| 98 |
+
If no port is specified in `address`, the Ray dashboard default will be
|
| 99 |
+
inserted.
|
| 100 |
+
|
| 101 |
+
Args:
|
| 102 |
+
address: Address without the module prefix that is passed
|
| 103 |
+
to SubmissionClient.
|
| 104 |
+
create_cluster_if_needed: Indicates whether the cluster
|
| 105 |
+
of the address returned needs to be running. Ray doesn't
|
| 106 |
+
start a cluster before interacting with jobs, but other
|
| 107 |
+
implementations may do so.
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
ClusterInfo object consisting of address, cookies, and metadata
|
| 111 |
+
for SubmissionClient to use.
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
scheme = "https" if _use_tls else "http"
|
| 115 |
+
return ClusterInfo(
|
| 116 |
+
address=f"{scheme}://{address}",
|
| 117 |
+
cookies=cookies,
|
| 118 |
+
metadata=metadata,
|
| 119 |
+
headers=headers,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def parse_cluster_info(
|
| 124 |
+
address: Optional[str] = None,
|
| 125 |
+
create_cluster_if_needed: bool = False,
|
| 126 |
+
cookies: Optional[Dict[str, Any]] = None,
|
| 127 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 128 |
+
headers: Optional[Dict[str, Any]] = None,
|
| 129 |
+
) -> ClusterInfo:
|
| 130 |
+
"""Create a cluster if needed and return its address, cookies, and metadata."""
|
| 131 |
+
if address is None:
|
| 132 |
+
if (
|
| 133 |
+
ray.is_initialized()
|
| 134 |
+
and ray._private.worker.global_worker.node.address_info["webui_url"]
|
| 135 |
+
is not None
|
| 136 |
+
):
|
| 137 |
+
address = (
|
| 138 |
+
"http://"
|
| 139 |
+
f"{ray._private.worker.global_worker.node.address_info['webui_url']}"
|
| 140 |
+
)
|
| 141 |
+
logger.info(
|
| 142 |
+
f"No address provided but Ray is running; using address {address}."
|
| 143 |
+
)
|
| 144 |
+
else:
|
| 145 |
+
logger.info(
|
| 146 |
+
f"No address provided, defaulting to {DEFAULT_DASHBOARD_ADDRESS}."
|
| 147 |
+
)
|
| 148 |
+
address = DEFAULT_DASHBOARD_ADDRESS
|
| 149 |
+
|
| 150 |
+
if address == "auto":
|
| 151 |
+
raise ValueError("Internal error: unexpected address 'auto'.")
|
| 152 |
+
|
| 153 |
+
if "://" not in address:
|
| 154 |
+
# Default to HTTP.
|
| 155 |
+
logger.info(
|
| 156 |
+
"No scheme (e.g. 'http://') or module string (e.g. 'ray://') "
|
| 157 |
+
f"provided in address {address}, defaulting to HTTP."
|
| 158 |
+
)
|
| 159 |
+
address = f"http://{address}"
|
| 160 |
+
|
| 161 |
+
module_string, inner_address = split_address(address)
|
| 162 |
+
|
| 163 |
+
if module_string == "ray":
|
| 164 |
+
raise ValueError(f"Internal error: unexpected Ray Client address {address}.")
|
| 165 |
+
# If user passes http(s)://, go through normal parsing.
|
| 166 |
+
if module_string in {"http", "https"}:
|
| 167 |
+
return get_job_submission_client_cluster_info(
|
| 168 |
+
inner_address,
|
| 169 |
+
create_cluster_if_needed=create_cluster_if_needed,
|
| 170 |
+
cookies=cookies,
|
| 171 |
+
metadata=metadata,
|
| 172 |
+
headers=headers,
|
| 173 |
+
_use_tls=(module_string == "https"),
|
| 174 |
+
)
|
| 175 |
+
# Try to dynamically import the function to get cluster info.
|
| 176 |
+
else:
|
| 177 |
+
try:
|
| 178 |
+
module = importlib.import_module(module_string)
|
| 179 |
+
except Exception:
|
| 180 |
+
raise RuntimeError(
|
| 181 |
+
f"Module: {module_string} does not exist.\n"
|
| 182 |
+
f"This module was parsed from address: {address}"
|
| 183 |
+
) from None
|
| 184 |
+
assert "get_job_submission_client_cluster_info" in dir(module), (
|
| 185 |
+
f"Module: {module_string} does "
|
| 186 |
+
"not have `get_job_submission_client_cluster_info`.\n"
|
| 187 |
+
f"This module was parsed from address: {address}"
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
return module.get_job_submission_client_cluster_info(
|
| 191 |
+
inner_address,
|
| 192 |
+
create_cluster_if_needed=create_cluster_if_needed,
|
| 193 |
+
cookies=cookies,
|
| 194 |
+
metadata=metadata,
|
| 195 |
+
headers=headers,
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
class SubmissionClient:
|
| 200 |
+
def __init__(
|
| 201 |
+
self,
|
| 202 |
+
address: Optional[str] = None,
|
| 203 |
+
create_cluster_if_needed: bool = False,
|
| 204 |
+
cookies: Optional[Dict[str, Any]] = None,
|
| 205 |
+
metadata: Optional[Dict[str, Any]] = None,
|
| 206 |
+
headers: Optional[Dict[str, Any]] = None,
|
| 207 |
+
verify: Optional[Union[str, bool]] = True,
|
| 208 |
+
):
|
| 209 |
+
# Remove any trailing slashes
|
| 210 |
+
if address is not None and address.endswith("/"):
|
| 211 |
+
address = address.rstrip("/")
|
| 212 |
+
logger.debug(
|
| 213 |
+
"The submission address cannot contain trailing slashes. Removing "
|
| 214 |
+
f'them from the requested submission address of "{address}".'
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
cluster_info = parse_cluster_info(
|
| 218 |
+
address, create_cluster_if_needed, cookies, metadata, headers
|
| 219 |
+
)
|
| 220 |
+
self._address = cluster_info.address
|
| 221 |
+
self._cookies = cluster_info.cookies
|
| 222 |
+
self._default_metadata = cluster_info.metadata or {}
|
| 223 |
+
# Headers used for all requests sent to job server, optional and only
|
| 224 |
+
# needed for cases like authentication to remote cluster.
|
| 225 |
+
self._headers = cluster_info.headers
|
| 226 |
+
# Set SSL verify parameter for the requests library and create an ssl_context
|
| 227 |
+
# object when needed for the aiohttp library.
|
| 228 |
+
self._verify = verify
|
| 229 |
+
if isinstance(self._verify, str):
|
| 230 |
+
if os.path.isdir(self._verify):
|
| 231 |
+
cafile, capath = None, self._verify
|
| 232 |
+
elif os.path.isfile(self._verify):
|
| 233 |
+
cafile, capath = self._verify, None
|
| 234 |
+
else:
|
| 235 |
+
raise FileNotFoundError(
|
| 236 |
+
f"Path to CA certificates: '{self._verify}', does not exist."
|
| 237 |
+
)
|
| 238 |
+
self._ssl_context = ssl.create_default_context(cafile=cafile, capath=capath)
|
| 239 |
+
else:
|
| 240 |
+
if self._verify is False:
|
| 241 |
+
self._ssl_context = False
|
| 242 |
+
else:
|
| 243 |
+
self._ssl_context = None
|
| 244 |
+
|
| 245 |
+
def _check_connection_and_version(
|
| 246 |
+
self, min_version: str = "1.9", version_error_message: str = None
|
| 247 |
+
):
|
| 248 |
+
self._check_connection_and_version_with_url(min_version, version_error_message)
|
| 249 |
+
|
| 250 |
+
def _check_connection_and_version_with_url(
|
| 251 |
+
self,
|
| 252 |
+
min_version: str = "1.9",
|
| 253 |
+
version_error_message: str = None,
|
| 254 |
+
url: str = "/api/version",
|
| 255 |
+
):
|
| 256 |
+
if version_error_message is None:
|
| 257 |
+
version_error_message = (
|
| 258 |
+
f"Please ensure the cluster is running Ray {min_version} or higher."
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
try:
|
| 262 |
+
r = self._do_request("GET", url)
|
| 263 |
+
if r.status_code == 404:
|
| 264 |
+
raise RuntimeError(
|
| 265 |
+
"Version check returned 404. " + version_error_message
|
| 266 |
+
)
|
| 267 |
+
r.raise_for_status()
|
| 268 |
+
|
| 269 |
+
running_ray_version = r.json()["ray_version"]
|
| 270 |
+
if packaging.version.parse(running_ray_version) < packaging.version.parse(
|
| 271 |
+
min_version
|
| 272 |
+
):
|
| 273 |
+
raise RuntimeError(
|
| 274 |
+
f"Ray version {running_ray_version} is running on the cluster. "
|
| 275 |
+
+ version_error_message
|
| 276 |
+
)
|
| 277 |
+
except requests.exceptions.ConnectionError:
|
| 278 |
+
raise ConnectionError(
|
| 279 |
+
f"Failed to connect to Ray at address: {self._address}."
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
def _raise_error(self, r: "requests.Response"):
|
| 283 |
+
raise RuntimeError(
|
| 284 |
+
f"Request failed with status code {r.status_code}: {r.text}."
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
def _do_request(
|
| 288 |
+
self,
|
| 289 |
+
method: str,
|
| 290 |
+
endpoint: str,
|
| 291 |
+
*,
|
| 292 |
+
data: Optional[bytes] = None,
|
| 293 |
+
json_data: Optional[dict] = None,
|
| 294 |
+
**kwargs,
|
| 295 |
+
) -> "requests.Response":
|
| 296 |
+
"""Perform the actual HTTP request
|
| 297 |
+
|
| 298 |
+
Keyword arguments other than "cookies", "headers" are forwarded to the
|
| 299 |
+
`requests.request()`.
|
| 300 |
+
"""
|
| 301 |
+
url = self._address + endpoint
|
| 302 |
+
logger.debug(f"Sending request to {url} with json data: {json_data or {}}.")
|
| 303 |
+
return requests.request(
|
| 304 |
+
method,
|
| 305 |
+
url,
|
| 306 |
+
cookies=self._cookies,
|
| 307 |
+
data=data,
|
| 308 |
+
json=json_data,
|
| 309 |
+
headers=self._headers,
|
| 310 |
+
verify=self._verify,
|
| 311 |
+
**kwargs,
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
def _package_exists(
|
| 315 |
+
self,
|
| 316 |
+
package_uri: str,
|
| 317 |
+
) -> bool:
|
| 318 |
+
protocol, package_name = uri_to_http_components(package_uri)
|
| 319 |
+
r = self._do_request("GET", f"/api/packages/{protocol}/{package_name}")
|
| 320 |
+
|
| 321 |
+
if r.status_code == 200:
|
| 322 |
+
logger.debug(f"Package {package_uri} already exists.")
|
| 323 |
+
return True
|
| 324 |
+
elif r.status_code == 404:
|
| 325 |
+
logger.debug(f"Package {package_uri} does not exist.")
|
| 326 |
+
return False
|
| 327 |
+
else:
|
| 328 |
+
self._raise_error(r)
|
| 329 |
+
|
| 330 |
+
def _upload_package(
|
| 331 |
+
self,
|
| 332 |
+
package_uri: str,
|
| 333 |
+
package_path: str,
|
| 334 |
+
include_parent_dir: Optional[bool] = False,
|
| 335 |
+
excludes: Optional[List[str]] = None,
|
| 336 |
+
is_file: bool = False,
|
| 337 |
+
) -> bool:
|
| 338 |
+
logger.info(f"Uploading package {package_uri}.")
|
| 339 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
| 340 |
+
protocol, package_name = uri_to_http_components(package_uri)
|
| 341 |
+
if is_file:
|
| 342 |
+
package_file = Path(package_path)
|
| 343 |
+
else:
|
| 344 |
+
package_file = Path(tmp_dir) / package_name
|
| 345 |
+
create_package(
|
| 346 |
+
package_path,
|
| 347 |
+
package_file,
|
| 348 |
+
include_parent_dir=include_parent_dir,
|
| 349 |
+
excludes=excludes,
|
| 350 |
+
)
|
| 351 |
+
try:
|
| 352 |
+
r = self._do_request(
|
| 353 |
+
"PUT",
|
| 354 |
+
f"/api/packages/{protocol}/{package_name}",
|
| 355 |
+
data=package_file.read_bytes(),
|
| 356 |
+
)
|
| 357 |
+
if r.status_code != 200:
|
| 358 |
+
self._raise_error(r)
|
| 359 |
+
finally:
|
| 360 |
+
# If the package is a user's existing file, don't delete it.
|
| 361 |
+
if not is_file:
|
| 362 |
+
package_file.unlink()
|
| 363 |
+
|
| 364 |
+
def _upload_package_if_needed(
|
| 365 |
+
self,
|
| 366 |
+
package_path: str,
|
| 367 |
+
include_parent_dir: bool = False,
|
| 368 |
+
excludes: Optional[List[str]] = None,
|
| 369 |
+
is_file: bool = False,
|
| 370 |
+
) -> str:
|
| 371 |
+
if is_file:
|
| 372 |
+
package_uri = get_uri_for_package(Path(package_path))
|
| 373 |
+
else:
|
| 374 |
+
package_uri = get_uri_for_directory(package_path, excludes=excludes)
|
| 375 |
+
|
| 376 |
+
if not self._package_exists(package_uri):
|
| 377 |
+
self._upload_package(
|
| 378 |
+
package_uri,
|
| 379 |
+
package_path,
|
| 380 |
+
include_parent_dir=include_parent_dir,
|
| 381 |
+
excludes=excludes,
|
| 382 |
+
is_file=is_file,
|
| 383 |
+
)
|
| 384 |
+
else:
|
| 385 |
+
logger.info(f"Package {package_uri} already exists, skipping upload.")
|
| 386 |
+
|
| 387 |
+
return package_uri
|
| 388 |
+
|
| 389 |
+
def _upload_working_dir_if_needed(self, runtime_env: Dict[str, Any]):
|
| 390 |
+
def _upload_fn(working_dir, excludes, is_file=False):
|
| 391 |
+
self._upload_package_if_needed(
|
| 392 |
+
working_dir,
|
| 393 |
+
include_parent_dir=False,
|
| 394 |
+
excludes=excludes,
|
| 395 |
+
is_file=is_file,
|
| 396 |
+
)
|
| 397 |
+
|
| 398 |
+
upload_working_dir_if_needed(runtime_env, upload_fn=_upload_fn)
|
| 399 |
+
|
| 400 |
+
def _upload_py_modules_if_needed(self, runtime_env: Dict[str, Any]):
|
| 401 |
+
def _upload_fn(module_path, excludes, is_file=False):
|
| 402 |
+
self._upload_package_if_needed(
|
| 403 |
+
module_path, include_parent_dir=True, excludes=excludes, is_file=is_file
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
upload_py_modules_if_needed(runtime_env, upload_fn=_upload_fn)
|
| 407 |
+
|
| 408 |
+
@PublicAPI(stability="beta")
|
| 409 |
+
def get_version(self) -> str:
|
| 410 |
+
r = self._do_request("GET", "/api/version")
|
| 411 |
+
if r.status_code == 200:
|
| 412 |
+
return r.json().get("version")
|
| 413 |
+
else:
|
| 414 |
+
self._raise_error(r)
|
| 415 |
+
|
| 416 |
+
@DeveloperAPI
|
| 417 |
+
def get_address(self) -> str:
|
| 418 |
+
return self._address
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (199 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/data_head.cpython-311.pyc
ADDED
|
Binary file (9.36 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/data_head.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from urllib.parse import quote
|
| 6 |
+
|
| 7 |
+
import aiohttp
|
| 8 |
+
from aiohttp.web import Request, Response
|
| 9 |
+
|
| 10 |
+
import ray.dashboard.optional_utils as optional_utils
|
| 11 |
+
import ray.dashboard.utils as dashboard_utils
|
| 12 |
+
from ray.dashboard.modules.metrics.metrics_head import (
|
| 13 |
+
DEFAULT_PROMETHEUS_HEADERS,
|
| 14 |
+
DEFAULT_PROMETHEUS_HOST,
|
| 15 |
+
PROMETHEUS_HEADERS_ENV_VAR,
|
| 16 |
+
PROMETHEUS_HOST_ENV_VAR,
|
| 17 |
+
PrometheusQueryError,
|
| 18 |
+
parse_prom_headers,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
logger.setLevel(logging.INFO)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Window and sampling rate used for certain Prometheus queries.
|
| 26 |
+
# Datapoints up until `MAX_TIME_WINDOW` ago are queried at `SAMPLE_RATE` intervals.
|
| 27 |
+
MAX_TIME_WINDOW = "1h"
|
| 28 |
+
SAMPLE_RATE = "1s"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class PrometheusQuery(Enum):
|
| 32 |
+
"""Enum to store types of Prometheus queries for a given metric and grouping."""
|
| 33 |
+
|
| 34 |
+
VALUE = ("value", "sum({}{{SessionName='{}'}}) by ({})")
|
| 35 |
+
MAX = (
|
| 36 |
+
"max",
|
| 37 |
+
"max_over_time(sum({}{{SessionName='{}'}}) by ({})["
|
| 38 |
+
+ f"{MAX_TIME_WINDOW}:{SAMPLE_RATE}])",
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
DATASET_METRICS = {
|
| 43 |
+
"ray_data_output_rows": (PrometheusQuery.MAX,),
|
| 44 |
+
"ray_data_spilled_bytes": (PrometheusQuery.MAX,),
|
| 45 |
+
"ray_data_current_bytes": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
|
| 46 |
+
"ray_data_cpu_usage_cores": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
|
| 47 |
+
"ray_data_gpu_usage_cores": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class DataHead(dashboard_utils.DashboardHeadModule):
|
| 52 |
+
def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
|
| 53 |
+
super().__init__(config)
|
| 54 |
+
self.prometheus_host = os.environ.get(
|
| 55 |
+
PROMETHEUS_HOST_ENV_VAR, DEFAULT_PROMETHEUS_HOST
|
| 56 |
+
)
|
| 57 |
+
self.prometheus_headers = parse_prom_headers(
|
| 58 |
+
os.environ.get(
|
| 59 |
+
PROMETHEUS_HEADERS_ENV_VAR,
|
| 60 |
+
DEFAULT_PROMETHEUS_HEADERS,
|
| 61 |
+
)
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
@optional_utils.DashboardHeadRouteTable.get("/api/data/datasets/{job_id}")
|
| 65 |
+
@optional_utils.init_ray_and_catch_exceptions()
|
| 66 |
+
async def get_datasets(self, req: Request) -> Response:
|
| 67 |
+
job_id = req.match_info["job_id"]
|
| 68 |
+
|
| 69 |
+
try:
|
| 70 |
+
from ray.data._internal.stats import _get_or_create_stats_actor
|
| 71 |
+
|
| 72 |
+
_stats_actor = _get_or_create_stats_actor()
|
| 73 |
+
datasets = await _stats_actor.get_datasets.remote(job_id)
|
| 74 |
+
# Initializes dataset metric values
|
| 75 |
+
for dataset in datasets:
|
| 76 |
+
for metric, queries in DATASET_METRICS.items():
|
| 77 |
+
datasets[dataset][metric] = {query.value[0]: 0 for query in queries}
|
| 78 |
+
for operator in datasets[dataset]["operators"]:
|
| 79 |
+
datasets[dataset]["operators"][operator][metric] = {
|
| 80 |
+
query.value[0]: 0 for query in queries
|
| 81 |
+
}
|
| 82 |
+
# Query dataset metric values from prometheus
|
| 83 |
+
try:
|
| 84 |
+
# TODO (Zandew): store results of completed datasets in stats actor.
|
| 85 |
+
for metric, queries in DATASET_METRICS.items():
|
| 86 |
+
for query in queries:
|
| 87 |
+
query_name, prom_query = query.value
|
| 88 |
+
# Dataset level
|
| 89 |
+
dataset_result = await self._query_prometheus(
|
| 90 |
+
prom_query.format(metric, self.session_name, "dataset")
|
| 91 |
+
)
|
| 92 |
+
for res in dataset_result["data"]["result"]:
|
| 93 |
+
dataset, value = res["metric"]["dataset"], res["value"][1]
|
| 94 |
+
if dataset in datasets:
|
| 95 |
+
datasets[dataset][metric][query_name] = value
|
| 96 |
+
|
| 97 |
+
# Operator level
|
| 98 |
+
operator_result = await self._query_prometheus(
|
| 99 |
+
prom_query.format(
|
| 100 |
+
metric, self.session_name, "dataset, operator"
|
| 101 |
+
)
|
| 102 |
+
)
|
| 103 |
+
for res in operator_result["data"]["result"]:
|
| 104 |
+
dataset, operator, value = (
|
| 105 |
+
res["metric"]["dataset"],
|
| 106 |
+
res["metric"]["operator"],
|
| 107 |
+
res["value"][1],
|
| 108 |
+
)
|
| 109 |
+
# Check if dataset/operator is in current _StatsActor scope.
|
| 110 |
+
# Prometheus server may contain metrics from previous
|
| 111 |
+
# cluster if not reset.
|
| 112 |
+
if (
|
| 113 |
+
dataset in datasets
|
| 114 |
+
and operator in datasets[dataset]["operators"]
|
| 115 |
+
):
|
| 116 |
+
datasets[dataset]["operators"][operator][metric][
|
| 117 |
+
query_name
|
| 118 |
+
] = value
|
| 119 |
+
except aiohttp.client_exceptions.ClientConnectorError:
|
| 120 |
+
# Prometheus server may not be running,
|
| 121 |
+
# leave these values blank and return other data
|
| 122 |
+
logging.exception(
|
| 123 |
+
"Exception occurred while querying Prometheus. "
|
| 124 |
+
"The Prometheus server may not be running."
|
| 125 |
+
)
|
| 126 |
+
# Flatten response
|
| 127 |
+
for dataset in datasets:
|
| 128 |
+
datasets[dataset]["operators"] = list(
|
| 129 |
+
map(
|
| 130 |
+
lambda item: {"operator": item[0], **item[1]},
|
| 131 |
+
datasets[dataset]["operators"].items(),
|
| 132 |
+
)
|
| 133 |
+
)
|
| 134 |
+
datasets = list(
|
| 135 |
+
map(lambda item: {"dataset": item[0], **item[1]}, datasets.items())
|
| 136 |
+
)
|
| 137 |
+
# Sort by descending start time
|
| 138 |
+
datasets = sorted(datasets, key=lambda x: x["start_time"], reverse=True)
|
| 139 |
+
return Response(
|
| 140 |
+
text=json.dumps({"datasets": datasets}),
|
| 141 |
+
content_type="application/json",
|
| 142 |
+
)
|
| 143 |
+
except Exception as e:
|
| 144 |
+
logging.exception("Exception occured while getting datasets.")
|
| 145 |
+
return Response(
|
| 146 |
+
status=503,
|
| 147 |
+
text=str(e),
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
async def run(self, server):
|
| 151 |
+
pass
|
| 152 |
+
|
| 153 |
+
@staticmethod
|
| 154 |
+
def is_minimal_module():
|
| 155 |
+
return False
|
| 156 |
+
|
| 157 |
+
async def _query_prometheus(self, query):
|
| 158 |
+
async with self.http_session.get(
|
| 159 |
+
f"{self.prometheus_host}/api/v1/query?query={quote(query)}",
|
| 160 |
+
headers=self.prometheus_headers,
|
| 161 |
+
) as resp:
|
| 162 |
+
if resp.status == 200:
|
| 163 |
+
prom_data = await resp.json()
|
| 164 |
+
return prom_data
|
| 165 |
+
|
| 166 |
+
message = await resp.text()
|
| 167 |
+
raise PrometheusQueryError(resp.status, message)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli.cpython-311.pyc
ADDED
|
Binary file (20.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli_utils.cpython-311.pyc
ADDED
|
Binary file (2.57 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_agent.cpython-311.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_head.cpython-311.pyc
ADDED
|
Binary file (32.1 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_manager.cpython-311.pyc
ADDED
|
Binary file (28.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_supervisor.cpython-311.pyc
ADDED
|
Binary file (22.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/pydantic_models.cpython-311.pyc
ADDED
|
Binary file (5.29 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli.py
ADDED
|
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import pprint
|
| 4 |
+
import sys
|
| 5 |
+
import time
|
| 6 |
+
from subprocess import list2cmdline
|
| 7 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
| 8 |
+
|
| 9 |
+
import click
|
| 10 |
+
|
| 11 |
+
import ray._private.ray_constants as ray_constants
|
| 12 |
+
from ray._private.storage import _load_class
|
| 13 |
+
from ray._private.utils import (
|
| 14 |
+
get_or_create_event_loop,
|
| 15 |
+
parse_metadata_json,
|
| 16 |
+
parse_resources_json,
|
| 17 |
+
)
|
| 18 |
+
from ray.autoscaler._private.cli_logger import add_click_logging_options, cf, cli_logger
|
| 19 |
+
from ray.dashboard.modules.dashboard_sdk import parse_runtime_env_args
|
| 20 |
+
from ray.dashboard.modules.job.cli_utils import add_common_job_options
|
| 21 |
+
from ray.dashboard.modules.job.utils import redact_url_password
|
| 22 |
+
from ray.job_submission import JobStatus, JobSubmissionClient
|
| 23 |
+
from ray.util.annotations import PublicAPI
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _get_sdk_client(
    address: Optional[str],
    create_cluster_if_needed: bool = False,
    headers: Optional[str] = None,
    verify: Union[bool, str] = True,
) -> JobSubmissionClient:
    """Build a ``JobSubmissionClient`` and log the address it reports.

    Args:
        address: Passed to the client unchanged as its first argument.
        create_cluster_if_needed: Passed to the client unchanged as its
            second positional argument.
        headers: JSON-encoded headers string, decoded by
            ``_handle_headers`` before being handed to the client.
        verify: Passed to the client as its ``verify`` keyword argument.

    Returns:
        The newly constructed client.
    """
    parsed_headers = _handle_headers(headers)
    sdk_client = JobSubmissionClient(
        address,
        create_cluster_if_needed,
        headers=parsed_headers,
        verify=verify,
    )
    # The address goes through redact_url_password before it is logged.
    redacted_address = redact_url_password(sdk_client.get_address())
    cli_logger.labeled_value("Job submission server address", redacted_address)
    return sdk_client
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _handle_headers(headers: Optional[str]) -> Optional[Dict[str, Any]]:
|
| 46 |
+
if headers is None and "RAY_JOB_HEADERS" in os.environ:
|
| 47 |
+
headers = os.environ["RAY_JOB_HEADERS"]
|
| 48 |
+
if headers is not None:
|
| 49 |
+
try:
|
| 50 |
+
return json.loads(headers)
|
| 51 |
+
except Exception as exc:
|
| 52 |
+
raise ValueError(
|
| 53 |
+
"""Failed to parse headers into JSON.
|
| 54 |
+
Expected format: {{"KEY": "VALUE"}}, got {}, {}""".format(
|
| 55 |
+
headers, exc
|
| 56 |
+
)
|
| 57 |
+
)
|
| 58 |
+
return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _log_big_success_msg(success_msg):
    """Log ``success_msg`` at success level, framed by dashed rules.

    The rules are as long as the message, and a blank line is emitted before
    and after the banner.
    """
    rule = "-" * len(success_msg)
    cli_logger.newline()
    for banner_line in (rule, success_msg, rule):
        cli_logger.success(banner_line)
    cli_logger.newline()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _log_big_error_msg(success_msg):
    """Log a message at error level, framed by dashed rules.

    Despite its name, ``success_msg`` carries the error text to display. The
    rules are as long as the message, and a blank line is emitted before and
    after the banner.
    """
    rule = "-" * len(success_msg)
    cli_logger.newline()
    for banner_line in (rule, success_msg, rule):
        cli_logger.error(banner_line)
    cli_logger.newline()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _log_job_status(client: JobSubmissionClient, job_id: str) -> JobStatus:
    """Fetch a job's info, log a line describing its status, and return it.

    Succeeded and failed jobs get a banner, stopped jobs a warning, and any
    other status a plain status line. The status message, when present, is
    printed only for failed jobs and for the catch-all case.

    Args:
        client: Client used to look up the job.
        job_id: Identifier of the job to report on.

    Returns:
        The status found on the fetched job info.
    """
    job_info = client.get_job_info(job_id)
    current = job_info.status

    if current == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return current
    if current == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return current

    if current == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Catch-all.
        cli_logger.print(f"Status for job '{job_id}': {current}")

    if job_info.message is not None:
        # no_format: the message is printed without cli_logger formatting.
        cli_logger.print(f"Status message: {job_info.message}", no_format=True)
    return current
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
async def _tail_logs(client: JobSubmissionClient, job_id: str) -> JobStatus:
    """Print the job's log stream to stdout, then log and return its status.

    Args:
        client: Client whose ``tail_job_logs`` stream is consumed.
        job_id: Identifier of the job to follow.

    Returns:
        The status reported by ``_log_job_status`` once the stream ends.
    """
    log_stream = client.tail_job_logs(job_id)
    async for chunk in log_stream:
        # Chunks are written as received; no separator is appended.
        print(chunk, end="")

    return _log_job_status(client, job_id)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@click.group("job")
def job_cli_group():
    """Submit, stop, delete, or list Ray jobs."""
    # Intentionally empty: this group only hosts the subcommands registered
    # with ``@job_cli_group.command()``.
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.option(
    "--job-id",
    type=str,
    default=None,
    required=False,
    help="DEPRECATED: Use `--submission-id` instead.",
)
@click.option(
    "--submission-id",
    type=str,
    default=None,
    required=False,
    help=(
        "Submission ID to specify for the job. "
        "If not provided, one will be generated."
    ),
)
@click.option(
    "--runtime-env",
    type=str,
    default=None,
    required=False,
    help="Path to a local YAML file containing a runtime_env definition.",
)
@click.option(
    "--runtime-env-json",
    type=str,
    default=None,
    required=False,
    help="JSON-serialized runtime_env dictionary.",
)
@click.option(
    "--working-dir",
    type=str,
    default=None,
    required=False,
    help=(
        "Directory containing files that your job will run in. Can be a "
        "local directory or a remote URI to a .zip file (S3, GS, HTTP). "
        "If specified, this overrides the option in `--runtime-env`."
    ),
)
@click.option(
    "--metadata-json",
    type=str,
    default=None,
    required=False,
    help="JSON-serialized dictionary of metadata to attach to the job.",
)
@click.option(
    "--entrypoint-num-cpus",
    required=False,
    type=float,
    help=(
        "the quantity of CPU cores to reserve for the entrypoint command, "
        "separately from any tasks or actors that are launched by it"
    ),
)
@click.option(
    "--entrypoint-num-gpus",
    required=False,
    type=float,
    help=(
        "the quantity of GPUs to reserve for the entrypoint command, "
        "separately from any tasks or actors that are launched by it"
    ),
)
@click.option(
    "--entrypoint-memory",
    required=False,
    type=int,
    help=(
        "the amount of memory to reserve "
        "for the entrypoint command, separately from any tasks or actors that are "
        "launched by it"
    ),
)
@click.option(
    "--entrypoint-resources",
    required=False,
    type=str,
    help=(
        "a JSON-serialized dictionary mapping resource name to resource quantity "
        "describing resources to reserve for the entrypoint command, "
        "separately from any tasks or actors that are launched by it"
    ),
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not stream logs and wait for the job to exit.",
)
@add_common_job_options
@add_click_logging_options
@click.argument("entrypoint", nargs=-1, required=True, type=click.UNPROCESSED)
@PublicAPI
def submit(
    address: Optional[str],
    job_id: Optional[str],
    submission_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    metadata_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str],
    entrypoint_num_cpus: Optional[Union[int, float]],
    entrypoint_num_gpus: Optional[Union[int, float]],
    entrypoint_memory: Optional[int],
    entrypoint_resources: Optional[str],
    no_wait: bool,
    verify: Union[bool, str],
    headers: Optional[str],
):
    """Submits a job to be run on the cluster.

    By default (if --no-wait is not set), streams logs to stdout until the job finishes.
    If the job succeeded, exits with 0. If it failed, exits with 1.

    Example:
        `ray job submit -- python my_script.py --arg=val`
    """
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so implementation notes live in comments instead.
    if job_id:
        cli_logger.warning(
            "--job-id option is deprecated. Please use --submission-id instead."
        )

    # JSON-valued options are decoded in place; the parsed objects are what
    # the hook and the client receive below.
    if entrypoint_resources is not None:
        entrypoint_resources = parse_resources_json(
            entrypoint_resources, cli_logger, cf, command_arg="entrypoint-resources"
        )
    if metadata_json is not None:
        metadata_json = parse_metadata_json(
            metadata_json, cli_logger, cf, command_arg="metadata-json"
        )

    # The deprecated --job-id is only used when --submission-id is absent.
    submission_id = submission_id or job_id

    # Reservation arguments are forwarded identically to the hook and to the
    # job submission call.
    entrypoint_reservations = dict(
        entrypoint_num_cpus=entrypoint_num_cpus,
        entrypoint_num_gpus=entrypoint_num_gpus,
        entrypoint_memory=entrypoint_memory,
        entrypoint_resources=entrypoint_resources,
    )

    hook_path = os.environ.get(ray_constants.RAY_JOB_SUBMIT_HOOK)
    if hook_path is not None:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        submit_hook = _load_class(hook_path)
        submit_hook(
            address=address,
            job_id=submission_id,
            submission_id=submission_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            metadata_json=metadata_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            **entrypoint_reservations,
            no_wait=no_wait,
        )

    client = _get_sdk_client(
        address, create_cluster_if_needed=True, headers=headers, verify=verify
    )

    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )
    # From here on, job_id holds the identifier returned by the submission.
    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        submission_id=submission_id,
        runtime_env=final_runtime_env,
        metadata=metadata_json,
        **entrypoint_reservations,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    next_steps = (
        ("Query the logs of the job:", f"ray job logs {job_id}"),
        ("Query the status of the job:", f"ray job status {job_id}"),
        ("Request the job to be stopped:", f"ray job stop {job_id}"),
    )
    with cli_logger.group("Next steps"):
        for description, command in next_steps:
            cli_logger.print(description)
            with cli_logger.indented():
                cli_logger.print(cf.bold(command))

    cli_logger.newline()
    sdk_version = client.get_version()
    if no_wait:
        return

    # sdk version 0 does not have log streaming
    if int(sdk_version) <= 0:
        cli_logger.warning(
            "Tailing logs is not enabled for job sdk client version "
            f"{sdk_version}. Please upgrade Ray to the latest version "
            "for this feature."
        )
        return

    cli_logger.print("Tailing logs until the job exits (disable with --no-wait):")
    job_status = get_or_create_event_loop().run_until_complete(
        _tail_logs(client, job_id)
    )
    if job_status == JobStatus.FAILED:
        sys.exit(1)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the `RAY_ADDRESS` environment variable."
    ),
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def status(
    address: Optional[str],
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Queries for the current status of a job.

    Example:
        `ray job status <my_job_id>`
    """
    # The status returned by _log_job_status is not used here; the command
    # only logs it.
    sdk_client = _get_sdk_client(address, headers=headers, verify=verify)
    _log_job_status(sdk_client, job_id)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the `RAY_ADDRESS` environment variable."
    ),
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not wait for the job to exit.",
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def stop(
    address: Optional[str],
    no_wait: bool,
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Attempts to stop a job.

    Example:
        `ray job stop <my_job_id>`
    """
    sdk_client = _get_sdk_client(address, headers=headers, verify=verify)
    cli_logger.print(f"Attempting to stop job '{job_id}'")
    sdk_client.stop_job(job_id)

    if no_wait:
        return

    cli_logger.print(f"Waiting for job '{job_id}' to exit (disable with --no-wait):")

    # Poll once per second until the job reaches one of these statuses; there
    # is no timeout.
    exited_statuses = {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED}
    current = sdk_client.get_job_status(job_id)
    while current not in exited_statuses:
        cli_logger.print(f"Job has not exited yet. Status: {current}")
        time.sleep(1)
        current = sdk_client.get_job_status(job_id)
    _log_job_status(sdk_client, job_id)
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def delete(
    address: Optional[str],
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Deletes a stopped job and its associated data from memory.

    Only supported for jobs that are already in a terminal state.
    Fails with exit code 1 if the job is not already stopped.
    Does not delete job logs from disk.
    Submitting a job with the same submission ID as a previously
    deleted job is not supported and may lead to unexpected behavior.

    Example:
        ray job delete <my_job_id>
    """
    sdk_client = _get_sdk_client(address, headers=headers, verify=verify)
    # The confirmation below is only reached if delete_job returns normally.
    sdk_client.delete_job(job_id)
    cli_logger.print(f"Job '{job_id}' deleted successfully")
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@click.option(
    "-f",
    "--follow",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, follow the logs (like `tail -f`).",
)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def logs(
    address: Optional[str],
    job_id: str,
    follow: bool,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Gets the logs of a job.

    Example:
        `ray job logs <my_job_id>`
    """
    sdk_client = _get_sdk_client(address, headers=headers, verify=verify)
    sdk_version = sdk_client.get_version()

    if not follow:
        # Set no_format to True because the logs may have unescaped "{" and "}"
        # and the CLILogger calls str.format().
        cli_logger.print(sdk_client.get_job_logs(job_id), end="", no_format=True)
        return

    # sdk version 0 did not have log streaming
    if int(sdk_version) > 0:
        get_or_create_event_loop().run_until_complete(_tail_logs(sdk_client, job_id))
        return

    cli_logger.warning(
        "Tailing logs is not enabled for the Jobs SDK client version "
        f"{sdk_version}. Please upgrade Ray to latest version "
        "for this feature."
    )
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def list(address: Optional[str], headers: Optional[str], verify: Union[bool, str]):
    """Lists all running jobs and their information.

    Example:
        `ray job list`
    """
    sdk_client = _get_sdk_client(address, headers=headers, verify=verify)
    formatted_jobs = pprint.pformat(sdk_client.list_jobs())
    # Set no_format to True because the output may have unescaped "{" and "}"
    # and the CLILogger calls str.format().
    cli_logger.print(formatted_jobs, no_format=True)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli_utils.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
from typing import Union
|
| 3 |
+
|
| 4 |
+
import click
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def bool_cast(string: str) -> Union[bool, str]:
    """Turn a boolean-looking string into a bool; return anything else as-is.

    ``"true"``/``"false"`` are matched case-insensitively and ``"1"``/``"0"``
    exactly.

    Args:
        string: The text to interpret.

    Returns:
        ``True`` or ``False`` for recognised spellings, otherwise ``string``
        unchanged.
    """
    lowered = string.lower()
    if lowered == "true" or string == "1":
        return True
    if lowered == "false" or string == "0":
        return False
    return string
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class BoolOrStringParam(click.ParamType):
    """Click parameter type whose value is either a boolean or free text."""

    name = "BOOL | TEXT"

    def convert(self, value, param, ctx):
        # Real booleans pass through untouched; every other value is
        # interpreted by bool_cast.
        return value if isinstance(value, bool) else bool_cast(value)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def add_common_job_options(func):
    """Decorator for adding CLI flags shared by all `ray job` commands.

    Adds ``--verify`` (boolean or text, default ``True``) and ``--headers``
    (a string, default ``None``) to the decorated command.

    Args:
        func: The command callback to decorate. It is called with whatever
            arguments the wrapper receives.

    Returns:
        A wrapper around ``func`` carrying the two extra click options.
    """

    @click.option(
        "--verify",
        default=True,
        show_default=True,
        type=BoolOrStringParam(),
        help=(
            "Boolean indication to verify the server's TLS certificate or a path to"
            " a file or directory of trusted certificates."
        ),
    )
    @click.option(
        "--headers",
        required=False,
        type=str,
        default=None,
        # The two literals are concatenated, so the first must end with a
        # space to keep the sentences apart in the rendered help.
        help=(
            "Used to pass headers through http/s to the Ray Cluster. "
            'Please follow JSON formatting: {"key": "value"}'
        ),
    )
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    return wrapper
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/common.py
ADDED
|
@@ -0,0 +1,538 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import time
|
| 5 |
+
from dataclasses import asdict, dataclass, replace
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
| 9 |
+
|
| 10 |
+
from ray._private import ray_constants
|
| 11 |
+
from ray._private.event.export_event_logger import (
|
| 12 |
+
check_export_api_enabled,
|
| 13 |
+
get_export_event_logger,
|
| 14 |
+
)
|
| 15 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 16 |
+
from ray._private.runtime_env.packaging import parse_uri
|
| 17 |
+
from ray.core.generated.export_event_pb2 import ExportEvent
|
| 18 |
+
from ray.core.generated.export_submission_job_event_pb2 import (
|
| 19 |
+
ExportSubmissionJobEventData,
|
| 20 |
+
)
|
| 21 |
+
from ray.util.annotations import PublicAPI
|
| 22 |
+
|
| 23 |
+
# NOTE(edoakes): these constants should be considered a public API because
# they're exposed in the snapshot API.
JOB_ID_METADATA_KEY = "job_submission_id"
JOB_NAME_METADATA_KEY = "job_name"
# The doubled braces leave a literal "{job_id}" placeholder in the template.
JOB_ACTOR_NAME_TEMPLATE = (
    f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_actor_{{job_id}}"
)
# In order to get information about SupervisorActors launched by different jobs,
# they must be set to the same namespace.
SUPERVISOR_ACTOR_RAY_NAMESPACE = "SUPERVISOR_ACTOR_RAY_NAMESPACE"
JOB_LOGS_PATH_TEMPLATE = "job-driver-{submission_id}.log"

logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@PublicAPI(stability="stable")
class JobStatus(str, Enum):
    """The set of states a job can be reported in."""

    #: The job has not started yet, likely waiting for the runtime_env to be set up.
    PENDING = "PENDING"
    #: The job is currently running.
    RUNNING = "RUNNING"
    #: The job was intentionally stopped by the user.
    STOPPED = "STOPPED"
    #: The job finished successfully.
    SUCCEEDED = "SUCCEEDED"
    #: The job failed.
    FAILED = "FAILED"

    def __str__(self) -> str:
        # Render as the bare value (e.g. "RUNNING").
        return str(self.value)

    def is_terminal(self) -> bool:
        """Tell whether this status is a terminal one.

        A terminal status is one that cannot transition to any other status.
        The terminal statuses are "STOPPED", "SUCCEEDED", and "FAILED".

        Returns:
            True if this status is terminal, otherwise False.
        """
        return self in (JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# TODO(aguo): Convert to pydantic model
|
| 69 |
+
@PublicAPI(stability="stable")
@dataclass
class JobInfo:
    """A class for recording information associated with a job and its execution.

    Please keep this in sync with the JobsAPIInfo proto in src/ray/protobuf/gcs.proto.
    """

    #: The status of the job.
    status: JobStatus
    #: The entrypoint command for this job.
    entrypoint: str
    #: A message describing the status in more detail.
    message: Optional[str] = None
    # TODO(architkulkarni): Populate this field with e.g. Runtime env setup failure,
    #: Internal error, user script error
    error_type: Optional[str] = None
    #: The time when the job was started. A Unix timestamp in ms.
    start_time: Optional[int] = None
    #: The time when the job moved into a terminal state. A Unix timestamp in ms.
    end_time: Optional[int] = None
    #: Arbitrary user-provided metadata for the job.
    metadata: Optional[Dict[str, str]] = None
    #: The runtime environment for the job.
    runtime_env: Optional[Dict[str, Any]] = None
    #: The quantity of CPU cores to reserve for the entrypoint command.
    entrypoint_num_cpus: Optional[Union[int, float]] = None
    #: The number of GPUs to reserve for the entrypoint command.
    entrypoint_num_gpus: Optional[Union[int, float]] = None
    #: The amount of memory for workers requesting memory for the entrypoint command.
    entrypoint_memory: Optional[int] = None
    #: The quantity of various custom resources to reserve for the entrypoint command.
    entrypoint_resources: Optional[Dict[str, float]] = None
    #: Driver agent http address
    driver_agent_http_address: Optional[str] = None
    #: The node id that the driver is running on. It will be None only when the
    #: job status is PENDING, and this field will not be deleted or modified
    #: even if the driver dies.
    driver_node_id: Optional[str] = None
    #: The driver process exit code after the driver executed. Return None if driver
    #: doesn't finish executing
    driver_exit_code: Optional[int] = None

    def __post_init__(self):
        """Coerce ``status`` to ``JobStatus`` and fill in a default message.

        A default message is only generated when ``message`` is ``None``; an
        explicitly supplied message is never overwritten.
        """
        if isinstance(self.status, str):
            self.status = JobStatus(self.status)
        if self.message is None:
            if self.status == JobStatus.PENDING:
                self.message = "Job has not started yet."
                # Mention resources only when the entrypoint actually
                # requested some.
                if any(
                    [
                        self.entrypoint_num_cpus is not None
                        and self.entrypoint_num_cpus > 0,
                        self.entrypoint_num_gpus is not None
                        and self.entrypoint_num_gpus > 0,
                        self.entrypoint_memory is not None
                        and self.entrypoint_memory > 0,
                        self.entrypoint_resources not in [None, {}],
                    ]
                ):
                    self.message += (
                        " It may be waiting for resources "
                        "(CPUs, GPUs, memory, custom resources) to become available."
                    )
                if self.runtime_env not in [None, {}]:
                    self.message += (
                        " It may be waiting for the runtime environment to be set up."
                    )
            elif self.status == JobStatus.RUNNING:
                self.message = "Job is currently running."
            elif self.status == JobStatus.STOPPED:
                self.message = "Job was intentionally stopped."
            elif self.status == JobStatus.SUCCEEDED:
                self.message = "Job finished successfully."
            elif self.status == JobStatus.FAILED:
                self.message = "Job failed."

    def to_json(self) -> Dict[str, Any]:
        """Convert this object to a JSON-serializable dictionary.

        Note that the runtime_env field is converted to a JSON-serialized string
        and the field is renamed to runtime_env_json.

        Returns:
            A JSON-serializable dictionary representing the JobInfo object.
        """

        json_dict = asdict(self)

        # Convert enum values to strings.
        json_dict["status"] = str(json_dict["status"])

        # Convert runtime_env to a JSON-serialized string. The original key
        # is always dropped; the renamed key is only added for non-None values.
        if "runtime_env" in json_dict:
            if json_dict["runtime_env"] is not None:
                json_dict["runtime_env_json"] = json.dumps(json_dict["runtime_env"])
            del json_dict["runtime_env"]

        # Assert that the dictionary is JSON-serializable.
        json.dumps(json_dict)

        return json_dict

    @classmethod
    def from_json(cls, json_dict: Dict[str, Any]) -> "JobInfo":
        """Build a JobInfo from a JSON dictionary.

        Note that the runtime_env_json field is converted to a dictionary and
        the field is renamed to runtime_env. The input dictionary is not
        modified.

        Args:
            json_dict: A JSON dictionary to use to initialize the JobInfo object.

        Returns:
            A new instance populated from ``json_dict``.
        """
        # Work on a shallow copy so the caller's dictionary is left untouched.
        init_kwargs = dict(json_dict)

        # Convert enum values to enum objects.
        init_kwargs["status"] = JobStatus(init_kwargs["status"])

        # Convert runtime_env from a JSON-serialized string to a dictionary.
        if "runtime_env_json" in init_kwargs:
            runtime_env_json = init_kwargs.pop("runtime_env_json")
            if runtime_env_json is not None:
                init_kwargs["runtime_env"] = json.loads(runtime_env_json)

        return cls(**init_kwargs)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class JobInfoStorageClient:
    """
    Interface to put and get job data from the Internal KV store.
    """

    # Please keep this format in sync with JobDataKey()
    # in src/ray/gcs/gcs_server/gcs_job_manager.h.
    JOB_DATA_KEY_PREFIX = f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_info_"
    JOB_DATA_KEY = f"{JOB_DATA_KEY_PREFIX}{{job_id}}"

    def __init__(
        self,
        gcs_aio_client: GcsAioClient,
        export_event_log_dir_root: Optional[str] = None,
    ):
        """
        Initialize the JobInfoStorageClient which manages data in the internal KV store.
        Export Submission Job events are written when the KV store is updated if
        the feature flag is on and an export_event_log_dir_root is passed.
        export_event_log_dir_root doesn't need to be passed if the caller
        is not modifying data in the KV store.
        """
        self._gcs_aio_client = gcs_aio_client
        # Stays None unless the export API is enabled and a log dir was given.
        self._export_submission_job_event_logger: Optional[logging.Logger] = None
        try:
            if (
                check_export_api_enabled(ExportEvent.SourceType.EXPORT_SUBMISSION_JOB)
                and export_event_log_dir_root is not None
            ):
                self._export_submission_job_event_logger = get_export_event_logger(
                    ExportEvent.SourceType.EXPORT_SUBMISSION_JOB,
                    export_event_log_dir_root,
                )
        except Exception:
            # Best effort: the client remains usable without export events.
            logger.exception(
                "Unable to initialize export event logger so no export "
                "events will be written."
            )

    async def put_info(
        self, job_id: str, job_info: JobInfo, overwrite: bool = True
    ) -> bool:
        """Put job info to the internal kv store.

        Args:
            job_id: The job id.
            job_info: The job info.
            overwrite: Whether to overwrite the existing job info.

        Returns:
            True if a new key is added.
        """
        added_num = await self._gcs_aio_client.internal_kv_put(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            json.dumps(job_info.to_json()).encode(),
            overwrite,
            namespace=ray_constants.KV_NAMESPACE_JOB,
        )
        if added_num == 1 or overwrite:
            # Write export event if data was updated in the KV store
            try:
                self._write_submission_job_export_event(job_id, job_info)
            except Exception:
                # A failed export must not fail the KV write that already
                # happened, so it is only logged.
                logger.exception("Error while writing job submission export event.")
        return added_num == 1

    def _write_submission_job_export_event(
        self, job_id: str, job_info: JobInfo
    ) -> None:
        """
        Write Submission Job export event if _export_submission_job_event_logger
        exists. The logger will exist if the export API feature flag is enabled
        and a log directory was passed to JobInfoStorageClient.
        """
        if not self._export_submission_job_event_logger:
            return

        # Map the JobStatus member onto the export enum by name.
        status_value_descriptor = (
            ExportSubmissionJobEventData.JobStatus.DESCRIPTOR.values_by_name.get(
                job_info.status.name
            )
        )
        if status_value_descriptor is None:
            logger.error(
                f"{job_info.status.name} is not a valid "
                "ExportSubmissionJobEventData.JobStatus enum value. This event "
                "will not be written."
            )
            return
        job_status = status_value_descriptor.number
        submission_event_data = ExportSubmissionJobEventData(
            submission_job_id=job_id,
            status=job_status,
            entrypoint=job_info.entrypoint,
            message=job_info.message,
            metadata=job_info.metadata,
            error_type=job_info.error_type,
            start_time=job_info.start_time,
            end_time=job_info.end_time,
            runtime_env_json=json.dumps(job_info.runtime_env),
            driver_agent_http_address=job_info.driver_agent_http_address,
            driver_node_id=job_info.driver_node_id,
            driver_exit_code=job_info.driver_exit_code,
        )
        self._export_submission_job_event_logger.send_event(submission_event_data)

    async def get_info(self, job_id: str, timeout: int = 30) -> Optional[JobInfo]:
        """Fetch and deserialize the stored info for ``job_id``.

        Args:
            job_id: The job id.
            timeout: Forwarded to the KV ``internal_kv_get`` call.

        Returns:
            The stored JobInfo, or None if the KV store has no entry.
        """
        serialized_info = await self._gcs_aio_client.internal_kv_get(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )
        if serialized_info is None:
            return None
        return JobInfo.from_json(json.loads(serialized_info))

    async def delete_info(self, job_id: str, timeout: int = 30):
        """Delete the stored info for ``job_id`` from the KV store.

        Args:
            job_id: The job id.
            timeout: Forwarded to the KV ``internal_kv_del`` call.
        """
        await self._gcs_aio_client.internal_kv_del(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            # NOTE(review): presumably the "delete by prefix" flag; confirm
            # against GcsAioClient.internal_kv_del.
            False,
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )

    async def put_status(
        self,
        job_id: str,
        status: JobStatus,
        message: Optional[str] = None,
        driver_exit_code: Optional[int] = None,
        jobinfo_replace_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Puts or updates job status. Sets end_time if status is terminal.

        Args:
            job_id: The job id.
            status: The new status.
            message: The new message; replaces the stored one even when None.
            driver_exit_code: The new driver exit code; replaces the stored
                one even when None.
            jobinfo_replace_kwargs: Extra JobInfo fields to set. ``status``,
                ``message`` and ``driver_exit_code`` given here are
                overridden by the explicit arguments. Not modified.

        Raises:
            AssertionError: If the stored status is terminal and differs
                from ``status``.
        """
        old_info = await self.get_info(job_id)

        # Copy first so the caller's dictionary is never mutated.
        replace_kwargs = dict(jobinfo_replace_kwargs or {})
        replace_kwargs.update(
            status=status, message=message, driver_exit_code=driver_exit_code
        )
        if old_info is not None:
            if status != old_info.status and old_info.status.is_terminal():
                # Raised explicitly (not via ``assert``) so the check still
                # runs when Python is started with -O.
                raise AssertionError(
                    "Attempted to change job status from a terminal state."
                )
            new_info = replace(old_info, **replace_kwargs)
        else:
            new_info = JobInfo(entrypoint="Entrypoint not found.", **replace_kwargs)

        if status.is_terminal():
            # Milliseconds since the epoch.
            new_info.end_time = int(time.time() * 1000)

        await self.put_info(job_id, new_info)

    async def get_status(self, job_id: str) -> Optional[JobStatus]:
        """Return the stored status for ``job_id``, or None if nothing is stored."""
        job_info = await self.get_info(job_id)
        if job_info is None:
            return None
        return job_info.status

    async def get_all_jobs(self, timeout: int = 30) -> Dict[str, JobInfo]:
        """Return every stored job info keyed by job id.

        Args:
            timeout: Forwarded to the key listing and to each ``get_info``.

        Returns:
            A mapping from job id (the key with JOB_DATA_KEY_PREFIX removed)
            to the result of ``get_info`` for that id.
        """
        raw_keys = await self._gcs_aio_client.internal_kv_keys(
            self.JOB_DATA_KEY_PREFIX.encode(),
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )
        prefix = self.JOB_DATA_KEY_PREFIX
        job_ids = []
        for raw_key in raw_keys:
            key = raw_key.decode()
            assert key.startswith(
                prefix
            ), "Unexpected format for internal_kv key for Job submission"
            job_ids.append(key[len(prefix) :])

        # gather() returns results in argument order, so they line up with
        # job_ids.
        job_infos = await asyncio.gather(
            *(self.get_info(job_id, timeout) for job_id in job_ids)
        )
        return dict(zip(job_ids, job_infos))
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
def uri_to_http_components(package_uri: str) -> Tuple[str, str]:
    """Split a package URI into the pieces that are sent over HTTP.

    Args:
        package_uri: The package URI; its suffix must be ``.zip`` or ``.whl``.

    Returns:
        A ``(protocol value, package name)`` pair as produced by ``parse_uri``.

    Raises:
        ValueError: If the URI does not end in ``.zip`` or ``.whl``.
    """
    allowed_suffixes = (".zip", ".whl")
    if Path(package_uri).suffix not in allowed_suffixes:
        raise ValueError(f"package_uri ({package_uri}) does not end in .zip or .whl")

    # The "<protocol>://" prefix has to be stripped so that the package_uri
    # can be passed over HTTP.
    parsed = parse_uri(package_uri)
    protocol, package_name = parsed
    return protocol.value, package_name
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def http_uri_components_to_uri(protocol: str, package_name: str) -> str:
    """Join a protocol and a package name back into ``<protocol>://<package_name>``."""
    return "{}://{}".format(protocol, package_name)
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def validate_request_type(json_data: Dict[str, Any], request_type: dataclass) -> Any:
    """Instantiate ``request_type`` using the entries of ``json_data`` as keyword arguments.

    Whatever validation happens is done by the ``request_type`` constructor;
    any exception it raises propagates to the caller.
    """
    request = request_type(**json_data)
    return request
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
@dataclass
class JobSubmitRequest:
    """Job submission payload whose field types are checked on construction."""

    # Command to start execution, ex: "python script.py"
    entrypoint: str
    # Optional submission_id to specify for the job. If the submission_id
    # is not specified, one will be generated. If a job with the same
    # submission_id already exists, it will be rejected.
    submission_id: Optional[str] = None
    # DEPRECATED. Use submission_id instead
    job_id: Optional[str] = None
    # Dict to setup execution environment.
    runtime_env: Optional[Dict[str, Any]] = None
    # Metadata to pass in to the JobConfig.
    metadata: Optional[Dict[str, str]] = None
    # The quantity of CPU cores to reserve for the execution
    # of the entrypoint command, separately from any Ray tasks or actors
    # that are created by it.
    entrypoint_num_cpus: Optional[Union[int, float]] = None
    # The quantity of GPUs to reserve for the execution
    # of the entrypoint command, separately from any Ray tasks or actors
    # that are created by it.
    entrypoint_num_gpus: Optional[Union[int, float]] = None
    # The amount of total available memory for workers requesting memory
    # for the execution of the entrypoint command, separately from any Ray
    # tasks or actors that are created by it.
    entrypoint_memory: Optional[int] = None
    # The quantity of various custom resources
    # to reserve for the entrypoint command, separately from any Ray tasks
    # or actors that are created by it.
    entrypoint_resources: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Type-check every field in declaration order.

        Raises:
            TypeError: On the first field (or dict key/value) whose type does
                not match what the field accepts.
        """
        if not isinstance(self.entrypoint, str):
            raise TypeError(f"entrypoint must be a string, got {type(self.entrypoint)}")

        # Both identifiers are optional strings and share one message shape.
        for id_field in ("submission_id", "job_id"):
            id_value = getattr(self, id_field)
            if not (id_value is None or isinstance(id_value, str)):
                raise TypeError(
                    f"{id_field} must be a string if provided, got {type(id_value)}"
                )

        env = self.runtime_env
        if env is not None:
            if not isinstance(env, dict):
                raise TypeError(f"runtime_env must be a dict, got {type(env)}")
            for env_key in env:
                if not isinstance(env_key, str):
                    raise TypeError(
                        f"runtime_env keys must be strings, got {type(env_key)}"
                    )

        meta = self.metadata
        if meta is not None:
            if not isinstance(meta, dict):
                raise TypeError(f"metadata must be a dict, got {type(meta)}")
            # All keys are checked before any value is looked at.
            for meta_key in meta:
                if not isinstance(meta_key, str):
                    raise TypeError(
                        f"metadata keys must be strings, got {type(meta_key)}"
                    )
            for meta_value in meta.values():
                if not isinstance(meta_value, str):
                    raise TypeError(
                        f"metadata values must be strings, got {type(meta_value)}"
                    )

        # CPU and GPU quantities accept any int or float.
        for count_field in ("entrypoint_num_cpus", "entrypoint_num_gpus"):
            count = getattr(self, count_field)
            if not (count is None or isinstance(count, (int, float))):
                raise TypeError(f"{count_field} must be a number, got {type(count)}")

        memory = self.entrypoint_memory
        if not (memory is None or isinstance(memory, int)):
            raise TypeError(f"entrypoint_memory must be an integer, got {type(memory)}")

        resources = self.entrypoint_resources
        if resources is not None:
            if not isinstance(resources, dict):
                raise TypeError(
                    f"entrypoint_resources must be a dict, got {type(resources)}"
                )
            # All keys are checked before any value is looked at.
            for resource_name in resources:
                if not isinstance(resource_name, str):
                    raise TypeError(
                        "entrypoint_resources keys must be strings, "
                        f"got {type(resource_name)}"
                    )
            for quantity in resources.values():
                if not isinstance(quantity, (int, float)):
                    raise TypeError(
                        "entrypoint_resources values must be numbers, "
                        f"got {type(quantity)}"
                    )
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
@dataclass
class JobSubmitResponse:
    """Response body for a job submission."""

    # DEPRECATED: Use submission_id instead.
    job_id: str
    # Identifier of the submitted job.
    submission_id: str
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
@dataclass
class JobStopResponse:
    """Response body for a job stop request."""

    # Result of the stop request.
    stopped: bool
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
@dataclass
class JobDeleteResponse:
    """Response body for a job delete request."""

    # Result of the delete request.
    deleted: bool
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
# TODO(jiaodong): Support log streaming #19415
|
| 536 |
+
@dataclass
class JobLogsResponse:
    """Response body carrying a job's logs."""

    # The log text, returned as a single string.
    logs: str
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_agent.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import traceback
|
| 5 |
+
|
| 6 |
+
import aiohttp
|
| 7 |
+
from aiohttp.web import Request, Response
|
| 8 |
+
|
| 9 |
+
import ray
|
| 10 |
+
import ray.dashboard.optional_utils as optional_utils
|
| 11 |
+
import ray.dashboard.utils as dashboard_utils
|
| 12 |
+
from ray.dashboard.modules.job.common import (
|
| 13 |
+
JobDeleteResponse,
|
| 14 |
+
JobLogsResponse,
|
| 15 |
+
JobStopResponse,
|
| 16 |
+
JobSubmitRequest,
|
| 17 |
+
JobSubmitResponse,
|
| 18 |
+
)
|
| 19 |
+
from ray.dashboard.modules.job.job_manager import JobManager
|
| 20 |
+
from ray.dashboard.modules.job.pydantic_models import JobType
|
| 21 |
+
from ray.dashboard.modules.job.utils import find_job_by_ids, parse_and_validate_request
|
| 22 |
+
|
| 23 |
+
routes = optional_utils.DashboardAgentRouteTable
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class JobAgent(dashboard_utils.DashboardAgentModule):
    """Dashboard agent module serving the ``/api/job_agent/jobs`` HTTP routes."""

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        # Created on first use by get_job_manager().
        self._job_manager = None

    async def _find_submission_job(self, job_or_submission_id: str, action: str):
        """Look up a job and check that it is a submission-type job.

        Args:
            job_or_submission_id: The id taken from the request path.
            action: Verb phrase inserted into the "Can only <action>
                submission type jobs" error text.

        Returns:
            ``(job, None)`` when the job exists and is a submission job,
            otherwise ``(None, response)`` where ``response`` is the 404 or
            400 error Response to send back.
        """
        job = await find_job_by_ids(
            self._dashboard_agent.gcs_aio_client,
            self.get_job_manager().job_info_client(),
            job_or_submission_id,
        )
        if not job:
            return None, Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )
        if job.type is not JobType.SUBMISSION:
            return None, Response(
                text=f"Can only {action} submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        return job, None

    @routes.post("/api/job_agent/jobs/")
    @optional_utils.deny_browser_requests()
    @optional_utils.init_ray_and_catch_exceptions()
    async def submit_job(self, req: Request) -> Response:
        """Submit a job described by a JobSubmitRequest JSON body.

        Returns a JSON JobSubmitResponse on success, a 400 response with the
        traceback for TypeError/ValueError, and a 500 response with the
        traceback for any other exception.
        """
        result = await parse_and_validate_request(req, JobSubmitRequest)
        # Request parsing failed, returned with Response object.
        if isinstance(result, Response):
            return result
        submit_request = result

        # job_id is the deprecated spelling; submission_id wins when both are set.
        request_submission_id = submit_request.submission_id or submit_request.job_id
        try:
            ray._private.usage.usage_lib.record_library_usage("job_submission")
            submission_id = await self.get_job_manager().submit_job(
                entrypoint=submit_request.entrypoint,
                submission_id=request_submission_id,
                runtime_env=submit_request.runtime_env,
                metadata=submit_request.metadata,
                entrypoint_num_cpus=submit_request.entrypoint_num_cpus,
                entrypoint_num_gpus=submit_request.entrypoint_num_gpus,
                entrypoint_memory=submit_request.entrypoint_memory,
                entrypoint_resources=submit_request.entrypoint_resources,
            )

            resp = JobSubmitResponse(job_id=submission_id, submission_id=submission_id)
        except (TypeError, ValueError):
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.post("/api/job_agent/jobs/{job_or_submission_id}/stop")
    @optional_utils.deny_browser_requests()
    @optional_utils.init_ray_and_catch_exceptions()
    async def stop_job(self, req: Request) -> Response:
        """Stop a submission job; responds with a JSON JobStopResponse."""
        job, error_response = await self._find_submission_job(
            req.match_info["job_or_submission_id"], "stop"
        )
        if error_response is not None:
            return error_response

        try:
            stopped = self.get_job_manager().stop_job(job.submission_id)
            resp = JobStopResponse(stopped=stopped)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.delete("/api/job_agent/jobs/{job_or_submission_id}")
    @optional_utils.init_ray_and_catch_exceptions()
    async def delete_job(self, req: Request) -> Response:
        """Delete a submission job; responds with a JSON JobDeleteResponse."""
        job, error_response = await self._find_submission_job(
            req.match_info["job_or_submission_id"], "delete"
        )
        if error_response is not None:
            return error_response

        try:
            deleted = await self.get_job_manager().delete_job(job.submission_id)
            resp = JobDeleteResponse(deleted=deleted)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/job_agent/jobs/{job_or_submission_id}/logs")
    @optional_utils.init_ray_and_catch_exceptions()
    async def get_job_logs(self, req: Request) -> Response:
        """Return the logs of a submission job as a JSON JobLogsResponse."""
        job, error_response = await self._find_submission_job(
            req.match_info["job_or_submission_id"], "get logs of"
        )
        if error_response is not None:
            return error_response

        resp = JobLogsResponse(
            logs=self.get_job_manager().get_job_logs(job.submission_id)
        )
        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/job_agent/jobs/{job_or_submission_id}/logs/tail")
    @optional_utils.init_ray_and_catch_exceptions()
    async def tail_job_logs(self, req: Request) -> Response:
        """Stream the logs of a submission job over a WebSocket.

        On success the prepared WebSocketResponse is returned once the job
        manager's log iterator is exhausted.
        """
        job, error_response = await self._find_submission_job(
            req.match_info["job_or_submission_id"], "get logs of"
        )
        if error_response is not None:
            return error_response

        ws = aiohttp.web.WebSocketResponse()
        await ws.prepare(req)

        # Go through the accessor like every other handler rather than
        # reading the lazily-initialized attribute directly.
        async for lines in self.get_job_manager().tail_job_logs(job.submission_id):
            await ws.send_str(lines)

        return ws

    def get_job_manager(self):
        """Return the JobManager, creating it on first call."""
        if not self._job_manager:
            self._job_manager = JobManager(
                self._dashboard_agent.gcs_aio_client, self._dashboard_agent.log_dir
            )
        return self._job_manager

    async def run(self, server):
        """No-op."""
        pass

    @staticmethod
    def is_minimal_module():
        """Return False."""
        return False
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_head.py
ADDED
|
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import dataclasses
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import traceback
|
| 6 |
+
from random import sample
|
| 7 |
+
from typing import AsyncIterator, List, Optional
|
| 8 |
+
|
| 9 |
+
import aiohttp.web
|
| 10 |
+
from aiohttp.client import ClientResponse
|
| 11 |
+
from aiohttp.web import Request, Response
|
| 12 |
+
|
| 13 |
+
import ray
|
| 14 |
+
import ray.dashboard.consts as dashboard_consts
|
| 15 |
+
import ray.dashboard.optional_utils as optional_utils
|
| 16 |
+
import ray.dashboard.utils as dashboard_utils
|
| 17 |
+
from ray._private.ray_constants import env_bool
|
| 18 |
+
from ray._private.runtime_env.packaging import (
|
| 19 |
+
package_exists,
|
| 20 |
+
pin_runtime_env_uri,
|
| 21 |
+
upload_package_to_gcs,
|
| 22 |
+
)
|
| 23 |
+
from ray._private.utils import get_or_create_event_loop
|
| 24 |
+
from ray.dashboard.datacenter import DataOrganizer
|
| 25 |
+
from ray.dashboard.modules.job.common import (
|
| 26 |
+
JobDeleteResponse,
|
| 27 |
+
JobInfoStorageClient,
|
| 28 |
+
JobLogsResponse,
|
| 29 |
+
JobStopResponse,
|
| 30 |
+
JobSubmitRequest,
|
| 31 |
+
JobSubmitResponse,
|
| 32 |
+
http_uri_components_to_uri,
|
| 33 |
+
)
|
| 34 |
+
from ray.dashboard.modules.job.pydantic_models import JobDetails, JobType
|
| 35 |
+
from ray.dashboard.modules.job.utils import (
|
| 36 |
+
find_job_by_ids,
|
| 37 |
+
get_driver_jobs,
|
| 38 |
+
get_head_node_id,
|
| 39 |
+
parse_and_validate_request,
|
| 40 |
+
)
|
| 41 |
+
from ray.dashboard.modules.version import CURRENT_VERSION, VersionResponse
|
| 42 |
+
|
| 43 |
+
logger = logging.getLogger(__name__)
|
| 44 |
+
logger.setLevel(logging.INFO)
|
| 45 |
+
|
| 46 |
+
routes = optional_utils.DashboardHeadRouteTable
|
| 47 |
+
|
| 48 |
+
# Feature flag controlling whether critical Ray Job control operations are performed
|
| 49 |
+
# exclusively by the Job Agent running on the Head node (or randomly sampled Worker one)
|
| 50 |
+
#
|
| 51 |
+
# NOTE: This flag serves as a temporary kill-switch and should be eventually cleaned up
|
| 52 |
+
RAY_JOB_AGENT_USE_HEAD_NODE_ONLY = env_bool("RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", True)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class JobAgentSubmissionClient:
|
| 56 |
+
"""A local client for submitting and interacting with jobs on a specific node
|
| 57 |
+
in the remote cluster.
|
| 58 |
+
Submits requests over HTTP to the job agent on the specific node using the REST API.
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
def __init__(
    self,
    dashboard_agent_address: str,
):
    """Store the agent address and create the aiohttp session used for requests.

    Args:
        dashboard_agent_address: Base address that the request paths of this
            client are appended to.
    """
    self._session = aiohttp.ClientSession()
    self._agent_address = dashboard_agent_address
|
| 67 |
+
|
| 68 |
+
async def _raise_error(self, resp: ClientResponse):
    """Raise a RuntimeError carrying the response's status code and body text.

    Raises:
        RuntimeError: Always.
    """
    code = resp.status
    body = await resp.text()
    failure = f"Request failed with status code {code}: {body}."
    raise RuntimeError(failure)
|
| 72 |
+
|
| 73 |
+
async def submit_job_internal(self, req: JobSubmitRequest) -> JobSubmitResponse:
    """POST the request to the agent's job endpoint.

    Returns:
        The JobSubmitResponse built from the JSON body of a 200 reply.

    Raises:
        RuntimeError: Via ``_raise_error`` for any other status code.
    """
    logger.debug(f"Submitting job with submission_id={req.submission_id}.")

    endpoint = f"{self._agent_address}/api/job_agent/jobs/"
    payload = dataclasses.asdict(req)
    async with self._session.post(endpoint, json=payload) as resp:
        if resp.status != 200:
            await self._raise_error(resp)
        else:
            fields = await resp.json()
            return JobSubmitResponse(**fields)
|
| 84 |
+
|
| 85 |
+
async def stop_job_internal(self, job_id: str) -> JobStopResponse:
|
| 86 |
+
logger.debug(f"Stopping job with job_id={job_id}.")
|
| 87 |
+
|
| 88 |
+
async with self._session.post(
|
| 89 |
+
f"{self._agent_address}/api/job_agent/jobs/{job_id}/stop"
|
| 90 |
+
) as resp:
|
| 91 |
+
if resp.status == 200:
|
| 92 |
+
result_json = await resp.json()
|
| 93 |
+
return JobStopResponse(**result_json)
|
| 94 |
+
else:
|
| 95 |
+
await self._raise_error(resp)
|
| 96 |
+
|
| 97 |
+
async def delete_job_internal(self, job_id: str) -> JobDeleteResponse:
|
| 98 |
+
logger.debug(f"Deleting job with job_id={job_id}.")
|
| 99 |
+
|
| 100 |
+
async with self._session.delete(
|
| 101 |
+
f"{self._agent_address}/api/job_agent/jobs/{job_id}"
|
| 102 |
+
) as resp:
|
| 103 |
+
if resp.status == 200:
|
| 104 |
+
result_json = await resp.json()
|
| 105 |
+
return JobDeleteResponse(**result_json)
|
| 106 |
+
else:
|
| 107 |
+
await self._raise_error(resp)
|
| 108 |
+
|
| 109 |
+
async def get_job_logs_internal(self, job_id: str) -> JobLogsResponse:
|
| 110 |
+
async with self._session.get(
|
| 111 |
+
f"{self._agent_address}/api/job_agent/jobs/{job_id}/logs"
|
| 112 |
+
) as resp:
|
| 113 |
+
if resp.status == 200:
|
| 114 |
+
result_json = await resp.json()
|
| 115 |
+
return JobLogsResponse(**result_json)
|
| 116 |
+
else:
|
| 117 |
+
await self._raise_error(resp)
|
| 118 |
+
|
| 119 |
+
async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
|
| 120 |
+
"""Get an iterator that follows the logs of a job."""
|
| 121 |
+
ws = await self._session.ws_connect(
|
| 122 |
+
f"{self._agent_address}/api/job_agent/jobs/{job_id}/logs/tail"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
while True:
|
| 126 |
+
msg = await ws.receive()
|
| 127 |
+
|
| 128 |
+
if msg.type == aiohttp.WSMsgType.TEXT:
|
| 129 |
+
yield msg.data
|
| 130 |
+
elif msg.type == aiohttp.WSMsgType.CLOSED:
|
| 131 |
+
break
|
| 132 |
+
elif msg.type == aiohttp.WSMsgType.ERROR:
|
| 133 |
+
pass
|
| 134 |
+
|
| 135 |
+
async def close(self, ignore_error=True):
|
| 136 |
+
try:
|
| 137 |
+
await self._session.close()
|
| 138 |
+
except Exception:
|
| 139 |
+
if not ignore_error:
|
| 140 |
+
raise
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class JobHead(dashboard_utils.DashboardHeadModule):
    """Runs on the head node of a Ray cluster and handles Ray Jobs APIs.

    NOTE(architkulkarni): Please keep this class in sync with the OpenAPI spec at
    `doc/source/cluster/running-applications/job-submission/openapi.yml`.
    We currently do not automatically check that the OpenAPI
    spec is in sync with the implementation. If any changes are made to the
    paths in the @route decorators or in the Responses returned by the
    methods (or any nested fields in the Responses), you will need to find the
    corresponding field of the OpenAPI yaml file and update it manually. Also,
    bump the version number in the yaml file and in this class's `get_version`.
    """

    # Time that we sleep while tailing logs while waiting for
    # the supervisor actor to start. We don't know which node
    # to read the logs from until then.
    WAIT_FOR_SUPERVISOR_ACTOR_INTERVAL_S = 1

    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        super().__init__(config)
        # Created lazily in `run()`.
        self._job_info_client = None

        # It contains all `JobAgentSubmissionClient` that
        # `JobHead` has ever used, and will not be deleted
        # from it unless `JobAgentSubmissionClient` is no
        # longer available (the corresponding agent process is dead)
        self._agents = dict()

    async def get_target_agent(self) -> Optional[JobAgentSubmissionClient]:
        """Return the agent client that should serve job control requests.

        Uses the head node's agent when `RAY_JOB_AGENT_USE_HEAD_NODE_ONLY` is
        set, otherwise a randomly picked agent. May return None when the head
        node agent cannot be resolved.
        """
        if RAY_JOB_AGENT_USE_HEAD_NODE_ONLY:
            return await self._get_head_node_agent()

        return await self._pick_random_agent()

    async def _wait_for_target_agent(self) -> JobAgentSubmissionClient:
        """Resolve the target agent within `WAIT_AVAILABLE_AGENT_TIMEOUT`.

        Raises:
            asyncio.TimeoutError: if no agent could be resolved in time.
            RuntimeError: if agent resolution completed without yielding a
                client (`get_target_agent` returned None).
        """
        job_agent_client = await asyncio.wait_for(
            self.get_target_agent(),
            timeout=dashboard_consts.WAIT_AVAILABLE_AGENT_TIMEOUT,
        )
        if job_agent_client is None:
            # Fail with an explicit message rather than an AttributeError on None.
            raise RuntimeError("No job agent is currently available.")
        return job_agent_client

    @staticmethod
    def _create_agent_client(agent_info) -> JobAgentSubmissionClient:
        """Build a client from an agent info's `ipAddress` / `httpPort` entries."""
        node_ip = agent_info["ipAddress"]
        http_port = agent_info["httpPort"]
        agent_http_address = f"http://{node_ip}:{http_port}"
        return JobAgentSubmissionClient(agent_http_address)

    async def _pick_random_agent(self) -> Optional[JobAgentSubmissionClient]:
        """
        Try to disperse as much as possible to select one of
        the `CANDIDATE_AGENT_NUMBER` agents to solve requests.
        the agents will not pop from `self._agents` unless
        it's dead. Saved in `self._agents` is the agent that was
        used before.
        Strategy:
            1. if the number of `self._agents` has reached
               `CANDIDATE_AGENT_NUMBER`, randomly select one agent from
               `self._agents`.
            2. if not, randomly select one agent from all available agents,
               it is possible that the selected one already exists in
               `self._agents`.
        """
        # NOTE: Following call will block until there's at least 1 agent info
        #       being populated from GCS
        agent_infos = await self._fetch_agent_infos()

        # delete dead agents.
        for dead_node in set(self._agents) - set(agent_infos):
            client = self._agents.pop(dead_node)
            await client.close()

        if len(self._agents) >= dashboard_consts.CANDIDATE_AGENT_NUMBER:
            node_id = sample(list(self._agents), 1)[0]
            return self._agents[node_id]

        # Randomly select one from among all agents, it is possible that
        # the selected one already exists in `self._agents`
        node_id = sample(sorted(agent_infos), 1)[0]
        if node_id not in self._agents:
            self._agents[node_id] = self._create_agent_client(agent_infos[node_id])

        return self._agents[node_id]

    async def _get_head_node_agent(self) -> Optional[JobAgentSubmissionClient]:
        """Retrieves HTTP client for `JobAgent` running on the Head node"""

        head_node_id = await get_head_node_id(self.gcs_aio_client)

        if not head_node_id:
            logger.warning("Head node id has not yet been persisted in GCS")
            return None

        if head_node_id not in self._agents:
            agent_infos = await self._fetch_agent_infos(target_node_ids=[head_node_id])
            if head_node_id not in agent_infos:
                logger.error("Head node agent's information was not found")
                return None

            self._agents[head_node_id] = self._create_agent_client(
                agent_infos[head_node_id]
            )

        return self._agents[head_node_id]

    @staticmethod
    async def _fetch_agent_infos(target_node_ids: Optional[List[str]] = None):
        """Fetches agent infos for nodes identified by provided node-ids (for all
        nodes if not provided)

        NOTE: This call will block until there's at least 1 valid agent info populated
        """

        while True:
            raw_agent_infos = await DataOrganizer.get_agent_infos(target_node_ids)
            # Filter out invalid agent infos with unset HTTP port
            agent_infos = {
                key: value
                for key, value in raw_agent_infos.items()
                if value.get("httpPort", -1) > 0
            }

            if len(agent_infos) > 0:
                return agent_infos

            await asyncio.sleep(dashboard_consts.TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS)

    @routes.get("/api/version")
    async def get_version(self, req: Request) -> Response:
        """Return the API version, Ray version/commit and session name as JSON."""
        # NOTE(edoakes): CURRENT_VERSION should be bumped and checked on the
        # client when we have backwards-incompatible changes.
        resp = VersionResponse(
            version=CURRENT_VERSION,
            ray_version=ray.__version__,
            ray_commit=ray.__commit__,
            session_name=self.session_name,
        )
        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.get("/api/packages/{protocol}/{package_name}")
    async def get_package(self, req: Request) -> Response:
        """Pin the requested package URI and report whether the package exists.

        Responds 500 with a traceback if pinning fails, 404 if the package does
        not exist, and an empty default response otherwise.
        """
        package_uri = http_uri_components_to_uri(
            protocol=req.match_info["protocol"],
            package_name=req.match_info["package_name"],
        )

        logger.debug(f"Adding temporary reference to package {package_uri}.")
        try:
            pin_runtime_env_uri(package_uri)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        if not package_exists(package_uri):
            return Response(
                text=f"Package {package_uri} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        return Response()

    @routes.put("/api/packages/{protocol}/{package_name}")
    async def upload_package(self, req: Request):
        """Read the request body and upload it as a package via an executor.

        Responds 500 with a traceback if reading or uploading fails.
        """
        package_uri = http_uri_components_to_uri(
            protocol=req.match_info["protocol"],
            package_name=req.match_info["package_name"],
        )
        logger.info(f"Uploading package {package_uri} to the GCS.")
        try:
            data = await req.read()
            # Run the upload in the default executor rather than on the event loop.
            await get_or_create_event_loop().run_in_executor(
                None,
                upload_package_to_gcs,
                package_uri,
                data,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(status=aiohttp.web.HTTPOk.status_code)

    @routes.post("/api/jobs/")
    async def submit_job(self, req: Request) -> Response:
        """Validate a job submission request and forward it to the target agent.

        Responds 500 on timeout or unexpected errors and 400 on
        TypeError/ValueError; on success returns the agent's response as JSON.
        """
        result = await parse_and_validate_request(req, JobSubmitRequest)
        # Request parsing failed, returned with Response object.
        if isinstance(result, Response):
            return result
        else:
            submit_request: JobSubmitRequest = result

        try:
            job_agent_client = await self._wait_for_target_agent()
            resp = await job_agent_client.submit_job_internal(submit_request)
        except asyncio.TimeoutError:
            return Response(
                text="No available agent to submit job, please try again later.",
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )
        except (TypeError, ValueError):
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.post("/api/jobs/{job_or_submission_id}/stop")
    async def stop_job(self, req: Request) -> Response:
        """Stop a submission-type job through the target agent.

        Responds 404 if the job is unknown, 400 if it is not a submission job,
        and 500 with a traceback if the agent call fails.
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )
        if job.type is not JobType.SUBMISSION:
            return Response(
                text="Can only stop submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )

        try:
            job_agent_client = await self._wait_for_target_agent()
            resp = await job_agent_client.stop_job_internal(job.submission_id)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.delete("/api/jobs/{job_or_submission_id}")
    async def delete_job(self, req: Request) -> Response:
        """Delete a submission-type job through the target agent.

        Responds 404 if the job is unknown, 400 if it is not a submission job,
        and 500 with a traceback if the agent call fails.
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )
        if job.type is not JobType.SUBMISSION:
            return Response(
                text="Can only delete submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )

        try:
            job_agent_client = await self._wait_for_target_agent()
            resp = await job_agent_client.delete_job_internal(job.submission_id)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/jobs/{job_or_submission_id}")
    async def get_job_info(self, req: Request) -> Response:
        """Return the details of a single job as JSON, or 404 if it is unknown."""
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        return Response(
            text=json.dumps(job.dict()),
            content_type="application/json",
        )

    # TODO(rickyx): This endpoint's logic is also mirrored in state API's endpoint.
    # We should eventually unify the backend logic (and keep the logic in sync before
    # that).
    @routes.get("/api/jobs/")
    async def list_jobs(self, req: Request) -> Response:
        """Return all submission jobs followed by all driver jobs as a JSON list."""
        (driver_jobs, submission_job_drivers), submission_jobs = await asyncio.gather(
            get_driver_jobs(self.gcs_aio_client), self._job_info_client.get_all_jobs()
        )

        submission_job_details = []
        for submission_id, job in submission_jobs.items():
            # A submission job has no driver info until its driver is known.
            driver_info = submission_job_drivers.get(submission_id)
            submission_job_details.append(
                JobDetails(
                    **dataclasses.asdict(job),
                    submission_id=submission_id,
                    job_id=driver_info.id if driver_info is not None else None,
                    driver_info=driver_info,
                    type=JobType.SUBMISSION,
                )
            )

        return Response(
            text=json.dumps(
                [
                    *[submission_job.dict() for submission_job in submission_job_details],
                    *[job_info.dict() for job_info in driver_jobs.values()],
                ]
            ),
            content_type="application/json",
        )

    @routes.get("/api/jobs/{job_or_submission_id}/logs")
    async def get_job_logs(self, req: Request) -> Response:
        """Return the logs of a submission-type job as JSON.

        Responds 404 if the job is unknown, 400 if it is not a submission job,
        and 500 with a traceback on failure. When the job has no driver agent
        address yet, an empty `JobLogsResponse` is returned.
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        if job.type is not JobType.SUBMISSION:
            return Response(
                text="Can only get logs of submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )

        try:
            job_agent_client = self.get_job_driver_agent_client(job)
            payload = (
                await job_agent_client.get_job_logs_internal(job.submission_id)
                if job_agent_client
                else JobLogsResponse("")
            )
            return Response(
                text=json.dumps(dataclasses.asdict(payload)),
                content_type="application/json",
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

    @routes.get("/api/jobs/{job_or_submission_id}/logs/tail")
    async def tail_job_logs(self, req: Request) -> Response:
        """Stream the logs of a submission-type job over a websocket.

        Responds 404 if the job is unknown and 400 if it is not a submission
        job. Otherwise the websocket is prepared, the job is polled until its
        driver agent address is known, and log lines from that agent are relayed
        to the caller. The websocket is returned without any lines if the job
        reaches a terminal state, or can no longer be found, before the address
        becomes available.
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        if job.type is not JobType.SUBMISSION:
            return Response(
                text="Can only get logs of submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )

        ws = aiohttp.web.WebSocketResponse()
        await ws.prepare(req)

        while True:
            job = await find_job_by_ids(
                self.gcs_aio_client,
                self._job_info_client,
                job_or_submission_id,
            )
            if not job:
                # The job disappeared while we were waiting; nothing to tail.
                return ws

            if job.driver_agent_http_address is not None:
                # Address resolved: start tailing right away, no extra sleep.
                break

            if job.status.is_terminal():
                # Job exited before supervisor actor started.
                return ws

            await asyncio.sleep(self.WAIT_FOR_SUPERVISOR_ACTOR_INTERVAL_S)

        job_agent_client = self.get_job_driver_agent_client(job)

        async for lines in job_agent_client.tail_job_logs(job.submission_id):
            await ws.send_str(lines)

        return ws

    def get_job_driver_agent_client(
        self, job: JobDetails
    ) -> Optional[JobAgentSubmissionClient]:
        """Return the (cached) agent client for the node running the job's driver.

        Returns None when the job has no driver agent HTTP address yet.
        """
        if job.driver_agent_http_address is None:
            return None

        driver_node_id = job.driver_node_id
        if driver_node_id not in self._agents:
            self._agents[driver_node_id] = JobAgentSubmissionClient(
                job.driver_agent_http_address
            )

        return self._agents[driver_node_id]

    async def run(self, server):
        """Create the job info storage client if it has not been created yet."""
        if not self._job_info_client:
            self._job_info_client = JobInfoStorageClient(self.gcs_aio_client)

    @staticmethod
    def is_minimal_module():
        return False
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_log_storage_client.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from collections import deque
|
| 3 |
+
from typing import AsyncIterator, List, Tuple
|
| 4 |
+
|
| 5 |
+
import ray
|
| 6 |
+
from ray.dashboard.modules.job.common import JOB_LOGS_PATH_TEMPLATE
|
| 7 |
+
from ray.dashboard.modules.job.utils import file_tail_iterator
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class JobLogStorageClient:
    """
    Disk storage for stdout / stderr of driver script logs.
    """

    # Number of last N lines to put in job message upon failure.
    NUM_LOG_LINES_ON_ERROR = 10
    # Maximum number of characters to print out of the logs to avoid
    # HUGE log outputs that bring down the api server
    MAX_LOG_SIZE = 20000

    def get_logs(self, job_id: str) -> str:
        """Return the whole log file of `job_id`, or "" if the file is missing."""
        try:
            with open(self.get_log_file_path(job_id), "r") as f:
                return f.read()
        except FileNotFoundError:
            return ""

    def tail_logs(self, job_id: str) -> AsyncIterator[List[str]]:
        """Return an async iterator over batches of lines of the job's log file."""
        return file_tail_iterator(self.get_log_file_path(job_id))

    async def get_last_n_log_lines(
        self, job_id: str, num_log_lines: int = NUM_LOG_LINES_ON_ERROR
    ) -> str:
        """
        Returns the last MAX_LOG_SIZE (20000) characters in the last
        `num_log_lines` lines.

        Args:
            job_id: The id of the job whose logs we want to return
            num_log_lines: The number of lines to return.
        """
        # A bounded deque keeps only the most recent `num_log_lines` lines.
        log_tail_deque = deque(maxlen=num_log_lines)
        async for lines in self.tail_logs(job_id):
            if lines is None:
                # A `None` batch ends the read.
                break
            else:
                # log_tail_iter can return batches of lines at a time.
                for line in lines:
                    log_tail_deque.append(line)

        return "".join(log_tail_deque)[-self.MAX_LOG_SIZE :]

    def get_log_file_path(self, job_id: str) -> str:
        """
        Get the file path to the logs of a given job. Example:
            /tmp/ray/session_date/logs/job-driver-{job_id}.log
        """
        return os.path.join(
            ray._private.worker._global_node.get_logs_dir_path(),
            JOB_LOGS_PATH_TEMPLATE.format(submission_id=job_id),
        )
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_manager.py
ADDED
|
@@ -0,0 +1,640 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import copy
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import random
|
| 6 |
+
import string
|
| 7 |
+
import time
|
| 8 |
+
import traceback
|
| 9 |
+
from typing import Any, AsyncIterator, Dict, Optional, Union
|
| 10 |
+
|
| 11 |
+
import ray
|
| 12 |
+
import ray._private.ray_constants as ray_constants
|
| 13 |
+
from ray._private.event.event_logger import get_event_logger
|
| 14 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 15 |
+
from ray._private.utils import run_background_task
|
| 16 |
+
from ray.actor import ActorHandle
|
| 17 |
+
from ray.core.generated.event_pb2 import Event
|
| 18 |
+
from ray.dashboard.consts import (
|
| 19 |
+
DEFAULT_JOB_START_TIMEOUT_SECONDS,
|
| 20 |
+
RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR,
|
| 21 |
+
RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR,
|
| 22 |
+
RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR,
|
| 23 |
+
)
|
| 24 |
+
from ray.dashboard.modules.job.common import (
|
| 25 |
+
JOB_ACTOR_NAME_TEMPLATE,
|
| 26 |
+
SUPERVISOR_ACTOR_RAY_NAMESPACE,
|
| 27 |
+
JobInfo,
|
| 28 |
+
JobInfoStorageClient,
|
| 29 |
+
)
|
| 30 |
+
from ray.dashboard.modules.job.job_log_storage_client import JobLogStorageClient
|
| 31 |
+
from ray.dashboard.modules.job.job_supervisor import JobSupervisor
|
| 32 |
+
from ray.dashboard.modules.job.utils import get_head_node_id
|
| 33 |
+
from ray.dashboard.utils import close_logger_file_descriptor
|
| 34 |
+
from ray.exceptions import ActorUnschedulableError, RuntimeEnvSetupError
|
| 35 |
+
from ray.job_submission import JobStatus
|
| 36 |
+
from ray.runtime_env import RuntimeEnvConfig
|
| 37 |
+
from ray.util.scheduling_strategies import (
|
| 38 |
+
NodeAffinitySchedulingStrategy,
|
| 39 |
+
SchedulingStrategyT,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def generate_job_id() -> str:
    """Build a fresh submission id shaped like 'raysubmit_XYZ'.

    The 'raysubmit' prefix keeps these ids from being confused with a Ray JobID
    (driver ID). The suffix is 16 characters drawn from ASCII letters and
    digits, using the operating system's randomness source.
    """
    # Characters that are easily mistaken for one another are left out.
    confusing = "IloO0"
    alphabet = [
        char for char in string.ascii_letters + string.digits if char not in confusing
    ]
    suffix = "".join(random.SystemRandom().choices(alphabet, k=16))
    return f"raysubmit_{suffix}"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class JobManager:
|
| 60 |
+
"""Provide python APIs for job submission and management.
|
| 61 |
+
|
| 62 |
+
It does not provide persistence, all info will be lost if the cluster
|
| 63 |
+
goes down.
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
# Time that we will sleep while tailing logs if no new log line is
|
| 67 |
+
# available.
|
| 68 |
+
LOG_TAIL_SLEEP_S = 1
|
| 69 |
+
JOB_MONITOR_LOOP_PERIOD_S = 1
|
| 70 |
+
WAIT_FOR_ACTOR_DEATH_TIMEOUT_S = 0.1
|
| 71 |
+
|
| 72 |
+
def __init__(self, gcs_aio_client: GcsAioClient, logs_dir: str):
|
| 73 |
+
self._gcs_aio_client = gcs_aio_client
|
| 74 |
+
self._logs_dir = logs_dir
|
| 75 |
+
self._job_info_client = JobInfoStorageClient(gcs_aio_client, logs_dir)
|
| 76 |
+
self._gcs_address = gcs_aio_client.address
|
| 77 |
+
self._cluster_id_hex = gcs_aio_client.cluster_id.hex()
|
| 78 |
+
self._log_client = JobLogStorageClient()
|
| 79 |
+
self._supervisor_actor_cls = ray.remote(JobSupervisor)
|
| 80 |
+
self.monitored_jobs = set()
|
| 81 |
+
try:
|
| 82 |
+
self.event_logger = get_event_logger(Event.SourceType.JOBS, logs_dir)
|
| 83 |
+
except Exception:
|
| 84 |
+
self.event_logger = None
|
| 85 |
+
|
| 86 |
+
self._recover_running_jobs_event = asyncio.Event()
|
| 87 |
+
run_background_task(self._recover_running_jobs())
|
| 88 |
+
|
| 89 |
+
def _get_job_driver_logger(self, job_id: str) -> logging.Logger:
|
| 90 |
+
"""Return job driver logger to log messages to the job driver log file.
|
| 91 |
+
|
| 92 |
+
If this function is called for the first time, configure the logger.
|
| 93 |
+
"""
|
| 94 |
+
job_driver_logger = logging.getLogger(f"{__name__}.driver-{job_id}")
|
| 95 |
+
|
| 96 |
+
# Configure the logger if it's not already configured.
|
| 97 |
+
if not job_driver_logger.handlers:
|
| 98 |
+
job_driver_log_path = self._log_client.get_log_file_path(job_id)
|
| 99 |
+
job_driver_handler = logging.FileHandler(job_driver_log_path)
|
| 100 |
+
job_driver_formatter = logging.Formatter(ray_constants.LOGGER_FORMAT)
|
| 101 |
+
job_driver_handler.setFormatter(job_driver_formatter)
|
| 102 |
+
job_driver_logger.addHandler(job_driver_handler)
|
| 103 |
+
|
| 104 |
+
return job_driver_logger
|
| 105 |
+
|
| 106 |
+
async def _recover_running_jobs(self):
|
| 107 |
+
"""Recovers all running jobs from the status client.
|
| 108 |
+
|
| 109 |
+
For each job, we will spawn a coroutine to monitor it.
|
| 110 |
+
Each will be added to self._running_jobs and reconciled.
|
| 111 |
+
"""
|
| 112 |
+
try:
|
| 113 |
+
all_jobs = await self._job_info_client.get_all_jobs()
|
| 114 |
+
for job_id, job_info in all_jobs.items():
|
| 115 |
+
if not job_info.status.is_terminal():
|
| 116 |
+
run_background_task(self._monitor_job(job_id))
|
| 117 |
+
finally:
|
| 118 |
+
# This event is awaited in `submit_job` to avoid race conditions between
|
| 119 |
+
# recovery and new job submission, so it must always get set even if there
|
| 120 |
+
# are exceptions.
|
| 121 |
+
self._recover_running_jobs_event.set()
|
| 122 |
+
|
| 123 |
+
def _get_actor_for_job(self, job_id: str) -> Optional[ActorHandle]:
    """Look up the named supervisor actor of ``job_id``.

    Args:
        job_id: Submission id of the job.

    Returns:
        The actor handle, or ``None`` when the lookup raises ``ValueError``.
    """
    try:
        handle = ray.get_actor(
            JOB_ACTOR_NAME_TEMPLATE.format(job_id=job_id),
            namespace=SUPERVISOR_ACTOR_RAY_NAMESPACE,
        )
    except ValueError:
        # Ray returns ValueError for nonexistent actor.
        return None
    return handle
|
| 131 |
+
|
| 132 |
+
async def _monitor_job(
    self, job_id: str, job_supervisor: Optional[ActorHandle] = None
):
    """Monitor ``job_id`` until it reaches a terminal state.

    Needed so that an unexpected death of the JobSupervisor is detected.
    The call is a no-op when the job is already being monitored; otherwise
    the id is tracked in ``self.monitored_jobs`` for exactly as long as the
    internal monitoring loop runs.

    Args:
        job_id: Submission id of the job.
        job_supervisor: Handle of the supervisor actor, if already known.
    """
    tracked = self.monitored_jobs
    if job_id in tracked:
        logger.debug(f"Job {job_id} is already being monitored.")
        return

    tracked.add(job_id)
    try:
        await self._monitor_job_internal(job_id, job_supervisor)
    finally:
        # Always untrack, even if monitoring raised.
        tracked.remove(job_id)
|
| 149 |
+
|
| 150 |
+
async def _monitor_job_internal(
    self, job_id: str, job_supervisor: Optional[ActorHandle] = None
):
    """Poll a job and its supervisor actor until monitoring should stop.

    Each iteration reads the job status, fails the job if it has been
    PENDING longer than the start timeout, resolves the supervisor actor if
    it is not known yet, and pings it. Any exception (typically a failed
    ping) ends the loop; unless the job is already terminal it is marked
    FAILED with a message derived from the exception. The supervisor actor
    is killed at the end if a handle was obtained.

    Args:
        job_id: Submission id of the job.
        job_supervisor: Handle of the supervisor actor, if already known.
    """
    timeout = float(
        os.environ.get(
            RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR,
            DEFAULT_JOB_START_TIMEOUT_SECONDS,
        )
    )

    is_alive = True

    while is_alive:
        try:
            job_status = await self._job_info_client.get_status(job_id)
            if job_status == JobStatus.PENDING:
                # Compare the current time with the job start time.
                # If the job is still pending, we will set the status
                # to FAILED.
                job_info = await self._job_info_client.get_info(job_id)

                # `start_time` is stored in milliseconds.
                if time.time() - job_info.start_time / 1000 > timeout:
                    err_msg = (
                        "Job supervisor actor failed to start within "
                        f"{timeout} seconds. This timeout can be "
                        f"configured by setting the environment "
                        f"variable {RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR}."
                    )
                    resources_specified = (
                        (
                            job_info.entrypoint_num_cpus is not None
                            and job_info.entrypoint_num_cpus > 0
                        )
                        or (
                            job_info.entrypoint_num_gpus is not None
                            and job_info.entrypoint_num_gpus > 0
                        )
                        or (
                            job_info.entrypoint_memory is not None
                            and job_info.entrypoint_memory > 0
                        )
                        or (
                            job_info.entrypoint_resources is not None
                            and len(job_info.entrypoint_resources) > 0
                        )
                    )
                    if resources_specified:
                        err_msg += (
                            " This may be because the job entrypoint's specified "
                            "resources (entrypoint_num_cpus, entrypoint_num_gpus, "
                            "entrypoint_resources, entrypoint_memory) "
                            "aren't available on the cluster."
                            " Try checking the cluster's available resources with "
                            "`ray status` and specifying fewer resources for the "
                            "job entrypoint."
                        )
                    await self._job_info_client.put_status(
                        job_id,
                        JobStatus.FAILED,
                        message=err_msg,
                    )
                    is_alive = False
                    logger.error(err_msg)
                    continue

            if job_supervisor is None:
                job_supervisor = self._get_actor_for_job(job_id)

            if job_supervisor is None:
                if job_status == JobStatus.PENDING:
                    # Maybe the job supervisor actor is not created yet.
                    # Sleep before the next loop so we do not poll the
                    # job store back-to-back while waiting for it.
                    await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S)
                    continue
                else:
                    # The job supervisor actor is not created, but the job
                    # status is not PENDING. This means the job supervisor
                    # actor is not created due to some unexpected errors.
                    # We will set the job status to FAILED.
                    logger.error(f"Failed to get job supervisor for job {job_id}.")
                    await self._job_info_client.put_status(
                        job_id,
                        JobStatus.FAILED,
                        message=(
                            "Unexpected error occurred: "
                            "failed to get job supervisor."
                        ),
                    )
                    is_alive = False
                    continue

            await job_supervisor.ping.remote()

            await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S)
        except Exception as e:
            is_alive = False
            job_status = await self._job_info_client.get_status(job_id)
            # Stays empty when the job already reached a terminal state.
            job_error_message = ""
            if job_status is not None and job_status.is_terminal():
                # If the job is already in a terminal state, then the actor
                # exiting is expected.
                pass
            elif isinstance(e, RuntimeEnvSetupError):
                logger.info(f"Failed to set up runtime_env for job {job_id}.")
                job_error_message = f"runtime_env setup failed: {e}"
                job_status = JobStatus.FAILED
                await self._job_info_client.put_status(
                    job_id,
                    job_status,
                    message=job_error_message,
                )
            elif isinstance(e, ActorUnschedulableError):
                logger.info(
                    f"Failed to schedule job {job_id} because the supervisor actor "
                    f"could not be scheduled: {e}"
                )
                job_error_message = (
                    f"Job supervisor actor could not be scheduled: {e}"
                )
                # Keep the local status in sync with what is persisted so
                # the event log below reports FAILED, as the other
                # failure branches do.
                job_status = JobStatus.FAILED
                await self._job_info_client.put_status(
                    job_id,
                    job_status,
                    message=job_error_message,
                )
            else:
                logger.warning(
                    f"Job supervisor for job {job_id} failed unexpectedly: {e}."
                )
                job_error_message = f"Unexpected error occurred: {e}"
                job_status = JobStatus.FAILED
                await self._job_info_client.put_status(
                    job_id,
                    job_status,
                    message=job_error_message,
                )

            # Log error message to the job driver file for easy access.
            if job_error_message:
                log_path = self._log_client.get_log_file_path(job_id)
                os.makedirs(os.path.dirname(log_path), exist_ok=True)
                with open(log_path, "a") as log_file:
                    log_file.write(job_error_message)

            # Log events
            if self.event_logger:
                event_log = (
                    f"Completed a ray job {job_id} with a status {job_status}."
                )
                if job_error_message:
                    event_log += f" {job_error_message}"
                    self.event_logger.error(event_log, submission_id=job_id)
                else:
                    self.event_logger.info(event_log, submission_id=job_id)

    # Kill the actor defensively to avoid leaking actors in unexpected error cases.
    if job_supervisor is not None:
        ray.kill(job_supervisor, no_restart=True)
|
| 313 |
+
|
| 314 |
+
def _handle_supervisor_startup(self, job_id: str, result: Optional[Exception]):
|
| 315 |
+
"""Handle the result of starting a job supervisor actor.
|
| 316 |
+
|
| 317 |
+
If started successfully, result should be None. Otherwise it should be
|
| 318 |
+
an Exception.
|
| 319 |
+
|
| 320 |
+
On failure, the job will be marked failed with a relevant error
|
| 321 |
+
message.
|
| 322 |
+
"""
|
| 323 |
+
if result is None:
|
| 324 |
+
return
|
| 325 |
+
|
| 326 |
+
def _get_supervisor_runtime_env(
    self,
    user_runtime_env: Dict[str, Any],
    submission_id: str,
    resources_specified: bool = False,
) -> Dict[str, Any]:
    """Build the runtime_env used to launch the supervisor actor.

    The user's runtime_env is deep-copied, never mutated.

    Args:
        user_runtime_env: The runtime_env specified by the user (may be
            ``None``).
        submission_id: Submission id of the job; used to locate the job's
            driver log file when runtime_env log streaming is enabled.
        resources_specified: Whether the user specified resources in the
            submit_job() call. If so, we will skip the workaround introduced
            in #24546 for GPU detection and just use the user's resource
            requests, so that the behavior matches that of the user
            specifying resources for any other actor.

    Returns:
        The runtime_env for the supervisor actor.
    """
    if user_runtime_env is None:
        supervisor_env = {}
    else:
        supervisor_env = copy.deepcopy(user_runtime_env)

    # A `.get("env_vars", {})` default is not enough here: the key may be
    # present but explicitly set to `None`.
    supervisor_env_vars = supervisor_env.get("env_vars")
    if supervisor_env_vars is None:
        supervisor_env_vars = {}

    supervisor_env_vars[ray_constants.RAY_WORKER_NICENESS] = "0"

    if not resources_specified:
        # Don't set CUDA_VISIBLE_DEVICES for the supervisor actor so the
        # driver can use GPUs if it wants to. This will be removed from
        # the driver's runtime_env so it isn't inherited by tasks & actors.
        supervisor_env_vars[ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR] = "1"
    supervisor_env["env_vars"] = supervisor_env_vars

    stream_setup_logs = (
        os.getenv(RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR, "0") == "1"
    )
    if stream_setup_logs:
        env_config = supervisor_env.get("config")
        # Empty fields may be set to None, so check for None explicitly.
        if env_config is None:
            env_config = RuntimeEnvConfig()
        env_config["log_files"] = [
            self._log_client.get_log_file_path(submission_id)
        ]
        supervisor_env["config"] = env_config
    return supervisor_env
|
| 373 |
+
|
| 374 |
+
async def _get_scheduling_strategy(
    self, resources_specified: bool
) -> SchedulingStrategyT:
    """Pick the scheduling strategy for the job's supervisor actor.

    Ray's default actor placement ("DEFAULT") is used when resources were
    specified, when the allow-driver-on-worker-nodes environment variable
    is "1", or when no head node id can be found in GCS. Otherwise the job
    is pinned to the head node with a hard node-affinity strategy.

    Args:
        resources_specified: Whether the job specified any resources
            (CPUs, GPUs, or custom resources).

    Returns:
        The scheduling strategy to use for the job.
    """
    if resources_specified:
        return "DEFAULT"

    allow_on_workers = (
        os.environ.get(RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR, "0") == "1"
    )
    if allow_on_workers:
        logger.info(
            f"{RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR} was set to 1. "
            "Using Ray's default actor scheduling strategy for the job "
            "driver instead of running it on the head node."
        )
        return "DEFAULT"

    # Neither resources nor the worker-node opt-in were given, so the
    # driver should run on the head node if we can identify it.
    head_node_id = await get_head_node_id(self._gcs_aio_client)
    if head_node_id is None:
        logger.info(
            "Head node ID not found in GCS. Using Ray's default actor "
            "scheduling strategy for the job driver instead of running "
            "it on the head node."
        )
        return "DEFAULT"

    logger.info(
        "Head node ID found in GCS; scheduling job driver on "
        f"head node {head_node_id}"
    )
    return NodeAffinitySchedulingStrategy(node_id=head_node_id, soft=False)
|
| 421 |
+
|
| 422 |
+
async def submit_job(
    self,
    *,
    entrypoint: str,
    submission_id: Optional[str] = None,
    runtime_env: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, str]] = None,
    entrypoint_num_cpus: Optional[Union[int, float]] = None,
    entrypoint_num_gpus: Optional[Union[int, float]] = None,
    entrypoint_memory: Optional[int] = None,
    entrypoint_resources: Optional[Dict[str, float]] = None,
    _start_signal_actor: Optional[ActorHandle] = None,
) -> str:
    """Register a job and launch its supervisor actor.

    Job execution happens asynchronously:

    1) The job is stored with PENDING status under ``submission_id``
       (a fresh id is generated when none is given).
    2) A detached supervisor actor is created with a runtime_env derived
       from the job's runtime_env, and a background monitor is started.

    Actual setting up runtime_env, subprocess group, driver command
    execution, subprocess cleaning up and running status update to GCS
    is all handled by job supervisor actor.

    Failures while starting the supervisor are not raised: they are logged
    to the job driver log and the job is marked FAILED.

    Args:
        entrypoint: Driver command to execute in subprocess shell.
            Represents the entrypoint to start user application.
        submission_id: Id to register the job under. Generated when
            ``None``.
        runtime_env: Runtime environment used to execute driver command,
            which could contain its own ray.init() to configure runtime
            env at ray cluster, task and actor level.
        metadata: Support passing arbitrary data to driver command in
            case needed.
        entrypoint_num_cpus: The quantity of CPU cores to reserve for the
            execution of the entrypoint command, separately from any tasks
            or actors launched by it. Defaults to 0.
        entrypoint_num_gpus: The quantity of GPUs to reserve for the
            entrypoint command, separately from any tasks or actors
            launched by it. Defaults to 0.
        entrypoint_memory: The amount of memory to reserve for the
            entrypoint command, separately from any tasks or actors
            launched by it. Defaults to 0.
        entrypoint_resources: The quantity of various custom resources
            to reserve for the entrypoint command, separately from any
            tasks or actors launched by it.
        _start_signal_actor: Used in testing only to capture state
            transitions between PENDING -> RUNNING. Regular user shouldn't
            need this.

    Returns:
        The submission id, for further job management. Only valid within
        the same ray cluster.

    Raises:
        ValueError: If a job with ``submission_id`` already exists.
    """
    entrypoint_num_cpus = 0 if entrypoint_num_cpus is None else entrypoint_num_cpus
    entrypoint_num_gpus = 0 if entrypoint_num_gpus is None else entrypoint_num_gpus
    entrypoint_memory = 0 if entrypoint_memory is None else entrypoint_memory
    if submission_id is None:
        submission_id = generate_job_id()

    # Wait for `_recover_running_jobs` to run before accepting submissions to
    # avoid duplicate monitoring of the same job.
    await self._recover_running_jobs_event.wait()

    logger.info(f"Starting job with submission_id: {submission_id}")
    pending_info = JobInfo(
        entrypoint=entrypoint,
        status=JobStatus.PENDING,
        start_time=int(time.time() * 1000),
        metadata=metadata,
        runtime_env=runtime_env,
        entrypoint_num_cpus=entrypoint_num_cpus,
        entrypoint_num_gpus=entrypoint_num_gpus,
        entrypoint_memory=entrypoint_memory,
        entrypoint_resources=entrypoint_resources,
    )
    inserted = await self._job_info_client.put_info(
        submission_id, pending_info, overwrite=False
    )
    if not inserted:
        raise ValueError(
            f"Job with submission_id {submission_id} already exists. "
            "Please use a different submission_id."
        )

    driver_logger = self._get_job_driver_logger(submission_id)
    # The actor starts up asynchronously, so this call returns immediately;
    # errors raised while launching it are caught below.
    try:
        resource_flags = [
            entrypoint_num_cpus is not None and entrypoint_num_cpus > 0,
            entrypoint_num_gpus is not None and entrypoint_num_gpus > 0,
            entrypoint_memory is not None and entrypoint_memory > 0,
            entrypoint_resources not in [None, {}],
        ]
        resources_specified = any(resource_flags)
        scheduling_strategy = await self._get_scheduling_strategy(
            resources_specified
        )
        if self.event_logger:
            self.event_logger.info(
                f"Started a ray job {submission_id}.", submission_id=submission_id
            )

        driver_logger.info("Runtime env is setting up.")
        actor_options = dict(
            lifetime="detached",
            name=JOB_ACTOR_NAME_TEMPLATE.format(job_id=submission_id),
            num_cpus=entrypoint_num_cpus,
            num_gpus=entrypoint_num_gpus,
            memory=entrypoint_memory,
            resources=entrypoint_resources,
            scheduling_strategy=scheduling_strategy,
            runtime_env=self._get_supervisor_runtime_env(
                runtime_env, submission_id, resources_specified
            ),
            namespace=SUPERVISOR_ACTOR_RAY_NAMESPACE,
        )
        supervisor = self._supervisor_actor_cls.options(**actor_options).remote(
            submission_id,
            entrypoint,
            metadata or {},
            self._gcs_address,
            self._cluster_id_hex,
            self._logs_dir,
        )
        supervisor.run.remote(
            _start_signal_actor=_start_signal_actor,
            resources_specified=resources_specified,
        )

        # Monitor the job in the background so we can detect errors without
        # requiring a client to poll.
        run_background_task(
            self._monitor_job(submission_id, job_supervisor=supervisor)
        )
    except Exception as e:
        tb_str = traceback.format_exc()
        driver_logger.warning(
            f"Failed to start supervisor actor for job {submission_id}: '{e}'"
            f". Full traceback:\n{tb_str}"
        )
        await self._job_info_client.put_status(
            submission_id,
            JobStatus.FAILED,
            message=(
                f"Failed to start supervisor actor {submission_id}: '{e}'"
                f". Full traceback:\n{tb_str}"
            ),
        )
    finally:
        close_logger_file_descriptor(driver_logger)

    return submission_id
|
| 579 |
+
|
| 580 |
+
def stop_job(self, job_id) -> bool:
    """Ask a job to exit without waiting for the outcome.

    Args:
        job_id: Submission id of the job.

    Returns:
        ``True`` if a supervisor actor was found and signalled, ``False``
        if no supervisor actor exists for the job.
    """
    supervisor = self._get_actor_for_job(job_id)
    if supervisor is None:
        return False

    # Actor is still alive, signal it to stop the driver, fire and forget.
    supervisor.stop.remote()
    return True
|
| 593 |
+
|
| 594 |
+
async def delete_job(self, job_id):
    """Delete a job's info and metadata from the cluster.

    Args:
        job_id: Submission id of the job.

    Returns:
        ``True`` once the job info has been deleted.

    Raises:
        RuntimeError: If the job has no stored status or its status is not
            terminal.
    """
    status = await self._job_info_client.get_status(job_id)

    deletable = status is not None and status.is_terminal()
    if not deletable:
        raise RuntimeError(
            f"Attempted to delete job '{job_id}', "
            f"but it is in a non-terminal state {status}."
        )

    await self._job_info_client.delete_info(job_id)
    return True
|
| 606 |
+
|
| 607 |
+
def job_info_client(self) -> JobInfoStorageClient:
    """Return the job info storage client used by this manager."""
    info_client = self._job_info_client
    return info_client
|
| 609 |
+
|
| 610 |
+
async def get_job_status(self, job_id: str) -> Optional[JobStatus]:
    """Fetch the current status of ``job_id`` from the job info client."""
    status = await self._job_info_client.get_status(job_id)
    return status
|
| 613 |
+
|
| 614 |
+
async def get_job_info(self, job_id: str) -> Optional[JobInfo]:
    """Fetch the current info record of ``job_id`` from the job info client."""
    info = await self._job_info_client.get_info(job_id)
    return info
|
| 617 |
+
|
| 618 |
+
async def list_jobs(self) -> Dict[str, JobInfo]:
    """Fetch the info records of all jobs from the job info client."""
    all_jobs = await self._job_info_client.get_all_jobs()
    return all_jobs
|
| 621 |
+
|
| 622 |
+
def get_job_logs(self, job_id: str) -> str:
    """Return everything the log client has stored for ``job_id``."""
    logs = self._log_client.get_logs(job_id)
    return logs
|
| 625 |
+
|
| 626 |
+
async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
    """Follow the logs of a job, yielding each new batch as one string.

    Tailing stops once the log client reports no new lines and the job is
    in a terminal state, or the job's status record no longer exists.

    Args:
        job_id: Submission id of the job.

    Yields:
        The concatenation of each batch of new log lines.

    Raises:
        RuntimeError: If the job has no status when tailing starts.
    """
    if await self.get_job_status(job_id) is None:
        raise RuntimeError(f"Job '{job_id}' does not exist.")

    async for lines in self._log_client.tail_logs(job_id):
        if lines is not None:
            yield "".join(lines)
            continue

        # No new lines: return if the job has exited. A missing status
        # means the job record disappeared while we were tailing, so there
        # is nothing left to follow either.
        status = await self.get_job_status(job_id)
        if status is None or status.is_terminal():
            return

        await asyncio.sleep(self.LOG_TAIL_SLEEP_S)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_supervisor.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import signal
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
import traceback
|
| 9 |
+
from asyncio.tasks import FIRST_COMPLETED
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
import ray
|
| 13 |
+
import ray._private.ray_constants as ray_constants
|
| 14 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 15 |
+
from ray._private.ray_logging.filters import CoreContextFilter
|
| 16 |
+
from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter
|
| 17 |
+
from ray._private.runtime_env.constants import RAY_JOB_CONFIG_JSON_ENV_VAR
|
| 18 |
+
from ray._private.utils import remove_ray_internal_flags_from_env
|
| 19 |
+
from ray.actor import ActorHandle
|
| 20 |
+
from ray.dashboard.modules.job.common import (
|
| 21 |
+
JOB_ID_METADATA_KEY,
|
| 22 |
+
JOB_NAME_METADATA_KEY,
|
| 23 |
+
JobInfoStorageClient,
|
| 24 |
+
)
|
| 25 |
+
from ray.dashboard.modules.job.job_log_storage_client import JobLogStorageClient
|
| 26 |
+
from ray.job_submission import JobStatus
|
| 27 |
+
|
| 28 |
+
import psutil
|
| 29 |
+
|
| 30 |
+
# asyncio python version compatibility
|
| 31 |
+
# asyncio python version compatibility: fall back to `ensure_future` when
# `asyncio.create_task` is not available.
create_task = getattr(asyncio, "create_task", None)
if create_task is None:
    create_task = asyncio.ensure_future

# Windows requires additional packages for proper process control. They are
# optional: when missing, the names are bound to None and a warning is logged.
if sys.platform == "win32":
    try:
        import win32api
        import win32con
        import win32job
    except ImportError as e:  # also covers ModuleNotFoundError
        win32api = None
        win32con = None
        win32job = None

        logger = logging.getLogger(__name__)
        logger.warning(
            "Failed to Import win32api. For best usage experience run "
            f"'conda install pywin32'. Import error: {e}"
        )
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class JobSupervisor:
|
| 55 |
+
"""
|
| 56 |
+
Ray actor created by JobManager for each submitted job, responsible to
|
| 57 |
+
setup runtime_env, execute given shell command in subprocess, update job
|
| 58 |
+
status, persist job logs and manage subprocess group cleaning.
|
| 59 |
+
|
| 60 |
+
One job supervisor actor maps to one subprocess, for one job_id.
|
| 61 |
+
Job supervisor actor should fate share with subprocess it created.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
DEFAULT_RAY_JOB_STOP_WAIT_TIME_S = 3
|
| 65 |
+
SUBPROCESS_POLL_PERIOD_S = 0.1
|
| 66 |
+
VALID_STOP_SIGNALS = ["SIGINT", "SIGTERM"]
|
| 67 |
+
|
| 68 |
+
def __init__(
    self,
    job_id: str,
    entrypoint: str,
    user_metadata: Dict[str, str],
    gcs_address: str,
    cluster_id_hex: str,
    logs_dir: Optional[str] = None,
):
    """Create the supervisor state for a single job.

    Args:
        job_id: Submission id of the job this supervisor owns.
        entrypoint: Shell command that starts the job driver.
        user_metadata: User-supplied metadata; overrides the default
            job id / job name entries on key collision.
        gcs_address: Address used to build this actor's own GCS client.
        cluster_id_hex: Cluster id passed to the GCS client.
        logs_dir: Directory passed to the job info storage client.
    """
    self._job_id = job_id
    gcs_aio_client = GcsAioClient(address=gcs_address, cluster_id=cluster_id_hex)
    self._job_info_client = JobInfoStorageClient(gcs_aio_client, logs_dir)
    self._log_client = JobLogStorageClient()
    self._entrypoint = entrypoint

    # Start from the default metadata, then let user entries take precedence.
    metadata = {JOB_ID_METADATA_KEY: job_id, JOB_NAME_METADATA_KEY: job_id}
    metadata.update(user_metadata)
    self._metadata = metadata

    # Event used to signal that a job should be stopped.
    self._stop_event = asyncio.Event()

    # Windows Job Object used to handle stopping the child processes.
    self._win32_job_object = None

    # Per-job logger so JobSupervisor logs are persisted in a separate file.
    self._logger = logging.getLogger(f"{__name__}.supervisor-{job_id}")
    self._configure_logger()
|
| 97 |
+
|
| 98 |
+
def _configure_logger(self) -> None:
    """Attach console and file handlers to ``self._logger``.

    The file handler writes to ``jobs/supervisor-<job id>.log`` under the
    node's logs directory (created if missing). Both handlers share one
    formatter: JSON when the backend JSON-logging environment flag is set,
    text otherwise. Propagation to parent loggers is disabled.
    """
    log_path = os.path.join(
        ray._private.worker._global_node.get_logs_dir_path(),
        f"jobs/supervisor-{self._job_id}.log",
    )
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    self._logger.addFilter(CoreContextFilter())

    stream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_path)

    use_json = ray_constants.env_bool(
        ray_constants.RAY_BACKEND_LOG_JSON_ENV_VAR, False
    )
    formatter = JSONFormatter() if use_json else TextFormatter()

    for handler in (stream_handler, file_handler):
        handler.setFormatter(formatter)
        self._logger.addHandler(handler)
    self._logger.propagate = False
|
| 119 |
+
|
| 120 |
+
def _get_driver_runtime_env(
    self, resources_specified: bool = False
) -> Dict[str, Any]:
    """Get the runtime env that should be set in the job driver.

    Starts from the runtime_env of the current (supervisor) runtime context
    and, unless resources were specified, strips the supervisor-only
    environment variables so they are not inherited by the driver.

    Args:
        resources_specified: Whether the user specified resources (CPUs, GPUs,
            custom resources) in the submit_job request. If so, we will skip
            the workaround for GPU detection introduced in #24546, so that the
            behavior matches that of the user specifying resources for any
            other actor.

    Returns:
        The runtime env that should be set in the job driver.
    """
    # Get the runtime_env set for the supervisor actor.
    curr_runtime_env = dict(ray.get_runtime_context().runtime_env)
    if resources_specified:
        return curr_runtime_env

    # Work on a copy so the nested dict of the source runtime_env is not
    # mutated, and tolerate `env_vars` being absent or explicitly None.
    env_vars = dict(curr_runtime_env.get("env_vars") or {})
    # Allow CUDA_VISIBLE_DEVICES to be set normally for the driver's tasks
    # & actors. A default is passed to `pop` so a missing marker variable
    # does not raise KeyError.
    env_vars.pop(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR, None)
    env_vars.pop(ray_constants.RAY_WORKER_NICENESS, None)
    curr_runtime_env["env_vars"] = env_vars
    return curr_runtime_env
|
| 146 |
+
|
| 147 |
+
def ping(self):
    """Health-check hook: does nothing and returns ``None``.

    Callers invoke it remotely; a call that completes shows the actor is
    still responsive.
    """
    return None
|
| 150 |
+
|
| 151 |
+
def _exec_entrypoint(self, env: dict, logs_path: str) -> subprocess.Popen:
    """
    Runs the entrypoint command as a child process, streaming stderr &
    stdout to given log files.

    Unix systems:
    Meanwhile we start a daemon process and group driver
    subprocess in same pgid, such that if job actor dies, entire process
    group also fate share with it.

    Windows systems:
    A jobObject is created to enable fate sharing for the entire process group.

    Args:
        env: Environment variables passed to the child process (used as the
            ``env`` argument of ``subprocess.Popen``).
        logs_path: File path on head node's local disk to store driver
            command's stdout & stderr.
    Returns:
        child_process: Child process that runs the driver command. Can be
            terminated or killed upon user calling stop().
    """
    # Open in append mode to avoid overwriting runtime_env setup logs for the
    # supervisor actor, which are also written to the same file.
    with open(logs_path, "a") as logs_file:
        child_process = subprocess.Popen(
            self._entrypoint,
            shell=True,
            start_new_session=True,
            stdout=logs_file,
            stderr=subprocess.STDOUT,
            env=env,
            # Ray intentionally blocks SIGINT in all processes, so if the user wants
            # to stop job through SIGINT, we need to unblock it in the child process
            preexec_fn=(
                (
                    lambda: signal.pthread_sigmask(
                        signal.SIG_UNBLOCK, {signal.SIGINT}
                    )
                )
                if sys.platform != "win32"
                and os.environ.get("RAY_JOB_STOP_SIGNAL") == "SIGINT"
                else None
            ),
        )
        parent_pid = os.getpid()
        child_pid = child_process.pid
        # Create new pgid with new subprocess to execute driver command

        if sys.platform != "win32":
            try:
                child_pgid = os.getpgid(child_pid)
            except ProcessLookupError:
                # Process died before we could get its pgid.
                return child_process

            # Open a new subprocess to kill the child process when the parent
            # process dies. `kill -s 0 parent_pid` will succeed if the parent is
            # alive. If it fails, SIGKILL the child process group and exit.
            # The watchdog's Popen handle is intentionally not kept.
            subprocess.Popen(
                f"while kill -s 0 {parent_pid}; do sleep 1; done; kill -9 -{child_pgid}",  # noqa: E501
                shell=True,
                # Suppress output
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

        elif sys.platform == "win32" and win32api:
            # Create a JobObject to which the child process (and its children)
            # will be connected. This job object can be used to kill the child
            # processes explicitly or when the jobObject gets deleted during
            # garbage collection.
            self._win32_job_object = win32job.CreateJobObject(None, "")
            win32_job_info = win32job.QueryInformationJobObject(
                self._win32_job_object, win32job.JobObjectExtendedLimitInformation
            )
            # Ask Windows to kill every process in the job when the job handle
            # is closed.
            win32_job_info["BasicLimitInformation"][
                "LimitFlags"
            ] = win32job.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
            win32job.SetInformationJobObject(
                self._win32_job_object,
                win32job.JobObjectExtendedLimitInformation,
                win32_job_info,
            )
            child_handle = win32api.OpenProcess(
                win32con.PROCESS_TERMINATE | win32con.PROCESS_SET_QUOTA,
                False,
                child_pid,
            )
            win32job.AssignProcessToJobObject(self._win32_job_object, child_handle)

        return child_process
|
| 241 |
+
|
| 242 |
+
def _get_driver_env_vars(self, resources_specified: bool) -> Dict[str, str]:
    """Build the environment variables that should be set in the driver.

    Args:
        resources_specified: Forwarded to ``_get_driver_runtime_env`` when
            computing the runtime env embedded in the job config.

    Returns:
        A mapping with the serialized job config, the resolved RAY_ADDRESS
        and PYTHONUNBUFFERED.
    """
    address_var = ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE
    # RAY_ADDRESS may be the dashboard URL but not the gcs address,
    # so when the environment variable is not empty, we force set RAY_ADDRESS
    # to "auto" to avoid function `canonicalize_bootstrap_address_or_die` returning
    # the wrong GCS address.
    # TODO(Jialing He, Archit Kulkarni): Definition of Specification RAY_ADDRESS
    if address_var in os.environ:
        os.environ[address_var] = "auto"

    resolve_address = ray._private.services.canonicalize_bootstrap_address_or_die
    ray_addr = resolve_address("auto", ray.worker._global_node._ray_params.temp_dir)
    assert ray_addr is not None

    # JobConfig for the child process (runtime_env, metadata).
    job_config = {
        "runtime_env": self._get_driver_runtime_env(resources_specified),
        "metadata": self._metadata,
    }
    return {
        RAY_JOB_CONFIG_JSON_ENV_VAR: json.dumps(job_config),
        # Always set RAY_ADDRESS as find_bootstrap_address address for
        # job submission. In case of local development, prevent user from
        # re-using http://{address}:{dashboard_port} to interact with
        # jobs SDK.
        # TODO:(mwtian) Check why "auto" does not work in entrypoint script
        address_var: ray_addr,
        # Set PYTHONUNBUFFERED=1 to stream logs during the job instead of
        # only streaming them upon completion of the job.
        "PYTHONUNBUFFERED": "1",
    }
|
| 273 |
+
|
| 274 |
+
async def _polling(self, child_process: subprocess.Popen) -> int:
|
| 275 |
+
while child_process is not None:
|
| 276 |
+
return_code = child_process.poll()
|
| 277 |
+
if return_code is not None:
|
| 278 |
+
# subprocess finished with return code
|
| 279 |
+
return return_code
|
| 280 |
+
else:
|
| 281 |
+
# still running, yield control, 0.1s by default
|
| 282 |
+
await asyncio.sleep(self.SUBPROCESS_POLL_PERIOD_S)
|
| 283 |
+
|
| 284 |
+
async def _poll_all(self, processes: List[psutil.Process]):
    """Return once none of ``processes`` is reported alive.

    Checks with a zero-timeout ``psutil.wait_procs`` call and sleeps
    ``self.SUBPROCESS_POLL_PERIOD_S`` seconds between checks.
    """
    while True:
        _, still_running = psutil.wait_procs(processes, timeout=0)
        if not still_running:
            return
        await asyncio.sleep(self.SUBPROCESS_POLL_PERIOD_S)
|
| 292 |
+
|
| 293 |
+
def _kill_processes(self, processes: List[psutil.Process], sig: signal.Signals):
    """Send ``sig`` to each process, skipping any that no longer exist."""
    for pid in (proc.pid for proc in processes):
        try:
            os.kill(pid, sig)
        except ProcessLookupError:
            # Process is already dead
            continue
|
| 301 |
+
|
| 302 |
+
async def run(
    self,
    # Signal actor used in testing to capture PENDING -> RUNNING cases
    _start_signal_actor: Optional[ActorHandle] = None,
    resources_specified: bool = False,
):
    """Run the job entrypoint and record its final status.

    Stop and start both happen asynchronously, coordinated by asyncio event
    and coroutine, respectively.

    1) Sets job status as running
    2) Pass runtime env and metadata to subprocess as serialized env
       variables.
    3) Handle concurrent events of driver execution and stop requests:
       waits until either the entrypoint exits or the stop event is set,
       then records STOPPED, SUCCEEDED or FAILED accordingly.

    Args:
        _start_signal_actor: Optional actor used in tests; when given, the
            job stays PENDING until its ``wait`` call returns.
        resources_specified: Forwarded to ``_get_driver_env_vars``.

    Raises:
        RuntimeError: If the job info cannot be retrieved, or the job is not
            in PENDING state when this is called. Exceptions raised after
            the job is marked RUNNING are logged and recorded as FAILED
            instead of propagating.
    """
    curr_info = await self._job_info_client.get_info(self._job_id)
    if curr_info is None:
        raise RuntimeError(f"Status could not be retrieved for job {self._job_id}.")
    curr_status = curr_info.status
    curr_message = curr_info.message
    if curr_status == JobStatus.RUNNING:
        raise RuntimeError(
            f"Job {self._job_id} is already in RUNNING state. "
            f"JobSupervisor.run() should only be called once. "
        )
    if curr_status != JobStatus.PENDING:
        raise RuntimeError(
            f"Job {self._job_id} is not in PENDING state. "
            f"Current status is {curr_status} with message {curr_message}."
        )

    if _start_signal_actor:
        # Block in PENDING state until start signal received.
        await _start_signal_actor.wait.remote()

    driver_agent_http_address = (
        "http://"
        f"{ray.worker.global_worker.node.node_ip_address}:"
        f"{ray.worker.global_worker.node.dashboard_agent_listen_port}"
    )
    driver_node_id = ray.get_runtime_context().get_node_id()

    await self._job_info_client.put_status(
        self._job_id,
        JobStatus.RUNNING,
        jobinfo_replace_kwargs={
            "driver_agent_http_address": driver_agent_http_address,
            "driver_node_id": driver_node_id,
        },
    )

    try:
        # Configure environment variables for the child process.
        env = os.environ.copy()
        # Remove internal Ray flags. They are present because JobSupervisor
        # itself is a Ray worker process but we don't want to pass them to
        # the driver.
        remove_ray_internal_flags_from_env(env)
        # These will *not* be set in the runtime_env, so they apply to the driver
        # only, not its tasks & actors.
        env.update(self._get_driver_env_vars(resources_specified))

        self._logger.info(
            "Submitting job with RAY_ADDRESS = "
            f"{env[ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE]}"
        )
        log_path = self._log_client.get_log_file_path(self._job_id)
        child_process = self._exec_entrypoint(env, log_path)
        child_pid = child_process.pid

        # Race the entrypoint's exit against a stop request.
        polling_task = create_task(self._polling(child_process))
        finished, _ = await asyncio.wait(
            [polling_task, create_task(self._stop_event.wait())],
            return_when=FIRST_COMPLETED,
        )

        if self._stop_event.is_set():
            polling_task.cancel()
            if sys.platform == "win32" and self._win32_job_object:
                win32job.TerminateJobObject(self._win32_job_object, -1)
            elif sys.platform != "win32":
                stop_signal = os.environ.get("RAY_JOB_STOP_SIGNAL", "SIGTERM")
                if stop_signal not in self.VALID_STOP_SIGNALS:
                    self._logger.warning(
                        f"{stop_signal} not a valid stop signal. Terminating "
                        "job with SIGTERM."
                    )
                    stop_signal = "SIGTERM"

                # Signal the entrypoint process and all of its descendants.
                job_process = psutil.Process(child_pid)
                proc_to_kill = [job_process] + job_process.children(recursive=True)

                # Send stop signal and wait for job to terminate gracefully,
                # otherwise SIGKILL job forcefully after timeout.
                self._kill_processes(proc_to_kill, getattr(signal, stop_signal))
                try:
                    stop_job_wait_time = int(
                        os.environ.get(
                            "RAY_JOB_STOP_WAIT_TIME_S",
                            self.DEFAULT_RAY_JOB_STOP_WAIT_TIME_S,
                        )
                    )
                    poll_job_stop_task = create_task(self._poll_all(proc_to_kill))
                    await asyncio.wait_for(poll_job_stop_task, stop_job_wait_time)
                    self._logger.info(
                        f"Job {self._job_id} has been terminated gracefully "
                        f"with {stop_signal}."
                    )
                except asyncio.TimeoutError:
                    self._logger.warning(
                        f"Attempt to gracefully terminate job {self._job_id} "
                        f"through {stop_signal} has timed out after "
                        f"{stop_job_wait_time} seconds. Job is now being "
                        "force-killed with SIGKILL."
                    )
                    self._kill_processes(proc_to_kill, signal.SIGKILL)

            await self._job_info_client.put_status(self._job_id, JobStatus.STOPPED)
        else:
            # Child process finished execution and no stop event is set
            # at the same time
            assert len(finished) == 1, "Should have only one coroutine done"
            [child_process_task] = finished
            return_code = child_process_task.result()
            self._logger.info(
                f"Job {self._job_id} entrypoint command "
                f"exited with code {return_code}"
            )
            if return_code == 0:
                await self._job_info_client.put_status(
                    self._job_id,
                    JobStatus.SUCCEEDED,
                    driver_exit_code=return_code,
                )
            else:
                # Attach the tail of the driver log to the failure message
                # when one is available.
                log_tail = await self._log_client.get_last_n_log_lines(self._job_id)
                if log_tail is not None and log_tail != "":
                    message = (
                        "Job entrypoint command "
                        f"failed with exit code {return_code}, "
                        "last available logs (truncated to 20,000 chars):\n"
                        + log_tail
                    )
                else:
                    message = (
                        "Job entrypoint command "
                        f"failed with exit code {return_code}. No logs available."
                    )
                await self._job_info_client.put_status(
                    self._job_id,
                    JobStatus.FAILED,
                    message=message,
                    driver_exit_code=return_code,
                )
    except Exception:
        self._logger.error(
            "Got unexpected exception while trying to execute driver "
            f"command. {traceback.format_exc()}"
        )
        try:
            await self._job_info_client.put_status(
                self._job_id,
                JobStatus.FAILED,
                message=traceback.format_exc(),
            )
        except Exception:
            # Best effort: the status update itself failed, so only log it.
            self._logger.error(
                "Failed to update job status to FAILED. "
                f"Exception: {traceback.format_exc()}"
            )
    finally:
        # clean up actor after tasks are finished
        ray.actor.exit_actor()
|
| 474 |
+
|
| 475 |
+
def stop(self):
    """Set stop_event and let run() handle the rest in its asyncio.wait()."""
    self._stop_event.set()
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/pydantic_models.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
from typing import Any, Dict, Optional
|
| 3 |
+
|
| 4 |
+
from ray._private.pydantic_compat import PYDANTIC_INSTALLED, BaseModel, Field
|
| 5 |
+
from ray.dashboard.modules.job.common import JobStatus
|
| 6 |
+
from ray.util.annotations import PublicAPI
|
| 7 |
+
|
| 8 |
+
# Pydantic is not part of the minimal Ray installation.
|
| 9 |
+
# The models below are only defined when pydantic is importable; otherwise the
# names are bound to None so that importing this module still succeeds.
if PYDANTIC_INSTALLED:

    @PublicAPI(stability="beta")
    class DriverInfo(BaseModel):
        """A class for recording information about the driver related to the job."""

        id: str = Field(..., description="The id of the driver")
        node_ip_address: str = Field(
            ..., description="The IP address of the node the driver is running on."
        )
        pid: str = Field(
            ..., description="The PID of the worker process the driver is using."
        )
        # TODO(aguo): Add node_id as a field.

    @PublicAPI(stability="beta")
    class JobType(str, Enum):
        """An enumeration for describing the different job types.

        NOTE:
            This field is still experimental and may change in the future.
        """

        #: A job that was initiated by the Ray Jobs API.
        SUBMISSION = "SUBMISSION"
        #: A job that was initiated by a driver script.
        DRIVER = "DRIVER"

    @PublicAPI(stability="beta")
    class JobDetails(BaseModel):
        """
        Job data with extra details about its driver and its submission.
        """

        type: JobType = Field(..., description="The type of job.")
        job_id: Optional[str] = Field(
            None,
            description="The job ID. An ID that is created for every job that is "
            "launched in Ray. This can be used to fetch data about jobs using Ray "
            "Core APIs.",
        )
        # The first fragment must end with a space: adjacent string literals are
        # concatenated verbatim ("via" + "the" would read "viathe").
        submission_id: Optional[str] = Field(
            None,
            description="A submission ID is an ID created for every job submitted via "
            "the Ray Jobs API. It can "
            "be used to fetch data about jobs using the Ray Jobs API.",
        )
        driver_info: Optional[DriverInfo] = Field(
            None,
            description="The driver related to this job. For jobs submitted via "
            "the Ray Jobs API, "
            "it is the last driver launched by that job submission, "
            "or None if there is no driver.",
        )

        # The following fields are copied from JobInfo.
        # TODO(aguo): Inherit from JobInfo once it's migrated to pydantic.
        status: JobStatus = Field(..., description="The status of the job.")
        entrypoint: str = Field(..., description="The entrypoint command for this job.")
        message: Optional[str] = Field(
            None, description="A message describing the status in more detail."
        )
        error_type: Optional[str] = Field(
            None, description="Internal error or user script error."
        )
        start_time: Optional[int] = Field(
            None,
            description="The time when the job was started. A Unix timestamp in ms.",
        )
        end_time: Optional[int] = Field(
            None,
            description="The time when the job moved into a terminal state. "
            "A Unix timestamp in ms.",
        )
        metadata: Optional[Dict[str, str]] = Field(
            None, description="Arbitrary user-provided metadata for the job."
        )
        runtime_env: Optional[Dict[str, Any]] = Field(
            None, description="The runtime environment for the job."
        )
        # Information about the node the driver is running on:
        # - driver_agent_http_address: this node's agent http address
        # - driver_node_id: this node's id.
        driver_agent_http_address: Optional[str] = Field(
            None,
            description="The HTTP address of the JobAgent on the node the job "
            "entrypoint command is running on.",
        )
        driver_node_id: Optional[str] = Field(
            None,
            description="The ID of the node the job entrypoint command is running on.",
        )
        driver_exit_code: Optional[int] = Field(
            None,
            description="The driver process exit code after the driver executed. "
            "Return None if driver doesn't finish executing.",
        )

else:
    DriverInfo = None
    JobType = None
    JobDetails = None
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/sdk.py
ADDED
|
@@ -0,0 +1,492 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Any, AsyncIterator, Dict, List, Optional, Union
|
| 4 |
+
|
| 5 |
+
import packaging.version
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray.dashboard.modules.dashboard_sdk import SubmissionClient
|
| 9 |
+
from ray.dashboard.modules.job.common import (
|
| 10 |
+
JobDeleteResponse,
|
| 11 |
+
JobLogsResponse,
|
| 12 |
+
JobStatus,
|
| 13 |
+
JobStopResponse,
|
| 14 |
+
JobSubmitRequest,
|
| 15 |
+
JobSubmitResponse,
|
| 16 |
+
)
|
| 17 |
+
from ray.dashboard.modules.job.pydantic_models import JobDetails
|
| 18 |
+
from ray.dashboard.modules.job.utils import strip_keys_with_value_none
|
| 19 |
+
from ray.dashboard.utils import get_address_for_submission_client
|
| 20 |
+
from ray.runtime_env import RuntimeEnv
|
| 21 |
+
from ray.util.annotations import PublicAPI
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
import aiohttp
|
| 25 |
+
import requests
|
| 26 |
+
except ImportError:
|
| 27 |
+
aiohttp = None
|
| 28 |
+
requests = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
logger.setLevel(logging.INFO)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class JobSubmissionClient(SubmissionClient):
|
| 36 |
+
"""A local client for submitting and interacting with jobs on a remote cluster.
|
| 37 |
+
|
| 38 |
+
Submits requests over HTTP to the job server on the cluster using the REST API.
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
address: Either (1) the address of the Ray cluster, or (2) the HTTP address
|
| 43 |
+
of the dashboard server on the head node, e.g. "http://<head-node-ip>:8265".
|
| 44 |
+
In case (1) it must be specified as an address that can be passed to
|
| 45 |
+
ray.init(), e.g. a Ray Client address (ray://<head_node_host>:10001),
|
| 46 |
+
or "auto", or "localhost:<port>". If unspecified, will try to connect to
|
| 47 |
+
a running local Ray cluster. This argument is always overridden by the
|
| 48 |
+
RAY_ADDRESS environment variable.
|
| 49 |
+
create_cluster_if_needed: Indicates whether the cluster at the specified
|
| 50 |
+
address needs to already be running. Ray doesn't start a cluster
|
| 51 |
+
before interacting with jobs, but third-party job managers may do so.
|
| 52 |
+
cookies: Cookies to use when sending requests to the HTTP job server.
|
| 53 |
+
metadata: Arbitrary metadata to store along with all jobs. New metadata
|
| 54 |
+
specified per job will be merged with the global metadata provided here
|
| 55 |
+
via a simple dict update.
|
| 56 |
+
headers: Headers to use when sending requests to the HTTP job server, used
|
| 57 |
+
for cases like authentication to a remote cluster.
|
| 58 |
+
verify: Boolean indication to verify the server's TLS certificate or a path to
|
| 59 |
+
a file or directory of trusted certificates. Default: True.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
def __init__(
    self,
    address: Optional[str] = None,
    create_cluster_if_needed: bool = False,
    cookies: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    headers: Optional[Dict[str, Any]] = None,
    verify: Optional[Union[str, bool]] = True,
):
    """Initialize a JobSubmissionClient and check the connection to the cluster.

    Raises:
        RuntimeError: If the optional HTTP dependencies (``requests``) are not
            installed.
        TypeError: If any argument has an unexpected type. Note that
            ``verify`` must be a ``str`` or ``bool``; ``None`` is rejected.
    """
    # The docstring must be the first statement of the function; placed after
    # this assignment it would be a no-op string expression.
    self._client_ray_version = ray.__version__
    if requests is None:
        raise RuntimeError(
            "The Ray jobs CLI & SDK require the ray[default] "
            "installation: `pip install 'ray[default]'`"
        )
    # Check types of arguments
    if address is not None and not isinstance(address, str):
        raise TypeError(f"address must be a string, got {type(address)}")
    if not isinstance(create_cluster_if_needed, bool):
        raise TypeError(
            f"create_cluster_if_needed must be a bool, got"
            f" {type(create_cluster_if_needed)}"
        )
    if cookies is not None and not isinstance(cookies, dict):
        raise TypeError(f"cookies must be a dict, got {type(cookies)}")
    if metadata is not None and not isinstance(metadata, dict):
        raise TypeError(f"metadata must be a dict, got {type(metadata)}")
    if headers is not None and not isinstance(headers, dict):
        raise TypeError(f"headers must be a dict, got {type(headers)}")
    if not isinstance(verify, (str, bool)):
        raise TypeError(f"verify must be a str or bool, got {type(verify)}")

    api_server_url = get_address_for_submission_client(address)

    super().__init__(
        address=api_server_url,
        create_cluster_if_needed=create_cluster_if_needed,
        cookies=cookies,
        metadata=metadata,
        headers=headers,
        verify=verify,
    )
    self._check_connection_and_version(
        min_version="1.9",
        version_error_message="Jobs API is not supported on the Ray "
        "cluster. Please ensure the cluster is "
        "running Ray 1.9 or higher.",
    )

    # In ray>=2.0, the client sends the new kwarg `submission_id` to the server
    # upon every job submission, which causes servers with ray<2.0 to error.
    # NOTE(review): the comparison is strict, so a client whose version parses
    # equal to "2.0" skips this check -- confirm that is intended.
    if packaging.version.parse(self._client_ray_version) > packaging.version.parse(
        "2.0"
    ):
        self._check_connection_and_version(
            min_version="2.0",
            version_error_message=f"Client Ray version {self._client_ray_version} "
            "is not compatible with the Ray cluster. Please ensure the cluster is "
            "running Ray 2.0 or higher or downgrade the client Ray version.",
        )
|
| 123 |
+
|
| 124 |
+
@PublicAPI(stability="stable")
def submit_job(
    self,
    *,
    entrypoint: str,
    job_id: Optional[str] = None,
    runtime_env: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, str]] = None,
    submission_id: Optional[str] = None,
    entrypoint_num_cpus: Optional[Union[int, float]] = None,
    entrypoint_num_gpus: Optional[Union[int, float]] = None,
    entrypoint_memory: Optional[int] = None,
    entrypoint_resources: Optional[Dict[str, float]] = None,
) -> str:
    """Submit and execute a job asynchronously.

    When a job is submitted, it runs once to completion or failure. Retries or
    different runs with different parameters should be handled by the
    submitter. Jobs are bound to the lifetime of a Ray cluster, so if the
    cluster goes down, all running jobs on that cluster will be terminated.

    The ``runtime_env`` and ``metadata`` dicts passed by the caller are not
    modified; this method works on copies.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> client.submit_job( # doctest: +SKIP
        ...     entrypoint="python script.py",
        ...     runtime_env={
        ...         "working_dir": "./",
        ...         "pip": ["requests==2.26.0"]
        ...     }
        ... )  # doctest: +SKIP
        'raysubmit_4LamXRuQpYdSMg7J'

    Args:
        entrypoint: The shell command to run for this job.
        submission_id: A unique ID for this job.
        runtime_env: The runtime environment to install and run this job in.
        metadata: Arbitrary data to store along with this job.
        job_id: DEPRECATED. This has been renamed to submission_id
        entrypoint_num_cpus: The quantity of CPU cores to reserve for the execution
            of the entrypoint command, separately from any tasks or actors launched
            by it. Defaults to 0.
        entrypoint_num_gpus: The quantity of GPUs to reserve for the execution
            of the entrypoint command, separately from any tasks or actors launched
            by it. Defaults to 0.
        entrypoint_memory: The quantity of memory to reserve for the
            execution of the entrypoint command, separately from any tasks or
            actors launched by it. Defaults to 0.
        entrypoint_resources: The quantity of custom resources to reserve for the
            execution of the entrypoint command, separately from any tasks or
            actors launched by it.

    Returns:
        The submission ID of the submitted job.  If not specified,
        this is a randomly generated unique ID.

    Raises:
        RuntimeError: If the request to the job server fails, or if the specified
            submission_id has already been used by a job on this cluster.
        ValueError: If ``worker_process_setup_hook`` in ``runtime_env`` is set
            to something other than a string.
    """
    if job_id:
        logger.warning(
            "job_id kwarg is deprecated. Please use submission_id instead."
        )

    if entrypoint_num_cpus or entrypoint_num_gpus or entrypoint_resources:
        self._check_connection_and_version(
            min_version="2.2",
            version_error_message="`entrypoint_num_cpus`, `entrypoint_num_gpus`, "
            "and `entrypoint_resources` kwargs "
            "are not supported on the Ray cluster. Please ensure the cluster is "
            "running Ray 2.2 or higher.",
        )

    if entrypoint_memory:
        self._check_connection_and_version(
            min_version="2.8",
            version_error_message="`entrypoint_memory` kwarg "
            "is not supported on the Ray cluster. Please ensure the cluster is "
            "running Ray 2.8 or higher.",
        )

    # Work on shallow copies so the caller's dicts are never mutated: the
    # upload helpers below receive this dict, and the client-level default
    # metadata is merged into the per-job metadata.
    runtime_env = dict(runtime_env or {})
    metadata = dict(metadata or {})
    # Same precedence as before: client-level defaults win on key conflicts.
    metadata.update(self._default_metadata)

    self._upload_working_dir_if_needed(runtime_env)
    self._upload_py_modules_if_needed(runtime_env)

    # Verify worker_process_setup_hook type.
    setup_hook = runtime_env.get("worker_process_setup_hook")
    if setup_hook and not isinstance(setup_hook, str):
        raise ValueError(
            f"Invalid type {type(setup_hook)} for `worker_process_setup_hook`. "
            "When a job submission API is used, `worker_process_setup_hook` "
            "only allows a string type (module name). "
            "Specify `worker_process_setup_hook` via "
            "ray.init within a driver to use a `Callable` type. "
        )

    # Run the RuntimeEnv constructor to parse local pip/conda requirements files.
    runtime_env = RuntimeEnv(**runtime_env).to_dict()

    # The deprecated job_id is only used when submission_id is not given.
    submission_id = submission_id or job_id
    req = JobSubmitRequest(
        entrypoint=entrypoint,
        submission_id=submission_id,
        runtime_env=runtime_env,
        metadata=metadata,
        entrypoint_num_cpus=entrypoint_num_cpus,
        entrypoint_num_gpus=entrypoint_num_gpus,
        entrypoint_memory=entrypoint_memory,
        entrypoint_resources=entrypoint_resources,
    )

    # Remove keys with value None so that new clients with new optional fields
    # are still compatible with older servers.  This is also done on the server,
    # but we do it here as well to be extra defensive.
    json_data = strip_keys_with_value_none(dataclasses.asdict(req))

    logger.debug(f"Submitting job with submission_id={submission_id}.")
    r = self._do_request("POST", "/api/jobs/", json_data=json_data)

    if r.status_code == 200:
        return JobSubmitResponse(**r.json()).submission_id
    else:
        self._raise_error(r)
|
| 251 |
+
|
| 252 |
+
@PublicAPI(stability="stable")
def stop_job(
    self,
    job_id: str,
) -> bool:
    """Ask the job server to stop a job; the stop itself is asynchronous.

    The process is first asked to terminate and is killed if it has not
    exited after a timeout.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> sub_id = client.submit_job(entrypoint="sleep 10") # doctest: +SKIP
        >>> client.stop_job(sub_id) # doctest: +SKIP
        True

    Args:
        job_id: Job ID or submission ID of the job to stop.

    Returns:
        True if the job was running, otherwise False.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    logger.debug(f"Stopping job with job_id={job_id}.")
    response = self._do_request("POST", f"/api/jobs/{job_id}/stop")

    if response.status_code == 200:
        payload = response.json()
        return JobStopResponse(**payload).stopped

    # Any non-200 answer is handed to the shared error handler.
    self._raise_error(response)
|
| 285 |
+
|
| 286 |
+
@PublicAPI(stability="stable")
def delete_job(
    self,
    job_id: str,
) -> bool:
    """Delete a finished job together with all of its associated data.

    The job must already be in a terminal state; otherwise an error is
    raised. Job logs on disk are not removed. Reusing the submission ID of
    a previously deleted job is not supported and may lead to unexpected
    behavior.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient() # doctest: +SKIP
        >>> job_id = client.submit_job(entrypoint="echo hello") # doctest: +SKIP
        >>> client.delete_job(job_id) # doctest: +SKIP
        True

    Args:
        job_id: Submission ID of the job to delete.

    Returns:
        True if the job was deleted, otherwise False.

    Raises:
        RuntimeError: If the job does not exist, if the request to the
            job server fails, or if the job is not in a terminal state.
    """
    logger.debug(f"Deleting job with job_id={job_id}.")
    response = self._do_request("DELETE", f"/api/jobs/{job_id}")

    if response.status_code == 200:
        payload = response.json()
        return JobDeleteResponse(**payload).deleted

    # Any non-200 answer is handed to the shared error handler.
    self._raise_error(response)
|
| 322 |
+
|
| 323 |
+
@PublicAPI(stability="stable")
def get_job_info(
    self,
    job_id: str,
) -> JobDetails:
    """Fetch the latest status and other details recorded for a job.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> submission_id = client.submit_job(entrypoint="sleep 1") # doctest: +SKIP
        >>> client.get_job_info(submission_id) # doctest: +SKIP
        JobDetails(status='SUCCEEDED', ...)

    Args:
        job_id: Job ID or submission ID of the job whose information is
            being requested.

    Returns:
        The JobDetails built from the server's JSON response.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    response = self._do_request("GET", f"/api/jobs/{job_id}")

    if response.status_code == 200:
        payload = response.json()
        return JobDetails(**payload)

    # Any non-200 answer is handed to the shared error handler.
    self._raise_error(response)
|
| 356 |
+
|
| 357 |
+
@PublicAPI(stability="stable")
def list_jobs(self) -> List[JobDetails]:
    """List every job known to the cluster with its status and details.

    This covers all jobs that have ever run on the cluster, both those
    still running and those that have already finished.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
        >>> client.submit_job(entrypoint="sleep 2") # doctest: +SKIP
        >>> client.list_jobs() # doctest: +SKIP
        [JobDetails(status='SUCCEEDED',
            job_id='03000000', type='submission',
            submission_id='raysubmit_4LamXRuQpYdSMg7J',
            message='Job finished successfully.', error_type=None,
            start_time=1647388711, end_time=1647388712, metadata={}, runtime_env={}),
         JobDetails(status='RUNNING',
            job_id='04000000', type='submission',
            submission_id='raysubmit_1dxCeNvG1fCMVNHG',
            message='Job is currently running.', error_type=None,
            start_time=1647454832, end_time=None, metadata={}, runtime_env={})]

    Returns:
        A list with one JobDetails per entry in the server's response.

    Raises:
        RuntimeError: If the request to the job server fails.
    """
    response = self._do_request("GET", "/api/jobs/")

    if response.status_code == 200:
        return [JobDetails(**entry) for entry in response.json()]

    # Any non-200 answer is handed to the shared error handler.
    self._raise_error(response)
|
| 397 |
+
|
| 398 |
+
@PublicAPI(stability="stable")
def get_job_status(self, job_id: str) -> JobStatus:
    """Return the most recent status of a job.

    This is a thin wrapper that fetches the job's details through
    ``get_job_info`` and returns only their ``status`` field.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
        >>> client.get_job_status("raysubmit_4LamXRuQpYdSMg7J") # doctest: +SKIP
        'SUCCEEDED'

    Args:
        job_id: Job ID or submission ID of the job whose status is being
            requested.

    Returns:
        The JobStatus of the job.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    details = self.get_job_info(job_id)
    return details.status
|
| 421 |
+
|
| 422 |
+
@PublicAPI(stability="stable")
def get_job_logs(self, job_id: str) -> str:
    """Fetch everything a job has logged so far.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> sub_id = client.submit_job(entrypoint="echo hello") # doctest: +SKIP
        >>> client.get_job_logs(sub_id) # doctest: +SKIP
        'hello\\n'

    Args:
        job_id: Job ID or submission ID of the job whose logs are being
            requested.

    Returns:
        A string containing the full logs of the job.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    response = self._do_request("GET", f"/api/jobs/{job_id}/logs")

    if response.status_code == 200:
        payload = response.json()
        return JobLogsResponse(**payload).logs

    # Any non-200 answer is handed to the shared error handler.
    self._raise_error(response)
|
| 450 |
+
|
| 451 |
+
@PublicAPI(stability="stable")
async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
    """Follow a job's logs over a websocket, yielding text as it arrives.

    Example:
        >>> from ray.job_submission import JobSubmissionClient
        >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
        >>> submission_id = client.submit_job( # doctest: +SKIP
        ...       entrypoint="echo hi && sleep 5 && echo hi2")
        >>> async for lines in client.tail_job_logs( # doctest: +SKIP
        ...           'raysubmit_Xe7cvjyGJCyuCvm2'):
        ...     print(lines, end="") # doctest: +SKIP
        hi
        hi2

    Args:
        job_id: Job ID or submission ID of the job whose logs are being
            requested.

    Returns:
        An async iterator over the payload of each TEXT websocket message.

    Raises:
        RuntimeError: If the job does not exist or if the request to the
            job server fails.
    """
    async with aiohttp.ClientSession(
        cookies=self._cookies, headers=self._headers
    ) as session:
        tail_url = f"{self._address}/api/jobs/{job_id}/logs/tail"
        ws = await session.ws_connect(tail_url, ssl=self._ssl_context)

        while True:
            msg = await ws.receive()

            # A CLOSED frame ends the stream.
            if msg.type == aiohttp.WSMsgType.CLOSED:
                break
            if msg.type == aiohttp.WSMsgType.TEXT:
                yield msg.data
            # Every other frame type, including ERROR, is deliberately
            # ignored and the loop simply waits for the next message.
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/utils.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import dataclasses
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import traceback
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Any, AsyncIterator, Dict, List, Optional, Tuple, Union
|
| 9 |
+
|
| 10 |
+
from ray._private import ray_constants
|
| 11 |
+
from ray._private.gcs_utils import GcsAioClient
|
| 12 |
+
from ray.dashboard.modules.job.common import (
|
| 13 |
+
JOB_ID_METADATA_KEY,
|
| 14 |
+
JobInfoStorageClient,
|
| 15 |
+
JobStatus,
|
| 16 |
+
validate_request_type,
|
| 17 |
+
)
|
| 18 |
+
from ray.dashboard.modules.job.pydantic_models import DriverInfo, JobDetails, JobType
|
| 19 |
+
from ray.runtime_env import RuntimeEnv
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
# package `aiohttp` is not in ray's minimal dependencies
|
| 23 |
+
import aiohttp
|
| 24 |
+
from aiohttp.web import Request, Response
|
| 25 |
+
except Exception:
|
| 26 |
+
aiohttp = None
|
| 27 |
+
Request = None
|
| 28 |
+
Response = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
MAX_CHUNK_LINE_LENGTH = 10
|
| 34 |
+
MAX_CHUNK_CHAR_LENGTH = 20000
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
async def get_head_node_id(gcs_aio_client: GcsAioClient) -> Optional[str]:
    """Look up the head node id stored in the GCS internal KV store.

    The key ``ray_constants.KV_HEAD_NODE_ID_KEY`` is read from the
    ``ray_constants.KV_NAMESPACE_JOB`` namespace with a 30 second timeout.

    Returns:
        The decoded id, or None when the lookup returns None.
    """
    raw_id = await gcs_aio_client.internal_kv_get(
        ray_constants.KV_HEAD_NODE_ID_KEY,
        namespace=ray_constants.KV_NAMESPACE_JOB,
        timeout=30,
    )

    if raw_id is None:
        return None
    return raw_id.decode()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def strip_keys_with_value_none(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict holding the entries of ``d`` whose value is not None.

    Only the top level is filtered; nested containers are left untouched,
    and other falsy values (0, "", [], False) are kept.
    """
    kept = {}
    for key, value in d.items():
        if value is None:
            continue
        kept[key] = value
    return kept
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def redact_url_password(url: str) -> str:
    """Redact any passwords in a URL.

    A password is the part of an http(s) URL's authority that sits between
    the first ``:`` after the scheme and the last ``@`` before the path,
    query or fragment begins. Every such password found in ``url`` is
    replaced by ``<redacted>``; the user name, host, port and path are left
    untouched.

    Restricting the match to the authority fixes two problems of a greedy
    ``.*:(.*)@.*`` match: a password that itself contains ``:`` is no
    longer partially leaked, and a URL with a port and an ``@`` somewhere
    in its path (but no credentials) is no longer mangled.

    Args:
        url: Text containing zero or more http(s) URLs.

    Returns:
        ``url`` with each embedded password replaced by ``<redacted>``.
    """
    return re.sub(
        # group 1: scheme plus user name (everything up to the first ':')
        # then the password, which may contain ':' or '@' but never a
        # path/query/fragment delimiter or whitespace.
        r"(https?://[^/?#\s:]*):[^/?#\s]*@",
        r"\1:<redacted>@",
        url,
    )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
async def file_tail_iterator(path: str) -> AsyncIterator[Optional[List[str]]]:
    """Yield lines from a file as it's written.

    Lines are read one at a time with ``readline`` and yielded in batches.
    The current batch is flushed when any of the following holds:

    - it contains ``MAX_CHUNK_LINE_LENGTH`` lines,
    - its accumulated character count exceeds ``MAX_CHUNK_CHAR_LENGTH``,
    - the previous read hit EOF.

    New line characters are kept in the line string.

    Yields None on every iteration while the file does not exist yet, and
    whenever EOF is reached without any new line having been accumulated.

    Args:
        path: Path of the file to follow.

    Raises:
        TypeError: If ``path`` is not a string.
    """
    if not isinstance(path, str):
        raise TypeError(f"path must be a string, got {type(path)}.")

    while not os.path.exists(path):
        logger.debug(f"Path {path} doesn't exist yet.")
        yield None

    EOF = ""

    with open(path, "r") as f:
        lines = []

        chunk_char_count = 0
        curr_line = None

        while True:
            # Flush the current chunk in the following cases:
            #  - We accumulated MAX_CHUNK_LINE_LENGTH lines
            #  - We accumulated more than MAX_CHUNK_CHAR_LENGTH total chars
            #  - We reached EOF
            if (
                len(lines) >= MAX_CHUNK_LINE_LENGTH
                or chunk_char_count > MAX_CHUNK_CHAR_LENGTH
                or curr_line == EOF
            ):
                # An empty chunk (EOF with nothing new) is reported as None.
                yield lines or None

                lines = []
                chunk_char_count = 0

            # Read next line
            curr_line = f.readline()

            # `readline` will return
            #   - '' for EOF
            #   - '\n' for an empty line in the file
            if curr_line != EOF:
                # Add line to current chunk
                lines.append(curr_line)
                chunk_char_count += len(curr_line)
            else:
                # If EOF is reached sleep for 1s before continuing
                await asyncio.sleep(1)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
async def parse_and_validate_request(
    req: Request, request_type: dataclass
) -> Union[dataclass, Response]:
    """Decode a request's JSON body and cast it to ``request_type``.

    Keys whose value is None are dropped before validation so that newer
    clients sending new optional fields keep working with older servers.

    Only failures raised by ``validate_request_type`` are converted into a
    400 response; the JSON decoding itself happens outside the ``try``, so
    an error from ``req.json()`` propagates to the caller.

    Args:
        req: aiohttp request object.
        request_type: dataclass type to cast request to.

    Returns:
        The validated request object, or a Response with status 400 whose
        text is the formatted traceback of the validation failure.
    """
    import aiohttp

    payload = await req.json()
    cleaned = strip_keys_with_value_none(payload)

    try:
        parsed = validate_request_type(cleaned, request_type)
    except Exception as e:
        logger.info(f"Got invalid request type: {e}")
        return Response(
            text=traceback.format_exc(),
            status=aiohttp.web.HTTPBadRequest.status_code,
        )
    return parsed
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
async def get_driver_jobs(
    gcs_aio_client: GcsAioClient,
    job_or_submission_id: Optional[str] = None,
    timeout: Optional[int] = None,
) -> Tuple[Dict[str, JobDetails], Dict[str, DriverInfo]]:
    """Collect driver information from the GCS job table.

    Two dictionaries are returned:

    - all plain driver jobs (entries without a submission id in their
      metadata), keyed by job id;
    - the drivers that belong to submission jobs, keyed by submission id.
      Entries are processed in ascending job id order and later entries
      overwrite earlier ones, so only the last driver of each submission
      job is kept.

    Jobs whose namespace starts with the Ray-internal prefix are skipped.

    Args:
        gcs_aio_client: Client used to query the GCS job table.
        job_or_submission_id: Optional filter forwarded to the GCS query so
            that only jobs with this job id or submission id are returned.
        timeout: Optional timeout forwarded to the GCS query.

    Returns:
        The tuple ``(jobs, submission_job_drivers)`` described above.
    """
    job_infos = await gcs_aio_client.get_all_job_info(
        job_or_submission_id=job_or_submission_id,
        skip_submission_job_info_field=True,
        skip_is_running_tasks_field=True,
        timeout=timeout,
    )

    # Ascending job id order guarantees that, for a submission job with
    # several drivers, the last one wins in the dict below.
    ordered_entries = sorted(
        job_infos.values(), key=lambda entry: entry.job_id.hex()
    )

    jobs = {}
    submission_job_drivers = {}
    internal_prefix = ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX
    for entry in ordered_entries:
        # Skip jobs in any _ray_internal_ namespace
        if entry.config.ray_namespace.startswith(internal_prefix):
            continue

        job_id = entry.job_id.hex()
        metadata = dict(entry.config.metadata)
        job_submission_id = metadata.get(JOB_ID_METADATA_KEY)

        # Both kinds of job describe their driver the same way.
        driver = DriverInfo(
            id=job_id,
            node_ip_address=entry.driver_address.ip_address,
            pid=str(entry.driver_pid),
        )

        if job_submission_id:
            submission_job_drivers[job_submission_id] = driver
            continue

        if entry.is_dead:
            status = JobStatus.SUCCEEDED
        else:
            status = JobStatus.RUNNING
        runtime_env = RuntimeEnv.deserialize(
            entry.config.runtime_env_info.serialized_runtime_env
        ).to_dict()

        jobs[job_id] = JobDetails(
            job_id=job_id,
            type=JobType.DRIVER,
            status=status,
            entrypoint=entry.entrypoint,
            start_time=entry.start_time,
            end_time=entry.end_time,
            metadata=metadata,
            runtime_env=runtime_env,
            driver_info=driver,
        )

    return jobs, submission_job_drivers
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
async def find_job_by_ids(
    gcs_aio_client: GcsAioClient,
    job_info_client: JobInfoStorageClient,
    job_or_submission_id: str,
) -> Optional[JobDetails]:
    """Find the job that matches a job id or a submission id.

    The lookup proceeds in three steps:

    1. a plain driver job whose job id equals the given id;
    2. a submission job whose driver has the given id;
    3. a submission job whose submission id equals the given id.

    Returns:
        The matching JobDetails, or None when nothing is found.
    """
    driver_jobs, submission_job_drivers = await get_driver_jobs(
        gcs_aio_client, job_or_submission_id=job_or_submission_id
    )

    # Step 1: a plain driver job with this job id.
    job = driver_jobs.get(job_or_submission_id)
    if job:
        return job

    # Step 2: a submission job driven by a driver with this id.
    submission_id = None
    for candidate_id, driver in submission_job_drivers.items():
        if driver.id == job_or_submission_id:
            submission_id = candidate_id
            break

    # Step 3: otherwise treat the given id as a submission id.
    if not submission_id:
        submission_id = job_or_submission_id

    job_info = await job_info_client.get_info(submission_id)
    if not job_info:
        return None

    driver = submission_job_drivers.get(submission_id)
    return JobDetails(
        **dataclasses.asdict(job_info),
        submission_id=submission_id,
        job_id=driver.id if driver else None,
        driver_info=driver,
        type=JobType.SUBMISSION,
    )
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
async def find_jobs_by_job_ids(
    gcs_aio_client: GcsAioClient,
    job_info_client: JobInfoStorageClient,
    job_ids: List[str],
) -> Dict[str, JobDetails]:
    """Return the jobs with the given job ids, keyed by job id.

    Plain driver jobs are returned as reported by ``get_driver_jobs``. For
    submission jobs, the job info is fetched concurrently from
    ``job_info_client`` and combined with the driver information.

    A submission whose job info cannot be found is skipped instead of
    being passed to ``dataclasses.asdict`` (which would raise
    ``TypeError``), matching the guard used by ``find_job_by_ids``.

    This only accepts job ids and not submission ids.

    Args:
        gcs_aio_client: Client used to query the GCS job table.
        job_info_client: Client used to load submission job info.
        job_ids: Job ids to look up.

    Returns:
        A dictionary mapping each found job id to its JobDetails.
    """
    driver_jobs, submission_job_drivers = await get_driver_jobs(gcs_aio_client)

    # A set makes each membership test O(1) instead of a list scan.
    wanted_job_ids = set(job_ids)

    # Filter down to the requested job_ids
    jobs = {
        job_id: job for job_id, job in driver_jobs.items() if job_id in wanted_job_ids
    }
    wanted_drivers = {
        submission_id: driver
        for submission_id, driver in submission_job_drivers.items()
        if driver.id in wanted_job_ids
    }

    # Fetch job details for each submission job concurrently
    submission_ids = list(wanted_drivers)
    job_infos = await asyncio.gather(
        *[job_info_client.get_info(submission_id) for submission_id in submission_ids]
    )

    for submission_id, job_info in zip(submission_ids, job_infos):
        if not job_info:
            # No stored info for this submission; nothing to report for it.
            continue
        driver = wanted_drivers[submission_id]
        jobs[driver.id] = JobDetails(
            **dataclasses.asdict(job_info),
            submission_id=submission_id,
            job_id=driver.id,
            driver_info=driver,
            type=JobType.SUBMISSION,
        )

    return jobs
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_utils.cpython-311.pyc
ADDED
|
Binary file (775 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_agent.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import concurrent.futures
|
| 3 |
+
import io
|
| 4 |
+
import logging
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import grpc
|
| 10 |
+
|
| 11 |
+
import ray.dashboard.modules.log.log_consts as log_consts
|
| 12 |
+
import ray.dashboard.modules.log.log_utils as log_utils
|
| 13 |
+
import ray.dashboard.optional_utils as dashboard_optional_utils
|
| 14 |
+
import ray.dashboard.utils as dashboard_utils
|
| 15 |
+
from ray._private.ray_constants import env_integer
|
| 16 |
+
from ray.core.generated import reporter_pb2, reporter_pb2_grpc
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
routes = dashboard_optional_utils.DashboardAgentRouteTable
|
| 20 |
+
|
| 21 |
+
# 64 KB
|
| 22 |
+
BLOCK_SIZE = 1 << 16
|
| 23 |
+
|
| 24 |
+
# Keep-alive interval for reading the file
|
| 25 |
+
DEFAULT_KEEP_ALIVE_INTERVAL_SEC = 1
|
| 26 |
+
|
| 27 |
+
RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT = env_integer(
|
| 28 |
+
"RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT", default=2
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def find_offset_of_content_in_file(
    file: io.BufferedIOBase, content: bytes, start_offset: int = 0
) -> int:
    """Find the offset of the first occurrence of content in a file.

    The file is scanned in blocks of ``BLOCK_SIZE`` bytes. The last
    ``len(content) - 1`` bytes of each scanned window are carried over and
    prepended to the next block, so an occurrence that straddles a block
    boundary is found as well.

    Args:
        file: File object
        content: Content to find
        start_offset: Start offset to read from, inclusive.

    Returns:
        Absolute offset of the first occurrence of content at or after
        ``start_offset``, or -1 if the content is not found.
    """
    logger.debug(f"Finding offset of content {content} in file")
    file.seek(start_offset, io.SEEK_SET)  # move file pointer to start offset

    # Number of trailing bytes to keep between reads; a match cannot fit
    # entirely inside this many bytes, so no occurrence is reported twice.
    overlap = max(len(content) - 1, 0)
    carry = b""
    # Absolute file offset of the first byte of `carry + block_data`.
    window_offset = start_offset
    while True:
        # Read in block
        block_data = file.read(BLOCK_SIZE)
        if block_data == b"":
            # Stop reading
            return -1

        window = carry + block_data
        # Find the offset of the first occurrence of content in the window
        found = window.find(content)
        if found != -1:
            return window_offset + found

        # Continue reading, keeping the tail that could start a match
        keep = min(overlap, len(window))
        carry = window[len(window) - keep :]
        window_offset += len(window) - keep
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def find_end_offset_file(file: io.BufferedIOBase) -> int:
    """
    Report the end-of-file offset while leaving the file pointer in place.

    The current position is remembered, the pointer is moved to the end to
    read the offset there, and the remembered position is then restored.

    Args:
        file: File object

    Returns:
        Offset of the end of a file.
    """
    saved_position = file.tell()
    file.seek(0, io.SEEK_END)
    end_offset = file.tell()
    # Put the pointer back where the caller left it.
    file.seek(saved_position, io.SEEK_SET)
    return end_offset
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def find_end_offset_next_n_lines_from_offset(
    file: io.BufferedIOBase, start_offset: int, n: int
) -> int:
    """
    Locate where the next n lines after a start offset end.

    Up to n lines are read starting at ``start_offset``; reading stops
    early at end of file. If not a single line could be read, the file
    pointer is moved to the end of the file and that offset is returned.

    Args:
        file: File object
        start_offset: Start offset to read from, inclusive.
        n: Number of lines to find.

    Returns:
        Offset of the end of the next n line (exclusive)
    """
    file.seek(start_offset)

    last_line_end = None
    for _ in range(n):
        # readline consumes the trailing new line character as well
        if not file.readline():
            # end of file
            break
        last_line_end = file.tell()

    logger.debug(f"Found next {n} lines from {start_offset} offset")

    if last_line_end is None:
        # No line was read: fall back to the end of file offset.
        return file.seek(0, io.SEEK_END)
    return last_line_end
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def find_start_offset_last_n_lines_from_offset(
    file: io.BufferedIOBase, offset: int, n: int, block_size: int = BLOCK_SIZE
) -> int:
    """
    Find the offset at which the last n lines before an offset begin.

    The file is scanned backwards from ``offset`` in blocks of
    ``block_size`` bytes, counting new line characters, until enough lines
    have been seen or the start of the file is reached.

    Args:
        file: File object
        offset: Start offset from which to find last X lines, -1 means end of file.
            The offset is exclusive, i.e. data at the offset is not included
            in the result.
        n: Number of lines to find. If 0, ``offset`` itself is returned
            (resolved to the end of file offset when -1 was passed).
        block_size: Block size to read from file

    Returns:
        Offset of the beginning of the line of the last X lines from a start offset.
        If fewer than n lines precede ``offset``, the scan reaches the
        start of the file and 0 is returned.

    Raises:
        AssertionError: If the computed start offset is negative.
    """
    logger.debug(f"Finding last {n} lines from {offset} offset")
    if offset == -1:
        offset = file.seek(0, io.SEEK_END)  # move file pointer to end of file
    else:
        file.seek(offset, io.SEEK_SET)  # move file pointer to start offset

    if n == 0:
        return offset
    nbytes_from_end = (
        0  # Number of bytes that should be tailed from the end of the file
    )
    # Non new line terminating offset, adjust the line count and treat the non-newline
    # terminated line as the last line. e.g. line 1\nline 2
    # The byte just before `offset` is inspected to decide this.
    file.seek(max(0, offset - 1), os.SEEK_SET)
    if file.read(1) != b"\n":
        n -= 1

    # Remaining number of lines to tail
    lines_more = n
    # Start of the block currently being scanned, clamped to the file start
    read_offset = max(0, offset - block_size)
    # So that we know how much to read on the last block (the block 0)
    prev_offset = offset

    while lines_more >= 0 and read_offset >= 0:
        # Seek to the current block start
        file.seek(read_offset, 0)
        # Read the current block (or less than block) data
        block_data = file.read(min(block_size, prev_offset - read_offset))
        num_lines = block_data.count(b"\n")
        if num_lines > lines_more:
            # This is the last block to read.
            # Need to find the offset of exact number of lines to tail
            # in the block.
            # Use `split` here to split away the extra lines, i.e.
            # first `num_lines - lines_more` lines.
            lines = block_data.split(b"\n", num_lines - lines_more)
            # Add the length of the remainder after those splits, i.e. the
            # lines at the end of the block that belong to the tail.
            nbytes_from_end += len(lines[-1])
            break

        # Need to read more blocks.
        lines_more -= num_lines
        nbytes_from_end += len(block_data)

        if read_offset == 0:
            # We have read all blocks (since the start)
            break
        # Continuing with the previous block
        prev_offset = read_offset
        read_offset = max(0, read_offset - block_size)

    # Step back from `offset` by the number of bytes found to belong to the tail
    offset_read_start = offset - nbytes_from_end
    assert (
        offset_read_start >= 0
    ), f"Read start offset({offset_read_start}) should be non-negative"
    return offset_read_start
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
async def _stream_log_in_chunk(
    context: grpc.aio.ServicerContext,
    file: io.BufferedIOBase,
    start_offset: int,
    end_offset: int = -1,
    keep_alive_interval_sec: int = -1,
    block_size: int = BLOCK_SIZE,
):
    """Stream a binary file in chunks from a start offset to an end offset.

    Chunks of at most `block_size` bytes are read starting at `start_offset`
    and yielded as `StreamLogReply` messages until `end_offset` is reached,
    the file is exhausted (when not keeping alive), or the gRPC context is
    done.

    Args:
        context: gRPC server side context; streaming stops once it is done.
        file: Binary file to stream.
        start_offset: File offset where streaming starts.
        end_offset: Exclusive end offset. -1 means stream until EOF.
        keep_alive_interval_sec: Seconds to sleep before retrying a read that
            hit the end of the file. -1 means no retry. Must not be combined
            with an explicit `end_offset`.
        block_size: Number of bytes per chunk, exposed for testing.

    Return:
        Async generator of StreamLogReply.
    """
    assert "b" in file.mode, "Only binary file is supported."
    assert not (
        keep_alive_interval_sec >= 0 and end_offset != -1
    ), "Keep-alive is not allowed when specifying an end offset"

    file.seek(start_offset, 0)
    cur_offset = start_offset

    # Until gRPC is done
    while not context.done():
        if end_offset != -1:
            # Clamp at zero: if the start is already past the end, the
            # remaining size is negative, and a negative size is not a bounded
            # read (it fails or reads to EOF). Reading 0 bytes yields b"" and
            # ends the stream below without sending anything.
            to_read = max(0, min(end_offset - cur_offset, block_size))
        else:
            to_read = block_size

        chunk = file.read(to_read)

        if chunk == b"":
            if keep_alive_interval_sec >= 0:
                # Following the file: wait, then try reading again.
                await asyncio.sleep(keep_alive_interval_sec)
                continue

            # Have read the entire file (or the requested range is empty), done
            break

        logger.debug(f"Sending {len(chunk)} bytes at {cur_offset}")
        yield reporter_pb2.StreamLogReply(data=chunk)

        # Have read the requested section [start_offset, end_offset), done
        cur_offset += len(chunk)
        if end_offset != -1 and cur_offset >= end_offset:
            break
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
class LogAgent(dashboard_utils.DashboardAgentModule):
    """Agent module that serves the agent's log directory as static files."""

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        # Register the log-file MIME types before exposing the directory.
        log_utils.register_mimetypes()
        log_dir = self._dashboard_agent.log_dir
        routes.static("/logs", log_dir, show_index=True)

    async def run(self, server):
        # Nothing to start: the static route is registered in `__init__`.
        return None

    @staticmethod
    def is_minimal_module():
        return False
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# Module-level thread pool whose size is capped by
# RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT.
# NOTE(review): presumably used for task-log searches, going by the name; no
# use is visible in this part of the file.
_task_log_search_worker_pool = concurrent.futures.ThreadPoolExecutor(
    max_workers=RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT,
)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class LogAgentV1Grpc(dashboard_utils.DashboardAgentModule):
    """Agent module implementing the `LogService` gRPC handlers.

    NOTE: These RPCs are used by state_head.py, not log_head.py
    """

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)

    async def run(self, server):
        # Only register the servicer when a gRPC server was provided.
        if server:
            reporter_pb2_grpc.add_LogServiceServicer_to_server(self, server)

    @property
    def node_id(self) -> Optional[str]:
        return self._dashboard_agent.get_node_id()

    @staticmethod
    def is_minimal_module():
        # Dashboard is only available with non-minimal install now.
        return False

    async def ListLogs(self, request, context):
        """
        Lists all files in the active Ray logs directory.

        Entries are returned relative to the log directory and filtered by
        `request.glob_filter`; directories get a trailing "/".

        Part of `LogService` gRPC.

        Raises:
            FileNotFoundError: If the log directory does not exist.
        """
        path = Path(self._dashboard_agent.log_dir)
        if not path.exists():
            raise FileNotFoundError(
                f"Could not find log dir at path: {self._dashboard_agent.log_dir}. "
                "It is unexpected. Please report an issue to Ray Github."
            )
        log_files = [
            str(p.relative_to(path)) + ("/" if p.is_dir() else "")
            for p in path.glob(request.glob_filter)
        ]
        return reporter_pb2.ListLogsReply(log_files=log_files)

    @classmethod
    def _resolve_filename(cls, root_log_dir: Path, filename: str) -> Path:
        """
        Resolves the file path relative to the root log directory.

        Args:
            root_log_dir: Root log directory.
            filename: File path relative to the root log directory, or an
                absolute path.

        Raises:
            FileNotFoundError: If the path is not a file, or if its normalized
                form is not located under `root_log_dir`.

        Returns:
            The absolute file path resolved from the root log directory.
        """
        if not Path(filename).is_absolute():
            filepath = root_log_dir / filename
        else:
            filepath = Path(filename)

        # We want to allow relative paths that include symlinks pointing outside of the
        # `root_log_dir`, so use `os.path.abspath` instead of `Path.resolve()` because
        # `os.path.abspath` does not resolve symlinks.
        filepath = Path(os.path.abspath(filepath))

        if not filepath.is_file():
            raise FileNotFoundError(f"A file is not found at: {filepath}")

        try:
            filepath.relative_to(root_log_dir)
        except ValueError as e:
            # Keep the original error as the cause for easier debugging.
            raise FileNotFoundError(f"{filepath} not in {root_log_dir}: {e}") from e

        # Fully resolve the path before returning (including following symlinks).
        return filepath.resolve()

    async def StreamLog(self, request, context):
        """
        Streams the log in real time starting from `request.lines` number of lines from
        the end of the file if `request.keep_alive == True`. Else, it terminates the
        stream once there are no more bytes to read from the log file.

        If the file cannot be resolved, the error text is sent as initial
        metadata under `log_consts.LOG_GRPC_ERROR` and nothing is streamed.

        Part of `LogService` gRPC.
        """
        # NOTE: If the client side connection is closed, this handler will
        # be automatically terminated.
        lines = request.lines if request.lines else 1000

        try:
            filepath = self._resolve_filename(
                Path(self._dashboard_agent.log_dir), request.log_file_name
            )
        except FileNotFoundError as e:
            await context.send_initial_metadata([[log_consts.LOG_GRPC_ERROR, str(e)]])
            return

        with open(filepath, "rb") as f:
            await context.send_initial_metadata([])

            # Default stream entire file
            start_offset = (
                request.start_offset if request.HasField("start_offset") else 0
            )
            end_offset = (
                request.end_offset
                if request.HasField("end_offset")
                else find_end_offset_file(f)
            )

            if lines != -1:
                # If specified tail line number, cap the start offset
                # with lines from the current end offset
                start_offset = max(
                    find_start_offset_last_n_lines_from_offset(
                        f, offset=end_offset, n=lines
                    ),
                    start_offset,
                )

            # If keep alive: following the log every 'interval'
            keep_alive_interval_sec = -1
            if request.keep_alive:
                keep_alive_interval_sec = (
                    request.interval
                    if request.interval
                    else DEFAULT_KEEP_ALIVE_INTERVAL_SEC
                )

                # When following (keep_alive), it will read beyond the end
                end_offset = -1

            logger.info(
                f"Tailing logs from {start_offset} to {end_offset} for "
                f"lines={lines}, with keep_alive={keep_alive_interval_sec}"
            )

            # Read and send the file data in chunk
            async for chunk_res in _stream_log_in_chunk(
                context=context,
                file=f,
                start_offset=start_offset,
                end_offset=end_offset,
                keep_alive_interval_sec=keep_alive_interval_sec,
            ):
                yield chunk_res
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_consts.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MIME type -> file extensions that should be registered with that type.
MIME_TYPES = {"text/plain": [".err", ".out", ".log"]}

# Name of the log gRPC error/status constant.
LOG_GRPC_ERROR = "log_grpc_status"

# gRPC timeout, in seconds.
GRPC_TIMEOUT = 10
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_manager.py
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
from typing import AsyncIterable, Awaitable, Callable, Dict, List, Optional, Tuple
|
| 5 |
+
|
| 6 |
+
from ray import ActorID, NodeID, WorkerID
|
| 7 |
+
from ray._private.pydantic_compat import BaseModel
|
| 8 |
+
from ray.core.generated.gcs_pb2 import ActorTableData
|
| 9 |
+
from ray.dashboard.modules.job.common import JOB_LOGS_PATH_TEMPLATE
|
| 10 |
+
from ray.util.state.common import (
|
| 11 |
+
DEFAULT_RPC_TIMEOUT,
|
| 12 |
+
GetLogOptions,
|
| 13 |
+
protobuf_to_task_state_dict,
|
| 14 |
+
)
|
| 15 |
+
from ray.util.state.exception import DataSourceUnavailable
|
| 16 |
+
from ray.util.state.state_manager import StateDataSourceClient
|
| 17 |
+
|
| 18 |
+
# `BaseModel` comes from ray's pydantic compat module; a `None` value is
# treated here as pydantic being unavailable, which this module cannot work
# without.
if BaseModel is None:
    raise ModuleNotFoundError("Please install pydantic via `pip install pydantic`.")


# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Worker logs look like worker-[worker_id]-[job_id]-[pid].out (or .err).
# Groups: 1 = worker id, 2 = job id, 3 = pid, 4 = extension.
# Raw string so `\d` is a regex class rather than an invalid string escape, and
# the dot before the extension is escaped so it only matches a literal ".".
WORKER_LOG_PATTERN = re.compile(r".*worker-([0-9a-f]+)-([0-9a-f]+)-(\d+)\.(out|err)")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ResolvedStreamFileInfo(BaseModel):
    """Location of a log file to stream, plus an optional byte range."""

    # Id of the node on which the log file lives.
    node_id: str

    # Path of the log file: either relative to ray's logging folder or an
    # absolute path.
    filename: str

    # Offset in the file at which streaming starts. None means the beginning
    # of the file, or a start determined by the last tail lines.
    start_offset: Optional[int]

    # Offset in the file at which streaming stops. None means the end of the
    # file.
    end_offset: Optional[int]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class LogsManager:
|
| 44 |
+
def __init__(self, data_source_client: StateDataSourceClient):
|
| 45 |
+
self.client = data_source_client
|
| 46 |
+
|
| 47 |
+
@property
|
| 48 |
+
def data_source_client(self) -> StateDataSourceClient:
|
| 49 |
+
return self.client
|
| 50 |
+
|
| 51 |
+
def ip_to_node_id(self, node_ip: Optional[str]):
|
| 52 |
+
"""Resolve the node id from a given node ip.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
node_ip: The node ip.
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
node_id if there's a node id that matches the given node ip and is alive.
|
| 59 |
+
None otherwise.
|
| 60 |
+
"""
|
| 61 |
+
return self.client.ip_to_node_id(node_ip)
|
| 62 |
+
|
| 63 |
+
async def list_logs(
|
| 64 |
+
self, node_id: str, timeout: int, glob_filter: str = "*"
|
| 65 |
+
) -> Dict[str, List[str]]:
|
| 66 |
+
"""Return a list of log files on a given node id filtered by the glob.
|
| 67 |
+
|
| 68 |
+
Args:
|
| 69 |
+
node_id: The node id where log files present.
|
| 70 |
+
timeout: The timeout of the API.
|
| 71 |
+
glob_filter: The glob filter to filter out log files.
|
| 72 |
+
|
| 73 |
+
Returns:
|
| 74 |
+
Dictionary of {component_name -> list of log files}
|
| 75 |
+
|
| 76 |
+
Raises:
|
| 77 |
+
DataSourceUnavailable: If a source is unresponsive.
|
| 78 |
+
"""
|
| 79 |
+
self._verify_node_registered(node_id)
|
| 80 |
+
reply = await self.client.list_logs(node_id, glob_filter, timeout=timeout)
|
| 81 |
+
return self._categorize_log_files(reply.log_files)
|
| 82 |
+
|
| 83 |
+
async def stream_logs(
|
| 84 |
+
self,
|
| 85 |
+
options: GetLogOptions,
|
| 86 |
+
get_actor_fn: Callable[[ActorID], Awaitable[Optional[ActorTableData]]],
|
| 87 |
+
) -> AsyncIterable[bytes]:
|
| 88 |
+
"""Generate a stream of logs in bytes.
|
| 89 |
+
|
| 90 |
+
Args:
|
| 91 |
+
options: The option for streaming logs.
|
| 92 |
+
|
| 93 |
+
Return:
|
| 94 |
+
Async generator of streamed logs in bytes.
|
| 95 |
+
"""
|
| 96 |
+
node_id = options.node_id or self.ip_to_node_id(options.node_ip)
|
| 97 |
+
|
| 98 |
+
res = await self.resolve_filename(
|
| 99 |
+
node_id=node_id,
|
| 100 |
+
log_filename=options.filename,
|
| 101 |
+
actor_id=options.actor_id,
|
| 102 |
+
task_id=options.task_id,
|
| 103 |
+
attempt_number=options.attempt_number,
|
| 104 |
+
pid=options.pid,
|
| 105 |
+
get_actor_fn=get_actor_fn,
|
| 106 |
+
timeout=options.timeout,
|
| 107 |
+
suffix=options.suffix,
|
| 108 |
+
submission_id=options.submission_id,
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
keep_alive = options.media_type == "stream"
|
| 112 |
+
stream = await self.client.stream_log(
|
| 113 |
+
node_id=res.node_id,
|
| 114 |
+
log_file_name=res.filename,
|
| 115 |
+
keep_alive=keep_alive,
|
| 116 |
+
lines=options.lines,
|
| 117 |
+
interval=options.interval,
|
| 118 |
+
# If we keepalive logs connection, we shouldn't have timeout
|
| 119 |
+
# otherwise the stream will be terminated forcefully
|
| 120 |
+
# after the deadline is expired.
|
| 121 |
+
timeout=options.timeout if not keep_alive else None,
|
| 122 |
+
start_offset=res.start_offset,
|
| 123 |
+
end_offset=res.end_offset,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
async for streamed_log in stream:
|
| 127 |
+
yield streamed_log.data
|
| 128 |
+
|
| 129 |
+
def _verify_node_registered(self, node_id: str):
|
| 130 |
+
if node_id not in self.client.get_all_registered_log_agent_ids():
|
| 131 |
+
raise DataSourceUnavailable(
|
| 132 |
+
f"Given node id {node_id} is not available. "
|
| 133 |
+
"It's either the node is dead, or it is not registered. "
|
| 134 |
+
"Use `ray list nodes` "
|
| 135 |
+
"to see the node status. If the node is registered, "
|
| 136 |
+
"it is highly likely "
|
| 137 |
+
"a transient issue. Try again."
|
| 138 |
+
)
|
| 139 |
+
assert node_id is not None
|
| 140 |
+
|
| 141 |
+
async def _resolve_job_filename(self, sub_job_id: str) -> Tuple[str, str]:
|
| 142 |
+
"""Return the log file name and node id for a given job submission id.
|
| 143 |
+
|
| 144 |
+
Args:
|
| 145 |
+
sub_job_id: The job submission id.
|
| 146 |
+
|
| 147 |
+
Returns:
|
| 148 |
+
The log file name and node id.
|
| 149 |
+
"""
|
| 150 |
+
job_infos = await self.client.get_job_info(timeout=DEFAULT_RPC_TIMEOUT)
|
| 151 |
+
target_job = None
|
| 152 |
+
for job_info in job_infos:
|
| 153 |
+
if job_info.submission_id == sub_job_id:
|
| 154 |
+
target_job = job_info
|
| 155 |
+
break
|
| 156 |
+
if target_job is None:
|
| 157 |
+
logger.info(f"Submission job ID {sub_job_id} not found.")
|
| 158 |
+
return None, None
|
| 159 |
+
|
| 160 |
+
node_id = job_info.driver_node_id
|
| 161 |
+
if node_id is None:
|
| 162 |
+
raise ValueError(
|
| 163 |
+
f"Job {sub_job_id} has no driver node id info. "
|
| 164 |
+
"This is likely a bug. Please file an issue."
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
log_filename = JOB_LOGS_PATH_TEMPLATE.format(submission_id=sub_job_id)
|
| 168 |
+
return node_id, log_filename
|
| 169 |
+
|
| 170 |
+
async def _resolve_worker_file(
|
| 171 |
+
self,
|
| 172 |
+
node_id_hex: str,
|
| 173 |
+
worker_id_hex: Optional[str],
|
| 174 |
+
pid: Optional[int],
|
| 175 |
+
suffix: str,
|
| 176 |
+
timeout: int,
|
| 177 |
+
) -> Optional[str]:
|
| 178 |
+
"""Resolve worker log file."""
|
| 179 |
+
if worker_id_hex is not None and pid is not None:
|
| 180 |
+
raise ValueError(
|
| 181 |
+
f"Only one of worker id({worker_id_hex}) or pid({pid}) should be"
|
| 182 |
+
"provided."
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
if worker_id_hex is not None:
|
| 186 |
+
log_files = await self.list_logs(
|
| 187 |
+
node_id_hex, timeout, glob_filter=f"*{worker_id_hex}*{suffix}"
|
| 188 |
+
)
|
| 189 |
+
else:
|
| 190 |
+
log_files = await self.list_logs(
|
| 191 |
+
node_id_hex, timeout, glob_filter=f"*{pid}*{suffix}"
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
# Find matching worker logs.
|
| 195 |
+
for filename in [*log_files["worker_out"], *log_files["worker_err"]]:
|
| 196 |
+
# Worker logs look like worker-[worker_id]-[job_id]-[pid].out
|
| 197 |
+
if worker_id_hex is not None:
|
| 198 |
+
worker_id_from_filename = WORKER_LOG_PATTERN.match(filename).group(1)
|
| 199 |
+
if worker_id_from_filename == worker_id_hex:
|
| 200 |
+
return filename
|
| 201 |
+
else:
|
| 202 |
+
worker_pid_from_filename = int(
|
| 203 |
+
WORKER_LOG_PATTERN.match(filename).group(3)
|
| 204 |
+
)
|
| 205 |
+
if worker_pid_from_filename == pid:
|
| 206 |
+
return filename
|
| 207 |
+
return None
|
| 208 |
+
|
| 209 |
+
async def _resolve_actor_filename(
|
| 210 |
+
self,
|
| 211 |
+
actor_id: ActorID,
|
| 212 |
+
get_actor_fn: Callable[[ActorID], Awaitable[Optional[ActorTableData]]],
|
| 213 |
+
suffix: str,
|
| 214 |
+
timeout: int,
|
| 215 |
+
):
|
| 216 |
+
"""
|
| 217 |
+
Resolve actor log file
|
| 218 |
+
Args:
|
| 219 |
+
actor_id: The actor id.
|
| 220 |
+
get_actor_fn: The function to get actor information.
|
| 221 |
+
suffix: The suffix of the log file.
|
| 222 |
+
timeout: Timeout in seconds.
|
| 223 |
+
Returns:
|
| 224 |
+
The log file name and node id.
|
| 225 |
+
|
| 226 |
+
Raises:
|
| 227 |
+
ValueError if actor data is not found or get_actor_fn is not provided.
|
| 228 |
+
"""
|
| 229 |
+
if get_actor_fn is None:
|
| 230 |
+
raise ValueError("get_actor_fn needs to be specified for actor_id")
|
| 231 |
+
|
| 232 |
+
actor_data = await get_actor_fn(actor_id)
|
| 233 |
+
if actor_data is None:
|
| 234 |
+
raise ValueError(f"Actor ID {actor_id} not found.")
|
| 235 |
+
# TODO(sang): Only the latest worker id can be obtained from
|
| 236 |
+
# actor information now. That means, if actors are restarted,
|
| 237 |
+
# there's no way for us to get the past worker ids.
|
| 238 |
+
worker_id_binary = actor_data.address.worker_id
|
| 239 |
+
if not worker_id_binary:
|
| 240 |
+
raise ValueError(
|
| 241 |
+
f"Worker ID for Actor ID {actor_id} not found. "
|
| 242 |
+
"Actor is not scheduled yet."
|
| 243 |
+
)
|
| 244 |
+
worker_id = WorkerID(worker_id_binary)
|
| 245 |
+
node_id_binary = actor_data.address.raylet_id
|
| 246 |
+
if not node_id_binary:
|
| 247 |
+
raise ValueError(
|
| 248 |
+
f"Node ID for Actor ID {actor_id} not found. "
|
| 249 |
+
"Actor is not scheduled yet."
|
| 250 |
+
)
|
| 251 |
+
node_id = NodeID(node_id_binary)
|
| 252 |
+
self._verify_node_registered(node_id.hex())
|
| 253 |
+
log_filename = await self._resolve_worker_file(
|
| 254 |
+
node_id_hex=node_id.hex(),
|
| 255 |
+
worker_id_hex=worker_id.hex(),
|
| 256 |
+
pid=None,
|
| 257 |
+
suffix=suffix,
|
| 258 |
+
timeout=timeout,
|
| 259 |
+
)
|
| 260 |
+
return node_id.hex(), log_filename
|
| 261 |
+
|
| 262 |
+
async def _resolve_task_filename(
|
| 263 |
+
self, task_id: str, attempt_number: int, suffix: str, timeout: int
|
| 264 |
+
):
|
| 265 |
+
"""
|
| 266 |
+
Resolve log file for a task.
|
| 267 |
+
|
| 268 |
+
Args:
|
| 269 |
+
task_id: The task id.
|
| 270 |
+
attempt_number: The attempt number.
|
| 271 |
+
suffix: The suffix of the log file, e.g. out or err
|
| 272 |
+
timeout: Timeout in seconds.
|
| 273 |
+
|
| 274 |
+
Returns:
|
| 275 |
+
The log file name, node id, the start and end offsets of the
|
| 276 |
+
corresponding task log in the file.
|
| 277 |
+
|
| 278 |
+
Raises:
|
| 279 |
+
FileNotFoundError if the log file is not found.
|
| 280 |
+
ValueError if the suffix is not out or err.
|
| 281 |
+
|
| 282 |
+
"""
|
| 283 |
+
log_filename = None
|
| 284 |
+
node_id = None
|
| 285 |
+
start_offset = None
|
| 286 |
+
end_offset = None
|
| 287 |
+
|
| 288 |
+
if suffix not in ["out", "err"]:
|
| 289 |
+
raise ValueError(f"Suffix {suffix} is not supported.")
|
| 290 |
+
|
| 291 |
+
reply = await self.client.get_all_task_info(
|
| 292 |
+
filters=[("task_id", "=", task_id)], timeout=timeout
|
| 293 |
+
)
|
| 294 |
+
# Check if the task is found.
|
| 295 |
+
if len(reply.events_by_task) == 0:
|
| 296 |
+
raise FileNotFoundError(
|
| 297 |
+
f"Could not find log file for task: {task_id}"
|
| 298 |
+
f" (attempt {attempt_number}) with suffix: {suffix}"
|
| 299 |
+
)
|
| 300 |
+
task_event = None
|
| 301 |
+
for t in reply.events_by_task:
|
| 302 |
+
if t.attempt_number == attempt_number:
|
| 303 |
+
task_event = t
|
| 304 |
+
break
|
| 305 |
+
|
| 306 |
+
if task_event is None:
|
| 307 |
+
raise FileNotFoundError(
|
| 308 |
+
"Could not find log file for task attempt:"
|
| 309 |
+
f"{task_id}({attempt_number})"
|
| 310 |
+
)
|
| 311 |
+
# Get the worker id and node id.
|
| 312 |
+
task = protobuf_to_task_state_dict(task_event)
|
| 313 |
+
|
| 314 |
+
worker_id = task.get("worker_id", None)
|
| 315 |
+
node_id = task.get("node_id", None)
|
| 316 |
+
log_info = task.get("task_log_info", None)
|
| 317 |
+
actor_id = task.get("actor_id", None)
|
| 318 |
+
|
| 319 |
+
if node_id is None:
|
| 320 |
+
raise FileNotFoundError(
|
| 321 |
+
"Could not find log file for task attempt."
|
| 322 |
+
f"{task_id}({attempt_number}) due to missing node info."
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
if log_info is None and actor_id is not None:
|
| 326 |
+
# This is a concurrent actor task. The logs will be interleaved.
|
| 327 |
+
# So we return the log file of the actor instead.
|
| 328 |
+
raise FileNotFoundError(
|
| 329 |
+
f"For actor task, please query actor log for "
|
| 330 |
+
f"actor({actor_id}): e.g. ray logs actor --id {actor_id} . Or "
|
| 331 |
+
"set RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 in actor's runtime env "
|
| 332 |
+
"or when starting the cluster. Recording actor task's log could be "
|
| 333 |
+
"expensive, so Ray turns it off by default."
|
| 334 |
+
)
|
| 335 |
+
elif log_info is None:
|
| 336 |
+
raise FileNotFoundError(
|
| 337 |
+
"Could not find log file for task attempt:"
|
| 338 |
+
f"{task_id}({attempt_number})."
|
| 339 |
+
f"Worker id = {worker_id}, node id = {node_id},"
|
| 340 |
+
f"log_info = {log_info}"
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
filename_key = "stdout_file" if suffix == "out" else "stderr_file"
|
| 344 |
+
log_filename = log_info.get(filename_key, None)
|
| 345 |
+
if log_filename is None:
|
| 346 |
+
raise FileNotFoundError(
|
| 347 |
+
f"Missing log filename info in {log_info} for task {task_id},"
|
| 348 |
+
f"attempt {attempt_number}"
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
start_offset = log_info.get(f"std{suffix}_start", None)
|
| 352 |
+
end_offset = log_info.get(f"std{suffix}_end", None)
|
| 353 |
+
|
| 354 |
+
return node_id, log_filename, start_offset, end_offset
|
| 355 |
+
|
| 356 |
+
async def resolve_filename(
|
| 357 |
+
self,
|
| 358 |
+
*,
|
| 359 |
+
node_id: Optional[str] = None,
|
| 360 |
+
log_filename: Optional[str] = None,
|
| 361 |
+
actor_id: Optional[str] = None,
|
| 362 |
+
task_id: Optional[str] = None,
|
| 363 |
+
attempt_number: Optional[int] = None,
|
| 364 |
+
pid: Optional[str] = None,
|
| 365 |
+
get_actor_fn: Optional[
|
| 366 |
+
Callable[[ActorID], Awaitable[Optional[ActorTableData]]]
|
| 367 |
+
] = None,
|
| 368 |
+
timeout: int = DEFAULT_RPC_TIMEOUT,
|
| 369 |
+
suffix: str = "out",
|
| 370 |
+
submission_id: Optional[str] = None,
|
| 371 |
+
) -> ResolvedStreamFileInfo:
|
| 372 |
+
"""Return the file name given all options.
|
| 373 |
+
|
| 374 |
+
Args:
|
| 375 |
+
node_id: The node's id from which logs are resolved.
|
| 376 |
+
log_filename: Filename of the log file.
|
| 377 |
+
actor_id: Id of the actor that generates the log file.
|
| 378 |
+
task_id: Id of the task that generates the log file.
|
| 379 |
+
pid: Id of the worker process that generates the log file.
|
| 380 |
+
get_actor_fn: Callback to get the actor's data by id.
|
| 381 |
+
timeout: Timeout for the gRPC to listing logs on the node
|
| 382 |
+
specified by `node_id`.
|
| 383 |
+
suffix: Log suffix if no `log_filename` is provided, when
|
| 384 |
+
resolving by other ids'. Default to "out".
|
| 385 |
+
submission_id: The submission id for a submission job.
|
| 386 |
+
"""
|
| 387 |
+
start_offset = None
|
| 388 |
+
end_offset = None
|
| 389 |
+
if suffix not in ["out", "err"]:
|
| 390 |
+
raise ValueError(f"Suffix {suffix} is not supported. ")
|
| 391 |
+
|
| 392 |
+
# TODO(rickyx): We should make sure we do some sort of checking on the log
|
| 393 |
+
# filename
|
| 394 |
+
if actor_id:
|
| 395 |
+
node_id, log_filename = await self._resolve_actor_filename(
|
| 396 |
+
ActorID.from_hex(actor_id), get_actor_fn, suffix, timeout
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
elif task_id:
|
| 400 |
+
(
|
| 401 |
+
node_id,
|
| 402 |
+
log_filename,
|
| 403 |
+
start_offset,
|
| 404 |
+
end_offset,
|
| 405 |
+
) = await self._resolve_task_filename(
|
| 406 |
+
task_id, attempt_number, suffix, timeout
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
elif submission_id:
|
| 410 |
+
node_id, log_filename = await self._resolve_job_filename(submission_id)
|
| 411 |
+
|
| 412 |
+
elif pid:
|
| 413 |
+
if node_id is None:
|
| 414 |
+
raise ValueError(
|
| 415 |
+
"Node id needs to be specified for resolving"
|
| 416 |
+
f" filenames of pid {pid}"
|
| 417 |
+
)
|
| 418 |
+
self._verify_node_registered(node_id)
|
| 419 |
+
log_filename = await self._resolve_worker_file(
|
| 420 |
+
node_id_hex=node_id,
|
| 421 |
+
worker_id_hex=None,
|
| 422 |
+
pid=pid,
|
| 423 |
+
suffix=suffix,
|
| 424 |
+
timeout=timeout,
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
if log_filename is None:
|
| 428 |
+
raise FileNotFoundError(
|
| 429 |
+
"Could not find a log file. Please make sure the given "
|
| 430 |
+
"option exists in the cluster.\n"
|
| 431 |
+
f"\tnode_id: {node_id}\n"
|
| 432 |
+
f"\tfilename: {log_filename}\n"
|
| 433 |
+
f"\tactor_id: {actor_id}\n"
|
| 434 |
+
f"\ttask_id: {task_id}\n"
|
| 435 |
+
f"\tpid: {pid}\n"
|
| 436 |
+
f"\tsuffix: {suffix}\n"
|
| 437 |
+
f"\tsubmission_id: {submission_id}\n"
|
| 438 |
+
f"\tattempt_number: {attempt_number}\n"
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
res = ResolvedStreamFileInfo(
|
| 442 |
+
node_id=node_id,
|
| 443 |
+
filename=log_filename,
|
| 444 |
+
start_offset=start_offset,
|
| 445 |
+
end_offset=end_offset,
|
| 446 |
+
)
|
| 447 |
+
logger.info(f"Resolved log file: {res}")
|
| 448 |
+
return res
|
| 449 |
+
|
| 450 |
+
def _categorize_log_files(self, log_files: List[str]) -> Dict[str, List[str]]:
|
| 451 |
+
"""Categorize the given log files after filterieng them out using a given glob.
|
| 452 |
+
|
| 453 |
+
Returns:
|
| 454 |
+
Dictionary of {component_name -> list of log files}
|
| 455 |
+
"""
|
| 456 |
+
result = defaultdict(list)
|
| 457 |
+
for log_file in log_files:
|
| 458 |
+
if "worker" in log_file and (log_file.endswith(".out")):
|
| 459 |
+
result["worker_out"].append(log_file)
|
| 460 |
+
elif "worker" in log_file and (log_file.endswith(".err")):
|
| 461 |
+
result["worker_err"].append(log_file)
|
| 462 |
+
elif "core-worker" in log_file and log_file.endswith(".log"):
|
| 463 |
+
result["core_worker"].append(log_file)
|
| 464 |
+
elif "core-driver" in log_file and log_file.endswith(".log"):
|
| 465 |
+
result["driver"].append(log_file)
|
| 466 |
+
elif "raylet." in log_file:
|
| 467 |
+
result["raylet"].append(log_file)
|
| 468 |
+
elif "gcs_server." in log_file:
|
| 469 |
+
result["gcs_server"].append(log_file)
|
| 470 |
+
elif "log_monitor" in log_file:
|
| 471 |
+
result["internal"].append(log_file)
|
| 472 |
+
elif "monitor" in log_file:
|
| 473 |
+
result["autoscaler"].append(log_file)
|
| 474 |
+
elif "agent." in log_file:
|
| 475 |
+
result["agent"].append(log_file)
|
| 476 |
+
elif "dashboard." in log_file:
|
| 477 |
+
result["dashboard"].append(log_file)
|
| 478 |
+
else:
|
| 479 |
+
result["internal"].append(log_file)
|
| 480 |
+
|
| 481 |
+
return result
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_utils.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import mimetypes
|
| 2 |
+
|
| 3 |
+
import ray.dashboard.modules.log.log_consts as log_consts
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def register_mimetypes():
    """Register each extension listed in `log_consts.MIME_TYPES` with `mimetypes`."""
    pairs = (
        (mime, suffix)
        for mime, suffixes in log_consts.MIME_TYPES.items()
        for suffix in suffixes
    )
    for mime, suffix in pairs:
        mimetypes.add_type(mime, suffix)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_head.cpython-311.pyc
ADDED
|
Binary file (24.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (200 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/sdk.cpython-311.pyc
ADDED
|
Binary file (4.12 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_agent.cpython-311.pyc
ADDED
|
Binary file (765 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_head.cpython-311.pyc
ADDED
|
Binary file (729 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_rest_api_impl.cpython-311.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|